diff options
author | Sagi Grimberg <sagig@mellanox.com> | 2014-10-01 07:02:01 -0400 |
---|---|---|
committer | Roland Dreier <roland@purestorage.com> | 2014-10-09 03:06:06 -0400 |
commit | c47a3c9ed5be167f49a6fd3f696dac03536282eb (patch) | |
tree | e8d8ecdf2ddb5d7f661fa87c8172e042d6ebcebc | |
parent | 96f15198c1457df29b51ed151b1e5b2a223d1346 (diff) |
IB/iser: Fix DEVICE REMOVAL handling in the absence of iscsi daemon
iscsi daemon is in user-space, thus we can't rely on it to be invoked
at connection teardown (if not running or does not receive CPU time).
This patch addresses the issue by re-structuring iSER connection
teardown logic and CM events handling.
The CM events will dictate the RDMA resources destruction (ib_conn)
and iser_conn is kept around as long as iscsi_conn is left around
allowing iscsi/iser callbacks to continue after RDMA transport was
destroyed.
This patch introduces a separation in logic when handling CM events:
- DISCONNECTED_HANDLER, ADDR_CHANGED
These events indicate the start of the teardown process.
Actions:
1. Terminate the connection: rdma_disconnect (send DREQ/DREP)
2. Notify iSCSI of connection failure
3. Change state to TERMINATING
4. Poll for all flush errors to be consumed
- TIMEWAIT_EXIT, DEVICE_REMOVAL
These events indicate the final stage of the termination process,
after which we can free the RDMA-related resources.
Actions:
1. Call disconnected handler (we are not guaranteed that DISCONNECTED
event was invoked in the past)
2. Cleanup RDMA related resources
3. For DEVICE_REMOVAL return non-zero rc from cma_handler to
implicitly destroy the cm_id (Can't rely on user-space, make sure
we have forward progress)
We replace flush_completion (which indicated that all flushes were
consumed) with ib_completion (RDMA resources were cleaned up).
The iser_release_work handler will wait for the teardown completions:
- conn_stop was completed (tasks were cleaned-up) - stop_completion
- RDMA resources were destroyed - ib_completion
And then will continue to free iser connection representation (iser_conn).
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Ariel Nahum <arieln@mellanox.com>
Signed-off-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
-rw-r--r-- | drivers/infiniband/ulp/iser/iscsi_iser.h | 6 | ||||
-rw-r--r-- | drivers/infiniband/ulp/iser/iser_verbs.c | 163 |
2 files changed, 108 insertions, 61 deletions
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index ec238b3bd278..95c484d0f881 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h | |||
@@ -370,9 +370,9 @@ struct iser_conn { | |||
370 | unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ | 370 | unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ |
371 | char name[ISER_OBJECT_NAME_SIZE]; | 371 | char name[ISER_OBJECT_NAME_SIZE]; |
372 | struct work_struct release_work; | 372 | struct work_struct release_work; |
373 | struct completion stop_completion; | ||
374 | struct mutex state_mutex; | 373 | struct mutex state_mutex; |
375 | struct completion flush_completion; | 374 | struct completion stop_completion; |
375 | struct completion ib_completion; | ||
376 | struct completion up_completion; | 376 | struct completion up_completion; |
377 | struct list_head conn_list; /* entry in ig conn list */ | 377 | struct list_head conn_list; /* entry in ig conn list */ |
378 | 378 | ||
@@ -442,7 +442,7 @@ void iser_conn_init(struct iser_conn *iser_conn); | |||
442 | 442 | ||
443 | void iser_conn_release(struct iser_conn *iser_conn); | 443 | void iser_conn_release(struct iser_conn *iser_conn); |
444 | 444 | ||
445 | void iser_conn_terminate(struct iser_conn *iser_conn); | 445 | int iser_conn_terminate(struct iser_conn *iser_conn); |
446 | 446 | ||
447 | void iser_release_work(struct work_struct *work); | 447 | void iser_release_work(struct work_struct *work); |
448 | 448 | ||
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index e4299743c459..6170d06a8acc 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c | |||
@@ -44,6 +44,7 @@ | |||
44 | 44 | ||
45 | static void iser_cq_tasklet_fn(unsigned long data); | 45 | static void iser_cq_tasklet_fn(unsigned long data); |
46 | static void iser_cq_callback(struct ib_cq *cq, void *cq_context); | 46 | static void iser_cq_callback(struct ib_cq *cq, void *cq_context); |
47 | static int iser_drain_tx_cq(struct iser_device *device, int cq_index); | ||
47 | 48 | ||
48 | static void iser_cq_event_callback(struct ib_event *cause, void *context) | 49 | static void iser_cq_event_callback(struct ib_event *cause, void *context) |
49 | { | 50 | { |
@@ -573,11 +574,10 @@ void iser_release_work(struct work_struct *work) | |||
573 | rc = wait_for_completion_timeout(&iser_conn->stop_completion, 30 * HZ); | 574 | rc = wait_for_completion_timeout(&iser_conn->stop_completion, 30 * HZ); |
574 | WARN_ON(rc == 0); | 575 | WARN_ON(rc == 0); |
575 | 576 | ||
576 | /* wait for the qp`s post send and post receive buffers to empty */ | 577 | rc = wait_for_completion_timeout(&iser_conn->ib_completion, 30 * HZ); |
577 | rc = wait_for_completion_timeout(&iser_conn->flush_completion, 30 * HZ); | 578 | if (rc == 0) |
578 | WARN_ON(rc == 0); | 579 | iser_warn("conn %p, IB cleanup didn't complete in 30 " |
579 | 580 | "seconds, continue with release\n", iser_conn); | |
580 | iser_conn->state = ISER_CONN_DOWN; | ||
581 | 581 | ||
582 | mutex_lock(&iser_conn->state_mutex); | 582 | mutex_lock(&iser_conn->state_mutex); |
583 | iser_conn->state = ISER_CONN_DOWN; | 583 | iser_conn->state = ISER_CONN_DOWN; |
@@ -589,12 +589,16 @@ void iser_release_work(struct work_struct *work) | |||
589 | /** | 589 | /** |
590 | * iser_free_ib_conn_res - release IB related resources | 590 | * iser_free_ib_conn_res - release IB related resources |
591 | * @iser_conn: iser connection struct | 591 | * @iser_conn: iser connection struct |
592 | * @destroy_device: indicator if we need to try to release | ||
593 | * the iser device (only iscsi shutdown and DEVICE_REMOVAL | ||
594 | * will use this. | ||
592 | * | 595 | * |
593 | * This routine is called with the iser state mutex held | 596 | * This routine is called with the iser state mutex held |
594 | * so the cm_id removal is out of here. It is Safe to | 597 | * so the cm_id removal is out of here. It is Safe to |
595 | * be invoked multiple times. | 598 | * be invoked multiple times. |
596 | */ | 599 | */ |
597 | static void iser_free_ib_conn_res(struct iser_conn *iser_conn) | 600 | static void iser_free_ib_conn_res(struct iser_conn *iser_conn, |
601 | bool destroy_device) | ||
598 | { | 602 | { |
599 | struct ib_conn *ib_conn = &iser_conn->ib_conn; | 603 | struct ib_conn *ib_conn = &iser_conn->ib_conn; |
600 | struct iser_device *device = ib_conn->device; | 604 | struct iser_device *device = ib_conn->device; |
@@ -610,7 +614,7 @@ static void iser_free_ib_conn_res(struct iser_conn *iser_conn) | |||
610 | ib_conn->qp = NULL; | 614 | ib_conn->qp = NULL; |
611 | } | 615 | } |
612 | 616 | ||
613 | if (device != NULL) { | 617 | if (destroy_device && device != NULL) { |
614 | iser_device_try_release(device); | 618 | iser_device_try_release(device); |
615 | ib_conn->device = NULL; | 619 | ib_conn->device = NULL; |
616 | } | 620 | } |
@@ -629,7 +633,11 @@ void iser_conn_release(struct iser_conn *iser_conn) | |||
629 | 633 | ||
630 | mutex_lock(&iser_conn->state_mutex); | 634 | mutex_lock(&iser_conn->state_mutex); |
631 | BUG_ON(iser_conn->state != ISER_CONN_DOWN); | 635 | BUG_ON(iser_conn->state != ISER_CONN_DOWN); |
632 | iser_free_ib_conn_res(iser_conn); | 636 | /* |
637 | * In case we never got to bind stage, we still need to | ||
638 | * release IB resources (which is safe to call more than once). | ||
639 | */ | ||
640 | iser_free_ib_conn_res(iser_conn, true); | ||
633 | mutex_unlock(&iser_conn->state_mutex); | 641 | mutex_unlock(&iser_conn->state_mutex); |
634 | 642 | ||
635 | if (ib_conn->cma_id != NULL) { | 643 | if (ib_conn->cma_id != NULL) { |
@@ -641,23 +649,68 @@ void iser_conn_release(struct iser_conn *iser_conn) | |||
641 | } | 649 | } |
642 | 650 | ||
643 | /** | 651 | /** |
652 | * iser_poll_for_flush_errors - Don't settle for less than all. | ||
653 | * @struct ib_conn: IB context of the connection | ||
654 | * | ||
655 | * This routine is called when the QP is in error state | ||
656 | * It polls the send CQ until all flush errors are consumed and | ||
657 | * returns when all flush errors were processed. | ||
658 | */ | ||
659 | static void iser_poll_for_flush_errors(struct ib_conn *ib_conn) | ||
660 | { | ||
661 | struct iser_device *device = ib_conn->device; | ||
662 | int count = 0; | ||
663 | |||
664 | while (ib_conn->post_recv_buf_count > 0 || | ||
665 | atomic_read(&ib_conn->post_send_buf_count) > 0) { | ||
666 | msleep(100); | ||
667 | if (atomic_read(&ib_conn->post_send_buf_count) > 0) | ||
668 | iser_drain_tx_cq(device, ib_conn->cq_index); | ||
669 | |||
670 | count++; | ||
671 | /* Don't flood with prints */ | ||
672 | if (count % 30 == 0) | ||
673 | iser_dbg("post_recv %d post_send %d", | ||
674 | ib_conn->post_recv_buf_count, | ||
675 | atomic_read(&ib_conn->post_send_buf_count)); | ||
676 | } | ||
677 | } | ||
678 | |||
679 | /** | ||
644 | * triggers start of the disconnect procedures and wait for them to be done | 680 | * triggers start of the disconnect procedures and wait for them to be done |
681 | * Called with state mutex held | ||
645 | */ | 682 | */ |
646 | void iser_conn_terminate(struct iser_conn *iser_conn) | 683 | int iser_conn_terminate(struct iser_conn *iser_conn) |
647 | { | 684 | { |
648 | struct ib_conn *ib_conn = &iser_conn->ib_conn; | 685 | struct ib_conn *ib_conn = &iser_conn->ib_conn; |
649 | int err = 0; | 686 | int err = 0; |
650 | 687 | ||
651 | /* change the ib conn state only if the conn is UP, however always call | 688 | /* terminate the iser conn only if the conn state is UP */ |
652 | * rdma_disconnect since this is the only way to cause the CMA to change | 689 | if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, |
653 | * the QP state to ERROR | 690 | ISER_CONN_TERMINATING)) |
691 | return 0; | ||
692 | |||
693 | iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state); | ||
694 | |||
695 | /* suspend queuing of new iscsi commands */ | ||
696 | if (iser_conn->iscsi_conn) | ||
697 | iscsi_suspend_queue(iser_conn->iscsi_conn); | ||
698 | |||
699 | /* | ||
700 | * In case we didn't already clean up the cma_id (peer initiated | ||
701 | * a disconnection), we need to Cause the CMA to change the QP | ||
702 | * state to ERROR. | ||
654 | */ | 703 | */ |
704 | if (ib_conn->cma_id) { | ||
705 | err = rdma_disconnect(ib_conn->cma_id); | ||
706 | if (err) | ||
707 | iser_err("Failed to disconnect, conn: 0x%p err %d\n", | ||
708 | iser_conn, err); | ||
709 | |||
710 | iser_poll_for_flush_errors(ib_conn); | ||
711 | } | ||
655 | 712 | ||
656 | iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, ISER_CONN_TERMINATING); | 713 | return 1; |
657 | err = rdma_disconnect(ib_conn->cma_id); | ||
658 | if (err) | ||
659 | iser_err("Failed to disconnect, conn: 0x%p err %d\n", | ||
660 | iser_conn, err); | ||
661 | } | 714 | } |
662 | 715 | ||
663 | /** | 716 | /** |
@@ -780,34 +833,36 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) | |||
780 | 833 | ||
781 | static void iser_disconnected_handler(struct rdma_cm_id *cma_id) | 834 | static void iser_disconnected_handler(struct rdma_cm_id *cma_id) |
782 | { | 835 | { |
783 | struct iser_conn *iser_conn; | 836 | struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; |
784 | struct ib_conn *ib_conn = &iser_conn->ib_conn; | ||
785 | |||
786 | iser_conn = (struct iser_conn *)cma_id->context; | ||
787 | 837 | ||
788 | /* getting here when the state is UP means that the conn is being * | 838 | if (iser_conn_terminate(iser_conn)) { |
789 | * terminated asynchronously from the iSCSI layer's perspective. */ | ||
790 | if (iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, | ||
791 | ISER_CONN_TERMINATING)){ | ||
792 | if (iser_conn->iscsi_conn) | 839 | if (iser_conn->iscsi_conn) |
793 | iscsi_conn_failure(iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); | 840 | iscsi_conn_failure(iser_conn->iscsi_conn, |
841 | ISCSI_ERR_CONN_FAILED); | ||
794 | else | 842 | else |
795 | iser_err("iscsi_iser connection isn't bound\n"); | 843 | iser_err("iscsi_iser connection isn't bound\n"); |
796 | } | 844 | } |
845 | } | ||
846 | |||
847 | static void iser_cleanup_handler(struct rdma_cm_id *cma_id, | ||
848 | bool destroy_device) | ||
849 | { | ||
850 | struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; | ||
797 | 851 | ||
798 | /* Complete the termination process if no posts are pending. This code | 852 | /* |
799 | * block also exists in iser_handle_comp_error(), but it is needed here | 853 | * We are not guaranteed that we visited disconnected_handler |
800 | * for cases of no flushes at all, e.g. discovery over rdma. | 854 | * by now, call it here to be safe that we handle CM drep |
855 | * and flush errors. | ||
801 | */ | 856 | */ |
802 | if (ib_conn->post_recv_buf_count == 0 && | 857 | iser_disconnected_handler(cma_id); |
803 | (atomic_read(&ib_conn->post_send_buf_count) == 0)) { | 858 | iser_free_ib_conn_res(iser_conn, destroy_device); |
804 | complete(&iser_conn->flush_completion); | 859 | complete(&iser_conn->ib_completion); |
805 | } | 860 | }; |
806 | } | ||
807 | 861 | ||
808 | static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) | 862 | static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) |
809 | { | 863 | { |
810 | struct iser_conn *iser_conn; | 864 | struct iser_conn *iser_conn; |
865 | int ret = 0; | ||
811 | 866 | ||
812 | iser_conn = (struct iser_conn *)cma_id->context; | 867 | iser_conn = (struct iser_conn *)cma_id->context; |
813 | iser_info("event %d status %d conn %p id %p\n", | 868 | iser_info("event %d status %d conn %p id %p\n", |
@@ -832,17 +887,29 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve | |||
832 | iser_connect_error(cma_id); | 887 | iser_connect_error(cma_id); |
833 | break; | 888 | break; |
834 | case RDMA_CM_EVENT_DISCONNECTED: | 889 | case RDMA_CM_EVENT_DISCONNECTED: |
835 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | ||
836 | case RDMA_CM_EVENT_ADDR_CHANGE: | 890 | case RDMA_CM_EVENT_ADDR_CHANGE: |
837 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: | ||
838 | iser_disconnected_handler(cma_id); | 891 | iser_disconnected_handler(cma_id); |
839 | break; | 892 | break; |
893 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | ||
894 | /* | ||
895 | * we *must* destroy the device as we cannot rely | ||
896 | * on iscsid to be around to initiate error handling. | ||
897 | * also implicitly destroy the cma_id. | ||
898 | */ | ||
899 | iser_cleanup_handler(cma_id, true); | ||
900 | iser_conn->ib_conn.cma_id = NULL; | ||
901 | ret = 1; | ||
902 | break; | ||
903 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: | ||
904 | iser_cleanup_handler(cma_id, false); | ||
905 | break; | ||
840 | default: | 906 | default: |
841 | iser_err("Unexpected RDMA CM event (%d)\n", event->event); | 907 | iser_err("Unexpected RDMA CM event (%d)\n", event->event); |
842 | break; | 908 | break; |
843 | } | 909 | } |
844 | mutex_unlock(&iser_conn->state_mutex); | 910 | mutex_unlock(&iser_conn->state_mutex); |
845 | return 0; | 911 | |
912 | return ret; | ||
846 | } | 913 | } |
847 | 914 | ||
848 | void iser_conn_init(struct iser_conn *iser_conn) | 915 | void iser_conn_init(struct iser_conn *iser_conn) |
@@ -851,7 +918,7 @@ void iser_conn_init(struct iser_conn *iser_conn) | |||
851 | iser_conn->ib_conn.post_recv_buf_count = 0; | 918 | iser_conn->ib_conn.post_recv_buf_count = 0; |
852 | atomic_set(&iser_conn->ib_conn.post_send_buf_count, 0); | 919 | atomic_set(&iser_conn->ib_conn.post_send_buf_count, 0); |
853 | init_completion(&iser_conn->stop_completion); | 920 | init_completion(&iser_conn->stop_completion); |
854 | init_completion(&iser_conn->flush_completion); | 921 | init_completion(&iser_conn->ib_completion); |
855 | init_completion(&iser_conn->up_completion); | 922 | init_completion(&iser_conn->up_completion); |
856 | INIT_LIST_HEAD(&iser_conn->conn_list); | 923 | INIT_LIST_HEAD(&iser_conn->conn_list); |
857 | spin_lock_init(&iser_conn->ib_conn.lock); | 924 | spin_lock_init(&iser_conn->ib_conn.lock); |
@@ -1100,28 +1167,8 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc) | |||
1100 | static void iser_handle_comp_error(struct iser_tx_desc *desc, | 1167 | static void iser_handle_comp_error(struct iser_tx_desc *desc, |
1101 | struct ib_conn *ib_conn) | 1168 | struct ib_conn *ib_conn) |
1102 | { | 1169 | { |
1103 | struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, | ||
1104 | ib_conn); | ||
1105 | |||
1106 | if (desc && desc->type == ISCSI_TX_DATAOUT) | 1170 | if (desc && desc->type == ISCSI_TX_DATAOUT) |
1107 | kmem_cache_free(ig.desc_cache, desc); | 1171 | kmem_cache_free(ig.desc_cache, desc); |
1108 | |||
1109 | if (ib_conn->post_recv_buf_count == 0 && | ||
1110 | atomic_read(&ib_conn->post_send_buf_count) == 0) { | ||
1111 | /** | ||
1112 | * getting here when the state is UP means that the conn is | ||
1113 | * being terminated asynchronously from the iSCSI layer's | ||
1114 | * perspective. It is safe to peek at the connection state | ||
1115 | * since iscsi_conn_failure is allowed to be called twice. | ||
1116 | **/ | ||
1117 | if (iser_conn->state == ISER_CONN_UP) | ||
1118 | iscsi_conn_failure(iser_conn->iscsi_conn, | ||
1119 | ISCSI_ERR_CONN_FAILED); | ||
1120 | |||
1121 | /* no more non completed posts to the QP, complete the | ||
1122 | * termination process w.o worrying on disconnect event */ | ||
1123 | complete(&iser_conn->flush_completion); | ||
1124 | } | ||
1125 | } | 1172 | } |
1126 | 1173 | ||
1127 | static int iser_drain_tx_cq(struct iser_device *device, int cq_index) | 1174 | static int iser_drain_tx_cq(struct iser_device *device, int cq_index) |