author	Steve Wise <swise@opengridcomputing.com>	2008-04-29 16:46:52 -0400
committer	Roland Dreier <rolandd@cisco.com>	2008-04-29 16:46:52 -0400
commit	f8b0dfd15277974b5c9f3ff17f9e3ab6fdbe45ee (patch)
tree	34e393cd342578f9ff223be2b631af7ab9b418aa /drivers/infiniband/hw/cxgb3/iwch_cm.c
parent	ccaf10d0ad17bf755750160ebe594de7261a893e (diff)
RDMA/cxgb3: Support peer-2-peer connection setup
Open MPI, Intel MPI and other applications don't respect the iWARP requirement that the client (active) side of the connection send the first RDMA message. This class of application connection setup is called peer-to-peer. Typically, once the connection is set up, _both_ sides want to send data.

This patch enables peer-to-peer over the Chelsio RNIC by enforcing this iWARP requirement in the driver itself as part of RDMA connection setup.

Connection setup is extended, when the peer2peer module option is 1, such that the MPA initiator sends a 0B read (the RTR) just after connection setup. The MPA responder suspends SQ processing until the RTR message is received and replied to.

In the longer term, this will be handled in a standardized way by enhancing the MPA negotiation so peers can indicate whether they want/need the RTR and what type of RTR (0B read, 0B write, or 0B send) should be sent. This will be done by standardizing a few bits of the private data in order to negotiate all this. However, this patch enables peer-to-peer applications now and allows most of the required firmware and driver changes to be done and tested now.

Design:

- Add a module option, peer2peer, to enable this mode.

- New firmware support for peer-to-peer mode:

  - a new bit in the rdma_init WR to tell it to do peer-2-peer and what form of RTR message to send or expect.

  - process _all_ preposted recvs before moving the connection into rdma mode.

  - passive side: defer completing the rdma_init WR until all pre-posted recvs are processed. Suspend SQ processing until the RTR is received.

  - active side: expect and process the 0B read WR on the offload TX queue. Defer completing the rdma_init WR until all pre-posted recvs are processed. Suspend SQ processing until the 0B read WR is processed from the offload TX queue.

- If peer2peer is set, the driver posts a 0B read request on the offload TX queue just after posting the rdma_init WR to the offload TX queue.

- Add CQ poll logic to ignore unsolicited read responses.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
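Taken together, the RTR gating this design describes is compact. The sketch below is illustrative only: maybe_post_rtr() is a hypothetical helper, not part of the patch, but peer2peer, iwch_rqes_posted() and iwch_post_zb_read() are the symbols the diff below actually uses.

/*
 * Illustrative sketch (not in the patch): when does the 0B read
 * (RTR) get posted?
 */
static int maybe_post_rtr(struct iwch_ep *ep)
{
	/* Only the MPA initiator (active side) sends the RTR, and
	 * only when the peer2peer module option is set.
	 */
	if (!peer2peer || !ep->mpa_attr.initiator)
		return 0;

	/* With no pre-posted recvs, the rdma_init WR completes at
	 * once, so the 0B read can follow it immediately (see
	 * process_mpa_reply below).  Otherwise the post is deferred
	 * to the wr_ack handler (see tx_ack below), which runs once
	 * the firmware has processed all pre-posted recvs.
	 */
	if (iwch_rqes_posted(ep->com.qp) == 0)
		return iwch_post_zb_read(ep->com.qp);
	return 0;
}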
Diffstat (limited to 'drivers/infiniband/hw/cxgb3/iwch_cm.c')
-rw-r--r--	drivers/infiniband/hw/cxgb3/iwch_cm.c	67
1 file changed, 49 insertions, 18 deletions
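The RTR post itself, iwch_post_zb_read(), is added in iwch_qp.c by this same patch and therefore does not appear in this file's diff. A rough sketch of its likely shape follows; the WR field names and flags are assumptions drawn from the t3 work request definitions, not taken from this diff.

/*
 * Rough sketch of the 0B read post (the real function lives in
 * iwch_qp.c; field names and flags here are assumptions).
 */
int iwch_post_zb_read(struct iwch_qp *qhp)
{
	union t3_wr *wqe;
	struct sk_buff *skb;

	skb = alloc_skb(40, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	/* A zero-length RDMA READ REQ: dummy stags and addresses are
	 * fine because no data moves; the peer only needs to see an
	 * inbound RDMA operation from the initiator.
	 */
	wqe = (union t3_wr *)skb_put(skb, sizeof(struct t3_rdma_read_wr));
	memset(wqe, 0, sizeof(struct t3_rdma_read_wr));
	wqe->read.rdmaop = T3_READ_REQ;
	wqe->read.rem_stag = cpu_to_be32(1);
	wqe->read.rem_to = cpu_to_be64(1);
	wqe->read.local_stag = cpu_to_be32(1);
	wqe->read.local_len = cpu_to_be32(0);
	wqe->read.local_to = cpu_to_be64(1);
	wqe->send.wrh.op_seop_flags =
		cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ) |
			    V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG |
					    T3_NOTIFY_FLAG));
	wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid));

	/* Post on the offload TX queue so the read reaches the
	 * firmware right behind the rdma_init WR.
	 */
	skb->priority = CPL_PRIORITY_DATA;
	return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb);
}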
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 0b515d899f6c..d44a6df9ad8c 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -63,6 +63,10 @@ static char *states[] = {
 	NULL,
 };
 
+int peer2peer = 0;
+module_param(peer2peer, int, 0644);
+MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)");
+
 static int ep_timeout_secs = 10;
 module_param(ep_timeout_secs, int, 0644);
 MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
@@ -514,7 +518,7 @@ static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb)
 	skb_reset_transport_header(skb);
 	len = skb->len;
 	req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
 	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
 	req->len = htonl(len);
 	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
@@ -565,7 +569,7 @@ static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen)
 	set_arp_failure_handler(skb, arp_failure_discard);
 	skb_reset_transport_header(skb);
 	req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
 	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
 	req->len = htonl(mpalen);
 	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
@@ -617,7 +621,7 @@ static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen)
 	skb_reset_transport_header(skb);
 	len = skb->len;
 	req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
 	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
 	req->len = htonl(len);
 	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
@@ -885,6 +889,7 @@ static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb)
 	 * the MPA header is valid.
 	 */
 	state_set(&ep->com, FPDU_MODE);
+	ep->mpa_attr.initiator = 1;
 	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
 	ep->mpa_attr.recv_marker_enabled = markers_enabled;
 	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
@@ -907,8 +912,14 @@ static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb)
 	/* bind QP and TID with INIT_WR */
 	err = iwch_modify_qp(ep->com.qp->rhp,
 			     ep->com.qp, mask, &attrs, 1);
-	if (!err)
-		goto out;
+	if (err)
+		goto err;
+
+	if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) {
+		iwch_post_zb_read(ep->com.qp);
+	}
+
+	goto out;
 err:
 	abort_connection(ep, skb, GFP_KERNEL);
 out:
@@ -1001,6 +1012,7 @@ static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb)
 	 * If we get here we have accumulated the entire mpa
 	 * start reply message including private data.
 	 */
+	ep->mpa_attr.initiator = 0;
 	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
 	ep->mpa_attr.recv_marker_enabled = markers_enabled;
 	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
@@ -1071,17 +1083,33 @@ static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 
 	PDBG("%s ep %p credits %u\n", __func__, ep, credits);
 
-	if (credits == 0)
+	if (credits == 0) {
+		PDBG(KERN_ERR "%s 0 credit ack ep %p state %u\n",
+		     __func__, ep, state_read(&ep->com));
 		return CPL_RET_BUF_DONE;
+	}
+
 	BUG_ON(credits != 1);
-	BUG_ON(ep->mpa_skb == NULL);
-	kfree_skb(ep->mpa_skb);
-	ep->mpa_skb = NULL;
 	dst_confirm(ep->dst);
-	if (state_read(&ep->com) == MPA_REP_SENT) {
-		ep->com.rpl_done = 1;
-		PDBG("waking up ep %p\n", ep);
-		wake_up(&ep->com.waitq);
+	if (!ep->mpa_skb) {
+		PDBG("%s rdma_init wr_ack ep %p state %u\n",
+		     __func__, ep, state_read(&ep->com));
+		if (ep->mpa_attr.initiator) {
+			PDBG("%s initiator ep %p state %u\n",
+			     __func__, ep, state_read(&ep->com));
+			if (peer2peer)
+				iwch_post_zb_read(ep->com.qp);
+		} else {
+			PDBG("%s responder ep %p state %u\n",
+			     __func__, ep, state_read(&ep->com));
+			ep->com.rpl_done = 1;
+			wake_up(&ep->com.waitq);
+		}
+	} else {
+		PDBG("%s lsm ack ep %p state %u freeing skb\n",
+		     __func__, ep, state_read(&ep->com));
+		kfree_skb(ep->mpa_skb);
+		ep->mpa_skb = NULL;
 	}
 	return CPL_RET_BUF_DONE;
 }
@@ -1795,16 +1823,19 @@ int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	if (err)
 		goto err;
 
+	/* if needed, wait for wr_ack */
+	if (iwch_rqes_posted(qp)) {
+		wait_event(ep->com.waitq, ep->com.rpl_done);
+		err = ep->com.rpl_err;
+		if (err)
+			goto err;
+	}
+
 	err = send_mpa_reply(ep, conn_param->private_data,
 			     conn_param->private_data_len);
 	if (err)
 		goto err;
 
-	/* wait for wr_ack */
-	wait_event(ep->com.waitq, ep->com.rpl_done);
-	err = ep->com.rpl_err;
-	if (err)
-		goto err;
 
 	state_set(&ep->com, FPDU_MODE);
 	established_upcall(ep);
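With the patch applied, peer-to-peer mode is off by default (peer2peer=0). It can be enabled at load time with modprobe iw_cxgb3 peer2peer=1 or, because the parameter is registered with mode 0644, toggled at runtime via /sys/module/iw_cxgb3/parameters/peer2peer.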