author     Michael S. Tsirkin <mst@dev.mellanox.co.il>    2007-05-28 07:37:27 -0400
committer  Roland Dreier <rolandd@cisco.com>              2007-05-29 19:07:09 -0400
commit     ec56dc0b7f6c3fec20bbc2e98ff1a06edf2fc9b9 (patch)
tree       3bb5379d1bc2cd42526fe3d97b728f0d743bca30 /drivers/infiniband
parent     8b7e15772a286d0ef8e4f8eca422ce5368b6fa97 (diff)
IPoIB/cm: Fix performance regression on Mellanox
Commit 518b1646 ("IPoIB/cm: Fix SRQ WR leak") introduced a severe
performance regression on Mellanox cards, because keeping a QP in the
error state for extended periods of time moves hardware to the slow
path (until the QP is destroyed).  For example, MPI latency goes from
~3 usecs to ~7 usecs.

Fix this by posting a send WR on one of the QPs that are being
flushed, instead of using a separate drain QP that is kept in the
error state.

This fixes bug <https://bugs.openfabrics.org/show_bug.cgi?id=636>,
reported and bisected by Scott Weitzenkamp at Cisco and debugged by
Sasha Mikheev at Voltaire.

Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
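For illustration, a minimal sketch of the drain technique the patch switches to.
The helper name post_drain_wr is hypothetical and the snippet assumes the
definitions already present in ipoib_cm.c (IPOIB_CM_RX_DRAIN_WRID, ipoib_warn())
and the in-kernel IB verbs API of this era; the real implementation is
ipoib_cm_start_rx_drain() in the diff below.

/* A data-less send WR whose only job is to complete with a flush error. */
static struct ib_send_wr drain_wr = {
        .wr_id  = IPOIB_CM_RX_DRAIN_WRID,
        .opcode = IB_WR_SEND,
};

/*
 * Hypothetical helper: post the drain WR on a QP that has already been
 * moved to the error state.  The HCA flushes the WR immediately, so a
 * completion with wr_id == IPOIB_CM_RX_DRAIN_WRID shows up on the shared
 * CQ; per the comment in ipoib.h, seeing that completion is the signal
 * that it is safe to destroy the QPs attached to the SRQ.  No QP has to
 * sit in the error state long-term, which is what hurt Mellanox HCAs.
 */
static void post_drain_wr(struct ipoib_dev_priv *priv, struct ib_qp *error_qp)
{
        struct ib_send_wr *bad_wr;

        if (ib_post_send(error_qp, &drain_wr, &bad_wr))
                ipoib_warn(priv, "failed to post drain wr\n");
}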
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib.h     |  3
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_cm.c  | 74
2 files changed, 37 insertions(+), 40 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 158759e28a5b..285c143115cc 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -156,7 +156,7 @@ struct ipoib_cm_data {
  * - and then invoke a Destroy QP or Reset QP.
  *
  * We use the second option and wait for a completion on the
- * rx_drain_qp before destroying QPs attached to our SRQ.
+ * same CQ before destroying QPs attached to our SRQ.
  */
 
 enum ipoib_cm_state {
@@ -199,7 +199,6 @@ struct ipoib_cm_dev_priv {
         struct ib_srq          *srq;
         struct ipoib_cm_rx_buf *srq_ring;
         struct ib_cm_id        *id;
-        struct ib_qp           *rx_drain_qp;   /* generates WR described in 10.3.1 */
         struct list_head        passive_ids;   /* state: LIVE */
         struct list_head        rx_error_list; /* state: ERROR */
         struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index f133b56fd978..076a0bbb63d7 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -69,8 +69,9 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
 
 #define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
 
-static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
-        .wr_id = IPOIB_CM_RX_DRAIN_WRID
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+        .wr_id = IPOIB_CM_RX_DRAIN_WRID,
+        .opcode = IB_WR_SEND,
 };
 
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
@@ -163,16 +164,22 @@ partial_error:
 
 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
 {
-        struct ib_recv_wr *bad_wr;
+        struct ib_send_wr *bad_wr;
+        struct ipoib_cm_rx *p;
 
-        /* rx_drain_qp send queue depth is 1, so
+        /* We only reserved 1 extra slot in CQ for drain WRs, so
          * make sure we have at most 1 outstanding WR. */
         if (list_empty(&priv->cm.rx_flush_list) ||
             !list_empty(&priv->cm.rx_drain_list))
                 return;
 
-        if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
-                ipoib_warn(priv, "failed to post rx_drain wr\n");
+        /*
+         * QPs on flush list are error state. This way, a "flush
+         * error" WC will be immediately generated for each WR we post.
+         */
+        p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+        if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+                ipoib_warn(priv, "failed to post drain wr\n");
 
         list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
 }
@@ -199,10 +206,10 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
         struct ipoib_dev_priv *priv = netdev_priv(dev);
         struct ib_qp_init_attr attr = {
                 .event_handler = ipoib_cm_rx_event_handler,
-                .send_cq = priv->cq, /* does not matter, we never send anything */
+                .send_cq = priv->cq, /* For drain WR */
                 .recv_cq = priv->cq,
                 .srq = priv->cm.srq,
-                .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
+                .cap.max_send_wr = 1, /* For drain WR */
                 .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
                 .sq_sig_type = IB_SIGNAL_ALL_WR,
                 .qp_type = IB_QPT_RC,
@@ -242,6 +249,27 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
                 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
                 return ret;
         }
+
+        /*
+         * Current Mellanox HCA firmware won't generate completions
+         * with error for drain WRs unless the QP has been moved to
+         * RTS first. This work-around leaves a window where a QP has
+         * moved to error asynchronously, but this will eventually get
+         * fixed in firmware, so let's not error out if modify QP
+         * fails.
+         */
+        qp_attr.qp_state = IB_QPS_RTS;
+        ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+        if (ret) {
+                ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+                return 0;
+        }
+        ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+        if (ret) {
+                ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+                return 0;
+        }
+
         return 0;
 }
 
@@ -623,38 +651,11 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
 int ipoib_cm_dev_open(struct net_device *dev)
 {
         struct ipoib_dev_priv *priv = netdev_priv(dev);
-        struct ib_qp_init_attr qp_init_attr = {
-                .send_cq = priv->cq, /* does not matter, we never send anything */
-                .recv_cq = priv->cq,
-                .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
-                .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
-                .cap.max_recv_wr = 1,
-                .cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
-                .sq_sig_type = IB_SIGNAL_ALL_WR,
-                .qp_type = IB_QPT_UC,
-        };
         int ret;
 
         if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
                 return 0;
 
-        priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
-        if (IS_ERR(priv->cm.rx_drain_qp)) {
-                printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
-                ret = PTR_ERR(priv->cm.rx_drain_qp);
-                return ret;
-        }
-
-        /*
-         * We put the QP in error state directly. This way, a "flush
-         * error" WC will be immediately generated for each WR we post.
-         */
-        ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
-        if (ret) {
-                ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
-                goto err_qp;
-        }
-
         priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
         if (IS_ERR(priv->cm.id)) {
                 printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
@@ -676,8 +677,6 @@ err_listen:
         ib_destroy_cm_id(priv->cm.id);
 err_cm:
         priv->cm.id = NULL;
-err_qp:
-        ib_destroy_qp(priv->cm.rx_drain_qp);
         return ret;
 }
 
@@ -740,7 +739,6 @@ void ipoib_cm_dev_stop(struct net_device *dev)
                 kfree(p);
         }
 
-        ib_destroy_qp(priv->cm.rx_drain_qp);
         cancel_delayed_work(&priv->cm.stale_task);
 }
 