aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/ulp/ipoib/ipoib_ib.c
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@mellanox.co.il>2007-02-05 15:12:23 -0500
committerRoland Dreier <rolandd@cisco.com>2007-02-10 11:00:48 -0500
commit839fcaba355abaffb7b44f0f4504093acb0b11cf (patch)
tree9e23f61ab0569ff144e6d9d4cb6a0887783f923c /drivers/infiniband/ulp/ipoib/ipoib_ib.c
parent9a6b090c0d1cd5c90f21db772dbe2fbcf14366de (diff)
IPoIB: Connected mode experimental support
The following patch adds experimental support for IPoIB connected mode, as defined by the draft from the IETF ipoib working group. The idea is to increase performance by increasing the MTU from the maximum of 2K (theoretically 4K) supported by IPoIB on top of UD. With this code, I'm able to get 800MByte/sec or more with netperf without options on a Mellanox 4x back-to-back DDR system. Some notes on code: 1. SRQ is used for scalability to large cluster sizes 2. Only RC connections are used (UC does not support SRQ now) 3. Retry count is set to 0 since spec draft warns against retries 4. Each connection is used for data transfers in only 1 direction, so each connection is either active(TX) or passive (RX). 2 sides that want to communicate create 2 connections. 5. Each active (TX) connection has a separate CQ for send completions - this keeps the code simple without CQ resize and other tricks 6. To detect stale passive side connections (where the remote side is down), we keep an LRU list of passive connections (updated once per second per connection) and destroy a connection after it has been unused for several seconds. The LRU rule makes it possible to avoid scanning connections that have recently been active. Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il> Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/ulp/ipoib/ipoib_ib.c')
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c29
1 files changed, 20 insertions, 9 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 59d9594ed6d9..f2aa923ddbea 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
50 "Enable data path debug tracing if > 0"); 50 "Enable data path debug tracing if > 0");
51#endif 51#endif
52 52
53#define IPOIB_OP_RECV (1ul << 31)
54
55static DEFINE_MUTEX(pkey_mutex); 53static DEFINE_MUTEX(pkey_mutex);
56 54
57struct ipoib_ah *ipoib_create_ah(struct net_device *dev, 55struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
268 266
269 spin_lock_irqsave(&priv->tx_lock, flags); 267 spin_lock_irqsave(&priv->tx_lock, flags);
270 ++priv->tx_tail; 268 ++priv->tx_tail;
271 if (netif_queue_stopped(dev) && 269 if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
272 test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) && 270 priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
273 priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) 271 clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
274 netif_wake_queue(dev); 272 netif_wake_queue(dev);
273 }
275 spin_unlock_irqrestore(&priv->tx_lock, flags); 274 spin_unlock_irqrestore(&priv->tx_lock, flags);
276 275
277 if (wc->status != IB_WC_SUCCESS && 276 if (wc->status != IB_WC_SUCCESS &&
@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
283 282
284static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc) 283static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
285{ 284{
286 if (wc->wr_id & IPOIB_OP_RECV) 285 if (wc->wr_id & IPOIB_CM_OP_SRQ)
286 ipoib_cm_handle_rx_wc(dev, wc);
287 else if (wc->wr_id & IPOIB_OP_RECV)
287 ipoib_ib_handle_rx_wc(dev, wc); 288 ipoib_ib_handle_rx_wc(dev, wc);
288 else 289 else
289 ipoib_ib_handle_tx_wc(dev, wc); 290 ipoib_ib_handle_tx_wc(dev, wc);
@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
327 struct ipoib_tx_buf *tx_req; 328 struct ipoib_tx_buf *tx_req;
328 u64 addr; 329 u64 addr;
329 330
330 if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) { 331 if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) {
331 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", 332 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
332 skb->len, dev->mtu + INFINIBAND_ALEN); 333 skb->len, priv->mcast_mtu + INFINIBAND_ALEN);
333 ++priv->stats.tx_dropped; 334 ++priv->stats.tx_dropped;
334 ++priv->stats.tx_errors; 335 ++priv->stats.tx_errors;
335 dev_kfree_skb_any(skb); 336 ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
336 return; 337 return;
337 } 338 }
338 339
@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
372 if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { 373 if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
373 ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); 374 ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
374 netif_stop_queue(dev); 375 netif_stop_queue(dev);
376 set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
375 } 377 }
376 } 378 }
377} 379}
@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
424 return -1; 426 return -1;
425 } 427 }
426 428
429 ret = ipoib_cm_dev_open(dev);
430 if (ret) {
431 ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
432 ipoib_ib_dev_stop(dev);
433 return -1;
434 }
435
427 clear_bit(IPOIB_STOP_REAPER, &priv->flags); 436 clear_bit(IPOIB_STOP_REAPER, &priv->flags);
428 queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); 437 queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
429 438
@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)
509 518
510 clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); 519 clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
511 520
521 ipoib_cm_dev_stop(dev);
522
512 /* 523 /*
513 * Move our QP to the error state and then reinitialize in 524 * Move our QP to the error state and then reinitialize in
514 * when all work requests have completed or have been flushed. 525 * when all work requests have completed or have been flushed.