diff options
author | Michael S. Tsirkin <mst@mellanox.co.il> | 2007-02-05 15:12:23 -0500 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2007-02-10 11:00:48 -0500 |
commit | 839fcaba355abaffb7b44f0f4504093acb0b11cf (patch) | |
tree | 9e23f61ab0569ff144e6d9d4cb6a0887783f923c /drivers/infiniband/ulp/ipoib/ipoib_ib.c | |
parent | 9a6b090c0d1cd5c90f21db772dbe2fbcf14366de (diff) |
IPoIB: Connected mode experimental support
The following patch adds experimental support for IPoIB connected
mode, as defined by the draft from the IETF ipoib working group. The
idea is to increase performance by increasing the MTU from the maximum
of 2K (theoretically 4K) supported by IPoIB on top of UD. With this
code, I'm able to get 800MByte/sec or more with netperf without
options on a Mellanox 4x back-to-back DDR system.
Some notes on code:
1. SRQ is used for scalability to large cluster sizes
2. Only RC connections are used (UC does not support SRQ now)
3. Retry count is set to 0 since spec draft warns against retries
4. Each connection is used for data transfers in only 1 direction, so
each connection is either active (TX) or passive (RX). 2 sides that
want to communicate create 2 connections.
5. Each active (TX) connection has a separate CQ for send completions -
this keeps the code simple without CQ resize and other tricks
6. To detect stale passive side connections (where the remote side is
down), we keep an LRU list of passive connections (updated once per
second per connection) and destroy a connection after it has been
unused for several seconds. The LRU rule makes it possible to avoid
scanning connections that have recently been active.
Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/ulp/ipoib/ipoib_ib.c')
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ib.c | 29 |
1 files changed, 20 insertions, 9 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 59d9594ed6d9..f2aa923ddbea 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c | |||
@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level, | |||
50 | "Enable data path debug tracing if > 0"); | 50 | "Enable data path debug tracing if > 0"); |
51 | #endif | 51 | #endif |
52 | 52 | ||
53 | #define IPOIB_OP_RECV (1ul << 31) | ||
54 | |||
55 | static DEFINE_MUTEX(pkey_mutex); | 53 | static DEFINE_MUTEX(pkey_mutex); |
56 | 54 | ||
57 | struct ipoib_ah *ipoib_create_ah(struct net_device *dev, | 55 | struct ipoib_ah *ipoib_create_ah(struct net_device *dev, |
@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) | |||
268 | 266 | ||
269 | spin_lock_irqsave(&priv->tx_lock, flags); | 267 | spin_lock_irqsave(&priv->tx_lock, flags); |
270 | ++priv->tx_tail; | 268 | ++priv->tx_tail; |
271 | if (netif_queue_stopped(dev) && | 269 | if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) && |
272 | test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) && | 270 | priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) { |
273 | priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) | 271 | clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); |
274 | netif_wake_queue(dev); | 272 | netif_wake_queue(dev); |
273 | } | ||
275 | spin_unlock_irqrestore(&priv->tx_lock, flags); | 274 | spin_unlock_irqrestore(&priv->tx_lock, flags); |
276 | 275 | ||
277 | if (wc->status != IB_WC_SUCCESS && | 276 | if (wc->status != IB_WC_SUCCESS && |
@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) | |||
283 | 282 | ||
284 | static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc) | 283 | static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc) |
285 | { | 284 | { |
286 | if (wc->wr_id & IPOIB_OP_RECV) | 285 | if (wc->wr_id & IPOIB_CM_OP_SRQ) |
286 | ipoib_cm_handle_rx_wc(dev, wc); | ||
287 | else if (wc->wr_id & IPOIB_OP_RECV) | ||
287 | ipoib_ib_handle_rx_wc(dev, wc); | 288 | ipoib_ib_handle_rx_wc(dev, wc); |
288 | else | 289 | else |
289 | ipoib_ib_handle_tx_wc(dev, wc); | 290 | ipoib_ib_handle_tx_wc(dev, wc); |
@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, | |||
327 | struct ipoib_tx_buf *tx_req; | 328 | struct ipoib_tx_buf *tx_req; |
328 | u64 addr; | 329 | u64 addr; |
329 | 330 | ||
330 | if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) { | 331 | if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) { |
331 | ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", | 332 | ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", |
332 | skb->len, dev->mtu + INFINIBAND_ALEN); | 333 | skb->len, priv->mcast_mtu + INFINIBAND_ALEN); |
333 | ++priv->stats.tx_dropped; | 334 | ++priv->stats.tx_dropped; |
334 | ++priv->stats.tx_errors; | 335 | ++priv->stats.tx_errors; |
335 | dev_kfree_skb_any(skb); | 336 | ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); |
336 | return; | 337 | return; |
337 | } | 338 | } |
338 | 339 | ||
@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, | |||
372 | if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { | 373 | if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { |
373 | ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); | 374 | ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); |
374 | netif_stop_queue(dev); | 375 | netif_stop_queue(dev); |
376 | set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); | ||
375 | } | 377 | } |
376 | } | 378 | } |
377 | } | 379 | } |
@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev) | |||
424 | return -1; | 426 | return -1; |
425 | } | 427 | } |
426 | 428 | ||
429 | ret = ipoib_cm_dev_open(dev); | ||
430 | if (ret) { | ||
431 | ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); | ||
432 | ipoib_ib_dev_stop(dev); | ||
433 | return -1; | ||
434 | } | ||
435 | |||
427 | clear_bit(IPOIB_STOP_REAPER, &priv->flags); | 436 | clear_bit(IPOIB_STOP_REAPER, &priv->flags); |
428 | queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); | 437 | queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); |
429 | 438 | ||
@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev) | |||
509 | 518 | ||
510 | clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); | 519 | clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); |
511 | 520 | ||
521 | ipoib_cm_dev_stop(dev); | ||
522 | |||
512 | /* | 523 | /* |
513 | * Move our QP to the error state and then reinitialize in | 524 | * Move our QP to the error state and then reinitialize in |
514 | * when all work requests have completed or have been flushed. | 525 | * when all work requests have completed or have been flushed. |