aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/ulp/ipoib/ipoib_main.c
diff options
context:
space:
mode:
author: Michael S. Tsirkin <mst@mellanox.co.il> 2007-02-05 15:12:23 -0500
committer: Roland Dreier <rolandd@cisco.com> 2007-02-10 11:00:48 -0500
commit: 839fcaba355abaffb7b44f0f4504093acb0b11cf (patch)
tree: 9e23f61ab0569ff144e6d9d4cb6a0887783f923c /drivers/infiniband/ulp/ipoib/ipoib_main.c
parent: 9a6b090c0d1cd5c90f21db772dbe2fbcf14366de (diff)
IPoIB: Connected mode experimental support
The following patch adds experimental support for IPoIB connected mode, as defined by the draft from the IETF ipoib working group. The idea is to increase performance by increasing the MTU from the maximum of 2K (theoretically 4K) supported by IPoIB on top of UD. With this code, I'm able to get 800 MByte/sec or more with netperf without options on a Mellanox 4x back-to-back DDR system.

Some notes on the code:
1. SRQ is used for scalability to large cluster sizes.
2. Only RC connections are used (UC does not support SRQ now).
3. Retry count is set to 0 since the spec draft warns against retries.
4. Each connection is used for data transfers in only 1 direction, so each connection is either active (TX) or passive (RX). 2 sides that want to communicate create 2 connections.
5. Each active (TX) connection has a separate CQ for send completions - this keeps the code simple without CQ resize and other tricks.
6. To detect stale passive side connections (where the remote side is down), we keep an LRU list of passive connections (updated once per second per connection) and destroy a connection after it has been unused for several seconds. The LRU rule makes it possible to avoid scanning connections that have recently been active.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/ulp/ipoib/ipoib_main.c')
-rw-r--r-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 63
1 files changed, 58 insertions, 5 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index af5ee2ec4499..18d27fd352ad 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -49,8 +49,6 @@
49 49
50#include <net/dst.h> 50#include <net/dst.h>
51 51
52#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
53
54MODULE_AUTHOR("Roland Dreier"); 52MODULE_AUTHOR("Roland Dreier");
55MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 53MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
56MODULE_LICENSE("Dual BSD/GPL"); 54MODULE_LICENSE("Dual BSD/GPL");
@@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev)
145 143
146 netif_stop_queue(dev); 144 netif_stop_queue(dev);
147 145
146 clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
147
148 /* 148 /*
149 * Now flush workqueue to make sure a scheduled task doesn't 149 * Now flush workqueue to make sure a scheduled task doesn't
150 * bring our internal state back up. 150 * bring our internal state back up.
@@ -178,8 +178,18 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
178{ 178{
179 struct ipoib_dev_priv *priv = netdev_priv(dev); 179 struct ipoib_dev_priv *priv = netdev_priv(dev);
180 180
181 if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) 181 /* dev->mtu > 2K ==> connected mode */
182 if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
183 if (new_mtu > priv->mcast_mtu)
184 ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
185 priv->mcast_mtu);
186 dev->mtu = new_mtu;
187 return 0;
188 }
189
190 if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
182 return -EINVAL; 191 return -EINVAL;
192 }
183 193
184 priv->admin_mtu = new_mtu; 194 priv->admin_mtu = new_mtu;
185 195
@@ -414,6 +424,20 @@ static void path_rec_completion(int status,
414 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 424 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
415 sizeof(union ib_gid)); 425 sizeof(union ib_gid));
416 426
427 if (ipoib_cm_enabled(dev, neigh->neighbour)) {
428 if (!ipoib_cm_get(neigh))
429 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
430 path,
431 neigh));
432 if (!ipoib_cm_get(neigh)) {
433 list_del(&neigh->list);
434 if (neigh->ah)
435 ipoib_put_ah(neigh->ah);
436 ipoib_neigh_free(dev, neigh);
437 continue;
438 }
439 }
440
417 while ((skb = __skb_dequeue(&neigh->queue))) 441 while ((skb = __skb_dequeue(&neigh->queue)))
418 __skb_queue_tail(&skqueue, skb); 442 __skb_queue_tail(&skqueue, skb);
419 } 443 }
@@ -520,7 +544,25 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
520 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 544 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
521 sizeof(union ib_gid)); 545 sizeof(union ib_gid));
522 546
523 ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); 547 if (ipoib_cm_enabled(dev, neigh->neighbour)) {
548 if (!ipoib_cm_get(neigh))
549 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
550 if (!ipoib_cm_get(neigh)) {
551 list_del(&neigh->list);
552 if (neigh->ah)
553 ipoib_put_ah(neigh->ah);
554 ipoib_neigh_free(dev, neigh);
555 goto err_drop;
556 }
557 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
558 __skb_queue_tail(&neigh->queue, skb);
559 else {
560 ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
561 skb_queue_len(&neigh->queue));
562 goto err_drop;
563 }
564 } else
565 ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
524 } else { 566 } else {
525 neigh->ah = NULL; 567 neigh->ah = NULL;
526 568
@@ -538,6 +580,7 @@ err_list:
538 580
539err_path: 581err_path:
540 ipoib_neigh_free(dev, neigh); 582 ipoib_neigh_free(dev, neigh);
583err_drop:
541 ++priv->stats.tx_dropped; 584 ++priv->stats.tx_dropped;
542 dev_kfree_skb_any(skb); 585 dev_kfree_skb_any(skb);
543 586
@@ -640,7 +683,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
640 683
641 neigh = *to_ipoib_neigh(skb->dst->neighbour); 684 neigh = *to_ipoib_neigh(skb->dst->neighbour);
642 685
643 if (likely(neigh->ah)) { 686 if (ipoib_cm_get(neigh)) {
687 if (ipoib_cm_up(neigh)) {
688 ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
689 goto out;
690 }
691 } else if (neigh->ah) {
644 if (unlikely(memcmp(&neigh->dgid.raw, 692 if (unlikely(memcmp(&neigh->dgid.raw,
645 skb->dst->neighbour->ha + 4, 693 skb->dst->neighbour->ha + 4,
646 sizeof(union ib_gid)))) { 694 sizeof(union ib_gid)))) {
@@ -805,6 +853,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
805 neigh->neighbour = neighbour; 853 neigh->neighbour = neighbour;
806 *to_ipoib_neigh(neighbour) = neigh; 854 *to_ipoib_neigh(neighbour) = neigh;
807 skb_queue_head_init(&neigh->queue); 855 skb_queue_head_init(&neigh->queue);
856 ipoib_cm_set(neigh, NULL);
808 857
809 return neigh; 858 return neigh;
810} 859}
@@ -818,6 +867,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
818 ++priv->stats.tx_dropped; 867 ++priv->stats.tx_dropped;
819 dev_kfree_skb_any(skb); 868 dev_kfree_skb_any(skb);
820 } 869 }
870 if (ipoib_cm_get(neigh))
871 ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
821 kfree(neigh); 872 kfree(neigh);
822} 873}
823 874
@@ -1080,6 +1131,8 @@ static struct net_device *ipoib_add_port(const char *format,
1080 1131
1081 ipoib_create_debug_files(priv->dev); 1132 ipoib_create_debug_files(priv->dev);
1082 1133
1134 if (ipoib_cm_add_mode_attr(priv->dev))
1135 goto sysfs_failed;
1083 if (ipoib_add_pkey_attr(priv->dev)) 1136 if (ipoib_add_pkey_attr(priv->dev))
1084 goto sysfs_failed; 1137 goto sysfs_failed;
1085 if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) 1138 if (device_create_file(&priv->dev->dev, &dev_attr_create_child))