aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@mellanox.co.il>2007-02-05 15:12:23 -0500
committerRoland Dreier <rolandd@cisco.com>2007-02-10 11:00:48 -0500
commit839fcaba355abaffb7b44f0f4504093acb0b11cf (patch)
tree9e23f61ab0569ff144e6d9d4cb6a0887783f923c
parent9a6b090c0d1cd5c90f21db772dbe2fbcf14366de (diff)
IPoIB: Connected mode experimental support
The following patch adds experimental support for IPoIB connected mode, as defined by the draft from the IETF ipoib working group. The idea is to increase performance by increasing the MTU from the maximum of 2K (theoretically 4K) supported by IPoIB on top of UD. With this code, I'm able to get 800MByte/sec or more with netperf without options on a Mellanox 4x back-to-back DDR system. Some notes on code: 1. SRQ is used for scalability to large cluster sizes 2. Only RC connections are used (UC does not support SRQ now) 3. Retry count is set to 0 since spec draft warns against retries 4. Each connection is used for data transfers in only 1 direction, so each connection is either active(TX) or passive (RX). 2 sides that want to communicate create 2 connections. 5. Each active (TX) connection has a separate CQ for send completions - this keeps the code simple without CQ resize and other tricks 6. To detect stale passive side connections (where the remote side is down), we keep an LRU list of passive connections (updated once per second per connection) and destroy a connection after it has been unused for several seconds. The LRU rule makes it possible to avoid scanning connections that have recently been active. Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il> Signed-off-by: Roland Dreier <rolandd@cisco.com>
-rw-r--r--drivers/infiniband/ulp/ipoib/Kconfig16
-rw-r--r--drivers/infiniband/ulp/ipoib/Makefile1
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h215
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c1237
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c29
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c63
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c4
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_verbs.c40
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_vlan.c2
9 files changed, 1575 insertions, 32 deletions
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index c75322d820d4..af78ccc4ce71 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -1,6 +1,6 @@
1config INFINIBAND_IPOIB 1config INFINIBAND_IPOIB
2 tristate "IP-over-InfiniBand" 2 tristate "IP-over-InfiniBand"
3 depends on INFINIBAND && NETDEVICES && INET 3 depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n)
4 ---help--- 4 ---help---
5 Support for the IP-over-InfiniBand protocol (IPoIB). This 5 Support for the IP-over-InfiniBand protocol (IPoIB). This
6 transports IP packets over InfiniBand so you can use your IB 6 transports IP packets over InfiniBand so you can use your IB
@@ -8,6 +8,20 @@ config INFINIBAND_IPOIB
8 8
9 See Documentation/infiniband/ipoib.txt for more information 9 See Documentation/infiniband/ipoib.txt for more information
10 10
11config INFINIBAND_IPOIB_CM
12 bool "IP-over-InfiniBand Connected Mode support"
13 depends on INFINIBAND_IPOIB && EXPERIMENTAL
14 default n
15 ---help---
16 This option enables experimental support for IPoIB connected mode.
17 After enabling this option, you need to switch to connected mode through
18 /sys/class/net/ibXXX/mode to actually create connections, and then increase
19 the interface MTU with e.g. ifconfig ib0 mtu 65520.
20
21 WARNING: Enabling connected mode will trigger some
22 packet drops for multicast and UD mode traffic from this interface,
23 unless you limit mtu for these destinations to 2044.
24
11config INFINIBAND_IPOIB_DEBUG 25config INFINIBAND_IPOIB_DEBUG
12 bool "IP-over-InfiniBand debugging" if EMBEDDED 26 bool "IP-over-InfiniBand debugging" if EMBEDDED
13 depends on INFINIBAND_IPOIB 27 depends on INFINIBAND_IPOIB
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
index 8935e74ae3f8..98ee38e8c2c4 100644
--- a/drivers/infiniband/ulp/ipoib/Makefile
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \
5 ipoib_multicast.o \ 5 ipoib_multicast.o \
6 ipoib_verbs.o \ 6 ipoib_verbs.o \
7 ipoib_vlan.o 7 ipoib_vlan.o
8ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
8ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o 9ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
9 10
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 07deee8f81ce..2594db2030b3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -62,6 +62,10 @@ enum {
62 62
63 IPOIB_ENCAP_LEN = 4, 63 IPOIB_ENCAP_LEN = 4,
64 64
65 IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */
66 IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
67 IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
68 IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
65 IPOIB_RX_RING_SIZE = 128, 69 IPOIB_RX_RING_SIZE = 128,
66 IPOIB_TX_RING_SIZE = 64, 70 IPOIB_TX_RING_SIZE = 64,
67 IPOIB_MAX_QUEUE_SIZE = 8192, 71 IPOIB_MAX_QUEUE_SIZE = 8192,
@@ -81,6 +85,8 @@ enum {
81 IPOIB_MCAST_RUN = 6, 85 IPOIB_MCAST_RUN = 6,
82 IPOIB_STOP_REAPER = 7, 86 IPOIB_STOP_REAPER = 7,
83 IPOIB_MCAST_STARTED = 8, 87 IPOIB_MCAST_STARTED = 8,
88 IPOIB_FLAG_NETIF_STOPPED = 9,
89 IPOIB_FLAG_ADMIN_CM = 10,
84 90
85 IPOIB_MAX_BACKOFF_SECONDS = 16, 91 IPOIB_MAX_BACKOFF_SECONDS = 16,
86 92
@@ -90,6 +96,13 @@ enum {
90 IPOIB_MCAST_FLAG_ATTACHED = 3, 96 IPOIB_MCAST_FLAG_ATTACHED = 3,
91}; 97};
92 98
99#define IPOIB_OP_RECV (1ul << 31)
100#ifdef CONFIG_INFINIBAND_IPOIB_CM
101#define IPOIB_CM_OP_SRQ (1ul << 30)
102#else
103#define IPOIB_CM_OP_SRQ (0)
104#endif
105
93/* structs */ 106/* structs */
94 107
95struct ipoib_header { 108struct ipoib_header {
@@ -113,6 +126,59 @@ struct ipoib_tx_buf {
113 u64 mapping; 126 u64 mapping;
114}; 127};
115 128
129struct ib_cm_id;
130
131struct ipoib_cm_data {
132 __be32 qpn; /* High byte MUST be ignored on receive */
133 __be32 mtu;
134};
135
136struct ipoib_cm_rx {
137 struct ib_cm_id *id;
138 struct ib_qp *qp;
139 struct list_head list;
140 struct net_device *dev;
141 unsigned long jiffies;
142};
143
144struct ipoib_cm_tx {
145 struct ib_cm_id *id;
146 struct ib_cq *cq;
147 struct ib_qp *qp;
148 struct list_head list;
149 struct net_device *dev;
150 struct ipoib_neigh *neigh;
151 struct ipoib_path *path;
152 struct ipoib_tx_buf *tx_ring;
153 unsigned tx_head;
154 unsigned tx_tail;
155 unsigned long flags;
156 u32 mtu;
157 struct ib_wc ibwc[IPOIB_NUM_WC];
158};
159
160struct ipoib_cm_rx_buf {
161 struct sk_buff *skb;
162 u64 mapping[IPOIB_CM_RX_SG];
163};
164
165struct ipoib_cm_dev_priv {
166 struct ib_srq *srq;
167 struct ipoib_cm_rx_buf *srq_ring;
168 struct ib_cm_id *id;
169 struct list_head passive_ids;
170 struct work_struct start_task;
171 struct work_struct reap_task;
172 struct work_struct skb_task;
173 struct delayed_work stale_task;
174 struct sk_buff_head skb_queue;
175 struct list_head start_list;
176 struct list_head reap_list;
177 struct ib_wc ibwc[IPOIB_NUM_WC];
178 struct ib_sge rx_sge[IPOIB_CM_RX_SG];
179 struct ib_recv_wr rx_wr;
180};
181
116/* 182/*
117 * Device private locking: tx_lock protects members used in TX fast 183 * Device private locking: tx_lock protects members used in TX fast
118 * path (and we use LLTX so upper layers don't do extra locking). 184 * path (and we use LLTX so upper layers don't do extra locking).
@@ -179,6 +245,10 @@ struct ipoib_dev_priv {
179 struct list_head child_intfs; 245 struct list_head child_intfs;
180 struct list_head list; 246 struct list_head list;
181 247
248#ifdef CONFIG_INFINIBAND_IPOIB_CM
249 struct ipoib_cm_dev_priv cm;
250#endif
251
182#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 252#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
183 struct list_head fs_list; 253 struct list_head fs_list;
184 struct dentry *mcg_dentry; 254 struct dentry *mcg_dentry;
@@ -212,6 +282,9 @@ struct ipoib_path {
212 282
213struct ipoib_neigh { 283struct ipoib_neigh {
214 struct ipoib_ah *ah; 284 struct ipoib_ah *ah;
285#ifdef CONFIG_INFINIBAND_IPOIB_CM
286 struct ipoib_cm_tx *cm;
287#endif
215 union ib_gid dgid; 288 union ib_gid dgid;
216 struct sk_buff_head queue; 289 struct sk_buff_head queue;
217 290
@@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
315void ipoib_pkey_poll(struct work_struct *work); 388void ipoib_pkey_poll(struct work_struct *work);
316int ipoib_pkey_dev_delay_open(struct net_device *dev); 389int ipoib_pkey_dev_delay_open(struct net_device *dev);
317 390
391#ifdef CONFIG_INFINIBAND_IPOIB_CM
392
393#define IPOIB_FLAGS_RC 0x80
394#define IPOIB_FLAGS_UC 0x40
395
396/* We don't support UC connections at the moment */
397#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC))
398
399static inline int ipoib_cm_admin_enabled(struct net_device *dev)
400{
401 struct ipoib_dev_priv *priv = netdev_priv(dev);
402 return IPOIB_CM_SUPPORTED(dev->dev_addr) &&
403 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
404}
405
406static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
407{
408 struct ipoib_dev_priv *priv = netdev_priv(dev);
409 return IPOIB_CM_SUPPORTED(n->ha) &&
410 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
411}
412
413static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
414
415{
416 return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
417}
418
419static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
420{
421 return neigh->cm;
422}
423
424static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
425{
426 neigh->cm = tx;
427}
428
429void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
430int ipoib_cm_dev_open(struct net_device *dev);
431void ipoib_cm_dev_stop(struct net_device *dev);
432int ipoib_cm_dev_init(struct net_device *dev);
433int ipoib_cm_add_mode_attr(struct net_device *dev);
434void ipoib_cm_dev_cleanup(struct net_device *dev);
435struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
436 struct ipoib_neigh *neigh);
437void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
438void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
439 unsigned int mtu);
440void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
441#else
442
443struct ipoib_cm_tx;
444
445static inline int ipoib_cm_admin_enabled(struct net_device *dev)
446{
447 return 0;
448}
449static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
450
451{
452 return 0;
453}
454
455static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
456
457{
458 return 0;
459}
460
461static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
462{
463 return NULL;
464}
465
466static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
467{
468}
469
470static inline
471void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
472{
473 return;
474}
475
476static inline
477int ipoib_cm_dev_open(struct net_device *dev)
478{
479 return 0;
480}
481
482static inline
483void ipoib_cm_dev_stop(struct net_device *dev)
484{
485 return;
486}
487
488static inline
489int ipoib_cm_dev_init(struct net_device *dev)
490{
491 return -ENOSYS;
492}
493
494static inline
495void ipoib_cm_dev_cleanup(struct net_device *dev)
496{
497 return;
498}
499
500static inline
501struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
502 struct ipoib_neigh *neigh)
503{
504 return NULL;
505}
506
507static inline
508void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
509{
510 return;
511}
512
513static inline
514int ipoib_cm_add_mode_attr(struct net_device *dev)
515{
516 return 0;
517}
518
519static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
520 unsigned int mtu)
521{
522 dev_kfree_skb_any(skb);
523}
524
525static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
526{
527}
528
529#endif
530
318#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 531#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
319void ipoib_create_debug_files(struct net_device *dev); 532void ipoib_create_debug_files(struct net_device *dev);
320void ipoib_delete_debug_files(struct net_device *dev); 533void ipoib_delete_debug_files(struct net_device *dev);
@@ -392,4 +605,6 @@ extern int ipoib_debug_level;
392 605
393#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw) 606#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw)
394 607
608#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
609
395#endif /* _IPOIB_H */ 610#endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
new file mode 100644
index 000000000000..2d483874a589
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -0,0 +1,1237 @@
1/*
2 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 * $Id$
33 */
34
35#include <rdma/ib_cm.h>
36#include <rdma/ib_cache.h>
37#include <net/dst.h>
38#include <net/icmp.h>
39#include <linux/icmpv6.h>
40
41#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
42static int data_debug_level;
43
44module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
45MODULE_PARM_DESC(cm_data_debug_level,
46 "Enable data path debug tracing for connected mode if > 0");
47#endif
48
49#include "ipoib.h"
50
51#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
52
53#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
54#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ)
55#define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
56#define IPOIB_CM_RX_UPDATE_MASK (0x3)
57
58struct ipoib_cm_id {
59 struct ib_cm_id *id;
60 int flags;
61 u32 remote_qpn;
62 u32 remote_mtu;
63};
64
65static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
66 struct ib_cm_event *event);
67
68static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv,
69 u64 mapping[IPOIB_CM_RX_SG])
70{
71 int i;
72
73 ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
74
75 for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i)
76 ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
77}
78
79static int ipoib_cm_post_receive(struct net_device *dev, int id)
80{
81 struct ipoib_dev_priv *priv = netdev_priv(dev);
82 struct ib_recv_wr *bad_wr;
83 int i, ret;
84
85 priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
86
87 for (i = 0; i < IPOIB_CM_RX_SG; ++i)
88 priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
89
90 ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
91 if (unlikely(ret)) {
92 ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
93 ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping);
94 dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
95 priv->cm.srq_ring[id].skb = NULL;
96 }
97
98 return ret;
99}
100
101static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
102 u64 mapping[IPOIB_CM_RX_SG])
103{
104 struct ipoib_dev_priv *priv = netdev_priv(dev);
105 struct sk_buff *skb;
106 int i;
107
108 skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
109 if (unlikely(!skb))
110 return -ENOMEM;
111
112 /*
113 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
114 * IP header to a multiple of 16.
115 */
116 skb_reserve(skb, 12);
117
118 mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
119 DMA_FROM_DEVICE);
120 if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
121 dev_kfree_skb_any(skb);
122 return -EIO;
123 }
124
125 for (i = 0; i < IPOIB_CM_RX_SG - 1; i++) {
126 struct page *page = alloc_page(GFP_ATOMIC);
127
128 if (!page)
129 goto partial_error;
130 skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
131
132 mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
133 0, PAGE_SIZE, DMA_TO_DEVICE);
134 if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
135 goto partial_error;
136 }
137
138 priv->cm.srq_ring[id].skb = skb;
139 return 0;
140
141partial_error:
142
143 ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
144
145 for (; i >= 0; --i)
146 ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
147
148 kfree_skb(skb);
149 return -ENOMEM;
150}
151
152static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
153 struct ipoib_cm_rx *p)
154{
155 struct ipoib_dev_priv *priv = netdev_priv(dev);
156 struct ib_qp_init_attr attr = {
157 .send_cq = priv->cq, /* does not matter, we never send anything */
158 .recv_cq = priv->cq,
159 .srq = priv->cm.srq,
160 .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
161 .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
162 .sq_sig_type = IB_SIGNAL_ALL_WR,
163 .qp_type = IB_QPT_RC,
164 .qp_context = p,
165 };
166 return ib_create_qp(priv->pd, &attr);
167}
168
169static int ipoib_cm_modify_rx_qp(struct net_device *dev,
170 struct ib_cm_id *cm_id, struct ib_qp *qp,
171 unsigned psn)
172{
173 struct ipoib_dev_priv *priv = netdev_priv(dev);
174 struct ib_qp_attr qp_attr;
175 int qp_attr_mask, ret;
176
177 qp_attr.qp_state = IB_QPS_INIT;
178 ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
179 if (ret) {
180 ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
181 return ret;
182 }
183 ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
184 if (ret) {
185 ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
186 return ret;
187 }
188 qp_attr.qp_state = IB_QPS_RTR;
189 ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
190 if (ret) {
191 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
192 return ret;
193 }
194 qp_attr.rq_psn = psn;
195 ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
196 if (ret) {
197 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
198 return ret;
199 }
200 return 0;
201}
202
203static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
204 struct ib_qp *qp, struct ib_cm_req_event_param *req,
205 unsigned psn)
206{
207 struct ipoib_dev_priv *priv = netdev_priv(dev);
208 struct ipoib_cm_data data = {};
209 struct ib_cm_rep_param rep = {};
210
211 data.qpn = cpu_to_be32(priv->qp->qp_num);
212 data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
213
214 rep.private_data = &data;
215 rep.private_data_len = sizeof data;
216 rep.flow_control = 0;
217 rep.rnr_retry_count = req->rnr_retry_count;
218 rep.target_ack_delay = 20; /* FIXME */
219 rep.srq = 1;
220 rep.qp_num = qp->qp_num;
221 rep.starting_psn = psn;
222 return ib_send_cm_rep(cm_id, &rep);
223}
224
225static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
226{
227 struct net_device *dev = cm_id->context;
228 struct ipoib_dev_priv *priv = netdev_priv(dev);
229 struct ipoib_cm_rx *p;
230 unsigned long flags;
231 unsigned psn;
232 int ret;
233
234 ipoib_dbg(priv, "REQ arrived\n");
235 p = kzalloc(sizeof *p, GFP_KERNEL);
236 if (!p)
237 return -ENOMEM;
238 p->dev = dev;
239 p->id = cm_id;
240 p->qp = ipoib_cm_create_rx_qp(dev, p);
241 if (IS_ERR(p->qp)) {
242 ret = PTR_ERR(p->qp);
243 goto err_qp;
244 }
245
246 psn = random32() & 0xffffff;
247 ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
248 if (ret)
249 goto err_modify;
250
251 ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
252 if (ret) {
253 ipoib_warn(priv, "failed to send REP: %d\n", ret);
254 goto err_rep;
255 }
256
257 cm_id->context = p;
258 p->jiffies = jiffies;
259 spin_lock_irqsave(&priv->lock, flags);
260 list_add(&p->list, &priv->cm.passive_ids);
261 spin_unlock_irqrestore(&priv->lock, flags);
262 queue_delayed_work(ipoib_workqueue,
263 &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
264 return 0;
265
266err_rep:
267err_modify:
268 ib_destroy_qp(p->qp);
269err_qp:
270 kfree(p);
271 return ret;
272}
273
274static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
275 struct ib_cm_event *event)
276{
277 struct ipoib_cm_rx *p;
278 struct ipoib_dev_priv *priv;
279 unsigned long flags;
280 int ret;
281
282 switch (event->event) {
283 case IB_CM_REQ_RECEIVED:
284 return ipoib_cm_req_handler(cm_id, event);
285 case IB_CM_DREQ_RECEIVED:
286 p = cm_id->context;
287 ib_send_cm_drep(cm_id, NULL, 0);
288 /* Fall through */
289 case IB_CM_REJ_RECEIVED:
290 p = cm_id->context;
291 priv = netdev_priv(p->dev);
292 spin_lock_irqsave(&priv->lock, flags);
293 if (list_empty(&p->list))
294 ret = 0; /* Connection is going away already. */
295 else {
296 list_del_init(&p->list);
297 ret = -ECONNRESET;
298 }
299 spin_unlock_irqrestore(&priv->lock, flags);
300 if (ret) {
301 ib_destroy_qp(p->qp);
302 kfree(p);
303 return ret;
304 }
305 return 0;
306 default:
307 return 0;
308 }
309}
310/* Adjust length of skb with fragments to match received data */
311static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
312 unsigned int length)
313{
314 int i, num_frags;
315 unsigned int size;
316
317 /* put header into skb */
318 size = min(length, hdr_space);
319 skb->tail += size;
320 skb->len += size;
321 length -= size;
322
323 num_frags = skb_shinfo(skb)->nr_frags;
324 for (i = 0; i < num_frags; i++) {
325 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
326
327 if (length == 0) {
328 /* don't need this page */
329 __free_page(frag->page);
330 --skb_shinfo(skb)->nr_frags;
331 } else {
332 size = min(length, (unsigned) PAGE_SIZE);
333
334 frag->size = size;
335 skb->data_len += size;
336 skb->truesize += size;
337 skb->len += size;
338 length -= size;
339 }
340 }
341}
342
343void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
344{
345 struct ipoib_dev_priv *priv = netdev_priv(dev);
346 unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
347 struct sk_buff *skb;
348 struct ipoib_cm_rx *p;
349 unsigned long flags;
350 u64 mapping[IPOIB_CM_RX_SG];
351
352 ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n",
353 wr_id, wc->opcode, wc->status);
354
355 if (unlikely(wr_id >= ipoib_recvq_size)) {
356 ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
357 wr_id, ipoib_recvq_size);
358 return;
359 }
360
361 skb = priv->cm.srq_ring[wr_id].skb;
362
363 if (unlikely(wc->status != IB_WC_SUCCESS)) {
364 ipoib_dbg(priv, "cm recv error "
365 "(status=%d, wrid=%d vend_err %x)\n",
366 wc->status, wr_id, wc->vendor_err);
367 ++priv->stats.rx_dropped;
368 goto repost;
369 }
370
371 if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) {
372 p = wc->qp->qp_context;
373 if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
374 spin_lock_irqsave(&priv->lock, flags);
375 p->jiffies = jiffies;
376 /* Move this entry to list head, but do
377 * not re-add it if it has been removed. */
378 if (!list_empty(&p->list))
379 list_move(&p->list, &priv->cm.passive_ids);
380 spin_unlock_irqrestore(&priv->lock, flags);
381 queue_delayed_work(ipoib_workqueue,
382 &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
383 }
384 }
385
386 if (unlikely(ipoib_cm_alloc_rx_skb(dev, wr_id, mapping))) {
387 /*
388 * If we can't allocate a new RX buffer, dump
389 * this packet and reuse the old buffer.
390 */
391 ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
392 ++priv->stats.rx_dropped;
393 goto repost;
394 }
395
396 ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[wr_id].mapping);
397 memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, sizeof mapping);
398
399 ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
400 wc->byte_len, wc->slid);
401
402 skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len);
403
404 skb->protocol = ((struct ipoib_header *) skb->data)->proto;
405 skb->mac.raw = skb->data;
406 skb_pull(skb, IPOIB_ENCAP_LEN);
407
408 dev->last_rx = jiffies;
409 ++priv->stats.rx_packets;
410 priv->stats.rx_bytes += skb->len;
411
412 skb->dev = dev;
413 /* XXX get correct PACKET_ type here */
414 skb->pkt_type = PACKET_HOST;
415 netif_rx_ni(skb);
416
417repost:
418 if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
419 ipoib_warn(priv, "ipoib_cm_post_receive failed "
420 "for buf %d\n", wr_id);
421}
422
423static inline int post_send(struct ipoib_dev_priv *priv,
424 struct ipoib_cm_tx *tx,
425 unsigned int wr_id,
426 u64 addr, int len)
427{
428 struct ib_send_wr *bad_wr;
429
430 priv->tx_sge.addr = addr;
431 priv->tx_sge.length = len;
432
433 priv->tx_wr.wr_id = wr_id;
434
435 return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
436}
437
438void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
439{
440 struct ipoib_dev_priv *priv = netdev_priv(dev);
441 struct ipoib_tx_buf *tx_req;
442 u64 addr;
443
444 if (unlikely(skb->len > tx->mtu)) {
445 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
446 skb->len, tx->mtu);
447 ++priv->stats.tx_dropped;
448 ++priv->stats.tx_errors;
449 ipoib_cm_skb_too_long(dev, skb, tx->mtu - INFINIBAND_ALEN);
450 return;
451 }
452
453 ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
454 tx->tx_head, skb->len, tx->qp->qp_num);
455
456 /*
457 * We put the skb into the tx_ring _before_ we call post_send()
458 * because it's entirely possible that the completion handler will
459 * run before we execute anything after the post_send(). That
460 * means we have to make sure everything is properly recorded and
461 * our state is consistent before we call post_send().
462 */
463 tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
464 tx_req->skb = skb;
465 addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
466 if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
467 ++priv->stats.tx_errors;
468 dev_kfree_skb_any(skb);
469 return;
470 }
471
472 tx_req->mapping = addr;
473
474 if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
475 addr, skb->len))) {
476 ipoib_warn(priv, "post_send failed\n");
477 ++priv->stats.tx_errors;
478 ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
479 dev_kfree_skb_any(skb);
480 } else {
481 dev->trans_start = jiffies;
482 ++tx->tx_head;
483
484 if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
485 ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
486 tx->qp->qp_num);
487 netif_stop_queue(dev);
488 set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
489 }
490 }
491}
492
493static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
494 struct ib_wc *wc)
495{
496 struct ipoib_dev_priv *priv = netdev_priv(dev);
497 unsigned int wr_id = wc->wr_id;
498 struct ipoib_tx_buf *tx_req;
499 unsigned long flags;
500
501 ipoib_dbg_data(priv, "cm send completion: id %d, op %d, status: %d\n",
502 wr_id, wc->opcode, wc->status);
503
504 if (unlikely(wr_id >= ipoib_sendq_size)) {
505 ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
506 wr_id, ipoib_sendq_size);
507 return;
508 }
509
510 tx_req = &tx->tx_ring[wr_id];
511
512 ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
513
514 /* FIXME: is this right? Shouldn't we only increment on success? */
515 ++priv->stats.tx_packets;
516 priv->stats.tx_bytes += tx_req->skb->len;
517
518 dev_kfree_skb_any(tx_req->skb);
519
520 spin_lock_irqsave(&priv->tx_lock, flags);
521 ++tx->tx_tail;
522 if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
523 tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
524 clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
525 netif_wake_queue(dev);
526 }
527
528 if (wc->status != IB_WC_SUCCESS &&
529 wc->status != IB_WC_WR_FLUSH_ERR) {
530 struct ipoib_neigh *neigh;
531
532 ipoib_dbg(priv, "failed cm send event "
533 "(status=%d, wrid=%d vend_err %x)\n",
534 wc->status, wr_id, wc->vendor_err);
535
536 spin_lock(&priv->lock);
537 neigh = tx->neigh;
538
539 if (neigh) {
540 neigh->cm = NULL;
541 list_del(&neigh->list);
542 if (neigh->ah)
543 ipoib_put_ah(neigh->ah);
544 ipoib_neigh_free(dev, neigh);
545
546 tx->neigh = NULL;
547 }
548
549 /* queue would be re-started anyway when TX is destroyed,
550 * but it makes sense to do it ASAP here. */
551 if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
552 netif_wake_queue(dev);
553
554 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
555 list_move(&tx->list, &priv->cm.reap_list);
556 queue_work(ipoib_workqueue, &priv->cm.reap_task);
557 }
558
559 clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
560
561 spin_unlock(&priv->lock);
562 }
563
564 spin_unlock_irqrestore(&priv->tx_lock, flags);
565}
566
567static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
568{
569 struct ipoib_cm_tx *tx = tx_ptr;
570 int n, i;
571
572 ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
573 do {
574 n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
575 for (i = 0; i < n; ++i)
576 ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
577 } while (n == IPOIB_NUM_WC);
578}
579
580int ipoib_cm_dev_open(struct net_device *dev)
581{
582 struct ipoib_dev_priv *priv = netdev_priv(dev);
583 int ret;
584
585 if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
586 return 0;
587
588 priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
589 if (IS_ERR(priv->cm.id)) {
590 printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
591 return IS_ERR(priv->cm.id);
592 }
593
594 ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
595 0, NULL);
596 if (ret) {
597 printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
598 IPOIB_CM_IETF_ID | priv->qp->qp_num);
599 ib_destroy_cm_id(priv->cm.id);
600 return ret;
601 }
602 return 0;
603}
604
605void ipoib_cm_dev_stop(struct net_device *dev)
606{
607 struct ipoib_dev_priv *priv = netdev_priv(dev);
608 struct ipoib_cm_rx *p;
609 unsigned long flags;
610
611 if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
612 return;
613
614 ib_destroy_cm_id(priv->cm.id);
615 spin_lock_irqsave(&priv->lock, flags);
616 while (!list_empty(&priv->cm.passive_ids)) {
617 p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
618 list_del_init(&p->list);
619 spin_unlock_irqrestore(&priv->lock, flags);
620 ib_destroy_cm_id(p->id);
621 ib_destroy_qp(p->qp);
622 kfree(p);
623 spin_lock_irqsave(&priv->lock, flags);
624 }
625 spin_unlock_irqrestore(&priv->lock, flags);
626
627 cancel_delayed_work(&priv->cm.stale_task);
628}
629
630static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
631{
632 struct ipoib_cm_tx *p = cm_id->context;
633 struct ipoib_dev_priv *priv = netdev_priv(p->dev);
634 struct ipoib_cm_data *data = event->private_data;
635 struct sk_buff_head skqueue;
636 struct ib_qp_attr qp_attr;
637 int qp_attr_mask, ret;
638 struct sk_buff *skb;
639 unsigned long flags;
640
641 p->mtu = be32_to_cpu(data->mtu);
642
643 if (p->mtu < priv->dev->mtu + IPOIB_ENCAP_LEN) {
644 ipoib_warn(priv, "Rejecting connection: mtu %d < device mtu %d + 4\n",
645 p->mtu, priv->dev->mtu);
646 return -EINVAL;
647 }
648
649 qp_attr.qp_state = IB_QPS_RTR;
650 ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
651 if (ret) {
652 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
653 return ret;
654 }
655
656 qp_attr.rq_psn = 0 /* FIXME */;
657 ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
658 if (ret) {
659 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
660 return ret;
661 }
662
663 qp_attr.qp_state = IB_QPS_RTS;
664 ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
665 if (ret) {
666 ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
667 return ret;
668 }
669 ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
670 if (ret) {
671 ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
672 return ret;
673 }
674
675 skb_queue_head_init(&skqueue);
676
677 spin_lock_irqsave(&priv->lock, flags);
678 set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
679 if (p->neigh)
680 while ((skb = __skb_dequeue(&p->neigh->queue)))
681 __skb_queue_tail(&skqueue, skb);
682 spin_unlock_irqrestore(&priv->lock, flags);
683
684 while ((skb = __skb_dequeue(&skqueue))) {
685 skb->dev = p->dev;
686 if (dev_queue_xmit(skb))
687 ipoib_warn(priv, "dev_queue_xmit failed "
688 "to requeue packet\n");
689 }
690
691 ret = ib_send_cm_rtu(cm_id, NULL, 0);
692 if (ret) {
693 ipoib_warn(priv, "failed to send RTU: %d\n", ret);
694 return ret;
695 }
696 return 0;
697}
698
699static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
700{
701 struct ipoib_dev_priv *priv = netdev_priv(dev);
702 struct ib_qp_init_attr attr = {};
703 attr.recv_cq = priv->cq;
704 attr.srq = priv->cm.srq;
705 attr.cap.max_send_wr = ipoib_sendq_size;
706 attr.cap.max_send_sge = 1;
707 attr.sq_sig_type = IB_SIGNAL_ALL_WR;
708 attr.qp_type = IB_QPT_RC;
709 attr.send_cq = cq;
710 return ib_create_qp(priv->pd, &attr);
711}
712
713static int ipoib_cm_send_req(struct net_device *dev,
714 struct ib_cm_id *id, struct ib_qp *qp,
715 u32 qpn,
716 struct ib_sa_path_rec *pathrec)
717{
718 struct ipoib_dev_priv *priv = netdev_priv(dev);
719 struct ipoib_cm_data data = {};
720 struct ib_cm_req_param req = {};
721
722 data.qpn = cpu_to_be32(priv->qp->qp_num);
723 data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
724
725 req.primary_path = pathrec;
726 req.alternate_path = NULL;
727 req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
728 req.qp_num = qp->qp_num;
729 req.qp_type = qp->qp_type;
730 req.private_data = &data;
731 req.private_data_len = sizeof data;
732 req.flow_control = 0;
733
734 req.starting_psn = 0; /* FIXME */
735
736 /*
737 * Pick some arbitrary defaults here; we could make these
738 * module parameters if anyone cared about setting them.
739 */
740 req.responder_resources = 4;
741 req.remote_cm_response_timeout = 20;
742 req.local_cm_response_timeout = 20;
743 req.retry_count = 0; /* RFC draft warns against retries */
744 req.rnr_retry_count = 0; /* RFC draft warns against retries */
745 req.max_cm_retries = 15;
746 req.srq = 1;
747 return ib_send_cm_req(id, &req);
748}
749
750static int ipoib_cm_modify_tx_init(struct net_device *dev,
751 struct ib_cm_id *cm_id, struct ib_qp *qp)
752{
753 struct ipoib_dev_priv *priv = netdev_priv(dev);
754 struct ib_qp_attr qp_attr;
755 int qp_attr_mask, ret;
756 ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
757 if (ret) {
758 ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret);
759 return ret;
760 }
761
762 qp_attr.qp_state = IB_QPS_INIT;
763 qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
764 qp_attr.port_num = priv->port;
765 qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
766
767 ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
768 if (ret) {
769 ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
770 return ret;
771 }
772 return 0;
773}
774
775static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
776 struct ib_sa_path_rec *pathrec)
777{
778 struct ipoib_dev_priv *priv = netdev_priv(p->dev);
779 int ret;
780
781 p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
782 GFP_KERNEL);
783 if (!p->tx_ring) {
784 ipoib_warn(priv, "failed to allocate tx ring\n");
785 ret = -ENOMEM;
786 goto err_tx;
787 }
788
789 p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
790 ipoib_sendq_size + 1);
791 if (IS_ERR(p->cq)) {
792 ret = PTR_ERR(p->cq);
793 ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
794 goto err_cq;
795 }
796
797 ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
798 if (ret) {
799 ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
800 goto err_req_notify;
801 }
802
803 p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
804 if (IS_ERR(p->qp)) {
805 ret = PTR_ERR(p->qp);
806 ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
807 goto err_qp;
808 }
809
810 p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
811 if (IS_ERR(p->id)) {
812 ret = PTR_ERR(p->id);
813 ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
814 goto err_id;
815 }
816
817 ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
818 if (ret) {
819 ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
820 goto err_modify;
821 }
822
823 ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
824 if (ret) {
825 ipoib_warn(priv, "failed to send cm req: %d\n", ret);
826 goto err_send_cm;
827 }
828
829 ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n",
830 p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn);
831
832 return 0;
833
834err_send_cm:
835err_modify:
836 ib_destroy_cm_id(p->id);
837err_id:
838 p->id = NULL;
839 ib_destroy_qp(p->qp);
840err_req_notify:
841err_qp:
842 p->qp = NULL;
843 ib_destroy_cq(p->cq);
844err_cq:
845 p->cq = NULL;
846err_tx:
847 return ret;
848}
849
850static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
851{
852 struct ipoib_dev_priv *priv = netdev_priv(p->dev);
853 struct ipoib_tx_buf *tx_req;
854
855 ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
856 p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
857
858 if (p->id)
859 ib_destroy_cm_id(p->id);
860
861 if (p->qp)
862 ib_destroy_qp(p->qp);
863
864 if (p->cq)
865 ib_destroy_cq(p->cq);
866
867 if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
868 netif_wake_queue(p->dev);
869
870 if (p->tx_ring) {
871 while ((int) p->tx_tail - (int) p->tx_head < 0) {
872 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
873 ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
874 DMA_TO_DEVICE);
875 dev_kfree_skb_any(tx_req->skb);
876 ++p->tx_tail;
877 }
878
879 kfree(p->tx_ring);
880 }
881
882 kfree(p);
883}
884
885static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
886 struct ib_cm_event *event)
887{
888 struct ipoib_cm_tx *tx = cm_id->context;
889 struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
890 struct net_device *dev = priv->dev;
891 struct ipoib_neigh *neigh;
892 unsigned long flags;
893 int ret;
894
895 switch (event->event) {
896 case IB_CM_DREQ_RECEIVED:
897 ipoib_dbg(priv, "DREQ received.\n");
898 ib_send_cm_drep(cm_id, NULL, 0);
899 break;
900 case IB_CM_REP_RECEIVED:
901 ipoib_dbg(priv, "REP received.\n");
902 ret = ipoib_cm_rep_handler(cm_id, event);
903 if (ret)
904 ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
905 NULL, 0, NULL, 0);
906 break;
907 case IB_CM_REQ_ERROR:
908 case IB_CM_REJ_RECEIVED:
909 case IB_CM_TIMEWAIT_EXIT:
910 ipoib_dbg(priv, "CM error %d.\n", event->event);
911 spin_lock_irqsave(&priv->tx_lock, flags);
912 spin_lock(&priv->lock);
913 neigh = tx->neigh;
914
915 if (neigh) {
916 neigh->cm = NULL;
917 list_del(&neigh->list);
918 if (neigh->ah)
919 ipoib_put_ah(neigh->ah);
920 ipoib_neigh_free(dev, neigh);
921
922 tx->neigh = NULL;
923 }
924
925 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
926 list_move(&tx->list, &priv->cm.reap_list);
927 queue_work(ipoib_workqueue, &priv->cm.reap_task);
928 }
929
930 spin_unlock(&priv->lock);
931 spin_unlock_irqrestore(&priv->tx_lock, flags);
932 break;
933 default:
934 break;
935 }
936
937 return 0;
938}
939
940struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
941 struct ipoib_neigh *neigh)
942{
943 struct ipoib_dev_priv *priv = netdev_priv(dev);
944 struct ipoib_cm_tx *tx;
945
946 tx = kzalloc(sizeof *tx, GFP_ATOMIC);
947 if (!tx)
948 return NULL;
949
950 neigh->cm = tx;
951 tx->neigh = neigh;
952 tx->path = path;
953 tx->dev = dev;
954 list_add(&tx->list, &priv->cm.start_list);
955 set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
956 queue_work(ipoib_workqueue, &priv->cm.start_task);
957 return tx;
958}
959
960void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
961{
962 struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
963 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
964 list_move(&tx->list, &priv->cm.reap_list);
965 queue_work(ipoib_workqueue, &priv->cm.reap_task);
966 ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n",
967 IPOIB_GID_ARG(tx->neigh->dgid));
968 tx->neigh = NULL;
969 }
970}
971
972static void ipoib_cm_tx_start(struct work_struct *work)
973{
974 struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
975 cm.start_task);
976 struct net_device *dev = priv->dev;
977 struct ipoib_neigh *neigh;
978 struct ipoib_cm_tx *p;
979 unsigned long flags;
980 int ret;
981
982 struct ib_sa_path_rec pathrec;
983 u32 qpn;
984
985 spin_lock_irqsave(&priv->tx_lock, flags);
986 spin_lock(&priv->lock);
987 while (!list_empty(&priv->cm.start_list)) {
988 p = list_entry(priv->cm.start_list.next, typeof(*p), list);
989 list_del_init(&p->list);
990 neigh = p->neigh;
991 qpn = IPOIB_QPN(neigh->neighbour->ha);
992 memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
993 spin_unlock(&priv->lock);
994 spin_unlock_irqrestore(&priv->tx_lock, flags);
995 ret = ipoib_cm_tx_init(p, qpn, &pathrec);
996 spin_lock_irqsave(&priv->tx_lock, flags);
997 spin_lock(&priv->lock);
998 if (ret) {
999 neigh = p->neigh;
1000 if (neigh) {
1001 neigh->cm = NULL;
1002 list_del(&neigh->list);
1003 if (neigh->ah)
1004 ipoib_put_ah(neigh->ah);
1005 ipoib_neigh_free(dev, neigh);
1006 }
1007 list_del(&p->list);
1008 kfree(p);
1009 }
1010 }
1011 spin_unlock(&priv->lock);
1012 spin_unlock_irqrestore(&priv->tx_lock, flags);
1013}
1014
1015static void ipoib_cm_tx_reap(struct work_struct *work)
1016{
1017 struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1018 cm.reap_task);
1019 struct ipoib_cm_tx *p;
1020 unsigned long flags;
1021
1022 spin_lock_irqsave(&priv->tx_lock, flags);
1023 spin_lock(&priv->lock);
1024 while (!list_empty(&priv->cm.reap_list)) {
1025 p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
1026 list_del(&p->list);
1027 spin_unlock(&priv->lock);
1028 spin_unlock_irqrestore(&priv->tx_lock, flags);
1029 ipoib_cm_tx_destroy(p);
1030 spin_lock_irqsave(&priv->tx_lock, flags);
1031 spin_lock(&priv->lock);
1032 }
1033 spin_unlock(&priv->lock);
1034 spin_unlock_irqrestore(&priv->tx_lock, flags);
1035}
1036
1037static void ipoib_cm_skb_reap(struct work_struct *work)
1038{
1039 struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1040 cm.skb_task);
1041 struct net_device *dev = priv->dev;
1042 struct sk_buff *skb;
1043 unsigned long flags;
1044
1045 unsigned mtu = priv->mcast_mtu;
1046
1047 spin_lock_irqsave(&priv->tx_lock, flags);
1048 spin_lock(&priv->lock);
1049 while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
1050 spin_unlock(&priv->lock);
1051 spin_unlock_irqrestore(&priv->tx_lock, flags);
1052 if (skb->protocol == htons(ETH_P_IP))
1053 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1054#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1055 else if (skb->protocol == htons(ETH_P_IPV6))
1056 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
1057#endif
1058 dev_kfree_skb_any(skb);
1059 spin_lock_irqsave(&priv->tx_lock, flags);
1060 spin_lock(&priv->lock);
1061 }
1062 spin_unlock(&priv->lock);
1063 spin_unlock_irqrestore(&priv->tx_lock, flags);
1064}
1065
1066void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
1067 unsigned int mtu)
1068{
1069 struct ipoib_dev_priv *priv = netdev_priv(dev);
1070 int e = skb_queue_empty(&priv->cm.skb_queue);
1071
1072 if (skb->dst)
1073 skb->dst->ops->update_pmtu(skb->dst, mtu);
1074
1075 skb_queue_tail(&priv->cm.skb_queue, skb);
1076 if (e)
1077 queue_work(ipoib_workqueue, &priv->cm.skb_task);
1078}
1079
1080static void ipoib_cm_stale_task(struct work_struct *work)
1081{
1082 struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1083 cm.stale_task.work);
1084 struct ipoib_cm_rx *p;
1085 unsigned long flags;
1086
1087 spin_lock_irqsave(&priv->lock, flags);
1088 while (!list_empty(&priv->cm.passive_ids)) {
1089 /* List if sorted by LRU, start from tail,
1090 * stop when we see a recently used entry */
1091 p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
1092 if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
1093 break;
1094 list_del_init(&p->list);
1095 spin_unlock_irqrestore(&priv->lock, flags);
1096 ib_destroy_cm_id(p->id);
1097 ib_destroy_qp(p->qp);
1098 kfree(p);
1099 spin_lock_irqsave(&priv->lock, flags);
1100 }
1101 spin_unlock_irqrestore(&priv->lock, flags);
1102}
1103
1104
1105static ssize_t show_mode(struct device *d, struct device_attribute *attr,
1106 char *buf)
1107{
1108 struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
1109
1110 if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
1111 return sprintf(buf, "connected\n");
1112 else
1113 return sprintf(buf, "datagram\n");
1114}
1115
1116static ssize_t set_mode(struct device *d, struct device_attribute *attr,
1117 const char *buf, size_t count)
1118{
1119 struct net_device *dev = to_net_dev(d);
1120 struct ipoib_dev_priv *priv = netdev_priv(dev);
1121
1122 /* flush paths if we switch modes so that connections are restarted */
1123 if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
1124 set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
1125 ipoib_warn(priv, "enabling connected mode "
1126 "will cause multicast packet drops\n");
1127 ipoib_flush_paths(dev);
1128 return count;
1129 }
1130
1131 if (!strcmp(buf, "datagram\n")) {
1132 clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
1133 dev->mtu = min(priv->mcast_mtu, dev->mtu);
1134 ipoib_flush_paths(dev);
1135 return count;
1136 }
1137
1138 return -EINVAL;
1139}
1140
1141static DEVICE_ATTR(mode, S_IWUGO | S_IRUGO, show_mode, set_mode);
1142
1143int ipoib_cm_add_mode_attr(struct net_device *dev)
1144{
1145 return device_create_file(&dev->dev, &dev_attr_mode);
1146}
1147
1148int ipoib_cm_dev_init(struct net_device *dev)
1149{
1150 struct ipoib_dev_priv *priv = netdev_priv(dev);
1151 struct ib_srq_init_attr srq_init_attr = {
1152 .attr = {
1153 .max_wr = ipoib_recvq_size,
1154 .max_sge = IPOIB_CM_RX_SG
1155 }
1156 };
1157 int ret, i;
1158
1159 INIT_LIST_HEAD(&priv->cm.passive_ids);
1160 INIT_LIST_HEAD(&priv->cm.reap_list);
1161 INIT_LIST_HEAD(&priv->cm.start_list);
1162 INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
1163 INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
1164 INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
1165 INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
1166
1167 skb_queue_head_init(&priv->cm.skb_queue);
1168
1169 priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
1170 if (IS_ERR(priv->cm.srq)) {
1171 ret = PTR_ERR(priv->cm.srq);
1172 priv->cm.srq = NULL;
1173 return ret;
1174 }
1175
1176 priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
1177 GFP_KERNEL);
1178 if (!priv->cm.srq_ring) {
1179 printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
1180 priv->ca->name, ipoib_recvq_size);
1181 ipoib_cm_dev_cleanup(dev);
1182 return -ENOMEM;
1183 }
1184
1185 for (i = 0; i < IPOIB_CM_RX_SG; ++i)
1186 priv->cm.rx_sge[i].lkey = priv->mr->lkey;
1187
1188 priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
1189 for (i = 1; i < IPOIB_CM_RX_SG; ++i)
1190 priv->cm.rx_sge[i].length = PAGE_SIZE;
1191 priv->cm.rx_wr.next = NULL;
1192 priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
1193 priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
1194
1195 for (i = 0; i < ipoib_recvq_size; ++i) {
1196 if (ipoib_cm_alloc_rx_skb(dev, i, priv->cm.srq_ring[i].mapping)) {
1197 ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
1198 ipoib_cm_dev_cleanup(dev);
1199 return -ENOMEM;
1200 }
1201 if (ipoib_cm_post_receive(dev, i)) {
1202 ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
1203 ipoib_cm_dev_cleanup(dev);
1204 return -EIO;
1205 }
1206 }
1207
1208 priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
1209 return 0;
1210}
1211
1212void ipoib_cm_dev_cleanup(struct net_device *dev)
1213{
1214 struct ipoib_dev_priv *priv = netdev_priv(dev);
1215 int i, ret;
1216
1217 if (!priv->cm.srq)
1218 return;
1219
1220 ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
1221
1222 ret = ib_destroy_srq(priv->cm.srq);
1223 if (ret)
1224 ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
1225
1226 priv->cm.srq = NULL;
1227 if (!priv->cm.srq_ring)
1228 return;
1229 for (i = 0; i < ipoib_recvq_size; ++i)
1230 if (priv->cm.srq_ring[i].skb) {
1231 ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[i].mapping);
1232 dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
1233 priv->cm.srq_ring[i].skb = NULL;
1234 }
1235 kfree(priv->cm.srq_ring);
1236 priv->cm.srq_ring = NULL;
1237}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 59d9594ed6d9..f2aa923ddbea 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
50 "Enable data path debug tracing if > 0"); 50 "Enable data path debug tracing if > 0");
51#endif 51#endif
52 52
53#define IPOIB_OP_RECV (1ul << 31)
54
55static DEFINE_MUTEX(pkey_mutex); 53static DEFINE_MUTEX(pkey_mutex);
56 54
57struct ipoib_ah *ipoib_create_ah(struct net_device *dev, 55struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
268 266
269 spin_lock_irqsave(&priv->tx_lock, flags); 267 spin_lock_irqsave(&priv->tx_lock, flags);
270 ++priv->tx_tail; 268 ++priv->tx_tail;
271 if (netif_queue_stopped(dev) && 269 if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
272 test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) && 270 priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
273 priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) 271 clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
274 netif_wake_queue(dev); 272 netif_wake_queue(dev);
273 }
275 spin_unlock_irqrestore(&priv->tx_lock, flags); 274 spin_unlock_irqrestore(&priv->tx_lock, flags);
276 275
277 if (wc->status != IB_WC_SUCCESS && 276 if (wc->status != IB_WC_SUCCESS &&
@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
283 282
284static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc) 283static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
285{ 284{
286 if (wc->wr_id & IPOIB_OP_RECV) 285 if (wc->wr_id & IPOIB_CM_OP_SRQ)
286 ipoib_cm_handle_rx_wc(dev, wc);
287 else if (wc->wr_id & IPOIB_OP_RECV)
287 ipoib_ib_handle_rx_wc(dev, wc); 288 ipoib_ib_handle_rx_wc(dev, wc);
288 else 289 else
289 ipoib_ib_handle_tx_wc(dev, wc); 290 ipoib_ib_handle_tx_wc(dev, wc);
@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
327 struct ipoib_tx_buf *tx_req; 328 struct ipoib_tx_buf *tx_req;
328 u64 addr; 329 u64 addr;
329 330
330 if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) { 331 if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) {
331 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", 332 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
332 skb->len, dev->mtu + INFINIBAND_ALEN); 333 skb->len, priv->mcast_mtu + INFINIBAND_ALEN);
333 ++priv->stats.tx_dropped; 334 ++priv->stats.tx_dropped;
334 ++priv->stats.tx_errors; 335 ++priv->stats.tx_errors;
335 dev_kfree_skb_any(skb); 336 ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
336 return; 337 return;
337 } 338 }
338 339
@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
372 if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { 373 if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
373 ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); 374 ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
374 netif_stop_queue(dev); 375 netif_stop_queue(dev);
376 set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
375 } 377 }
376 } 378 }
377} 379}
@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
424 return -1; 426 return -1;
425 } 427 }
426 428
429 ret = ipoib_cm_dev_open(dev);
430 if (ret) {
431 ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
432 ipoib_ib_dev_stop(dev);
433 return -1;
434 }
435
427 clear_bit(IPOIB_STOP_REAPER, &priv->flags); 436 clear_bit(IPOIB_STOP_REAPER, &priv->flags);
428 queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); 437 queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
429 438
@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)
509 518
510 clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); 519 clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
511 520
521 ipoib_cm_dev_stop(dev);
522
512 /* 523 /*
513 * Move our QP to the error state and then reinitialize in 524 * Move our QP to the error state and then reinitialize in
514 * when all work requests have completed or have been flushed. 525 * when all work requests have completed or have been flushed.
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index af5ee2ec4499..18d27fd352ad 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -49,8 +49,6 @@
49 49
50#include <net/dst.h> 50#include <net/dst.h>
51 51
52#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
53
54MODULE_AUTHOR("Roland Dreier"); 52MODULE_AUTHOR("Roland Dreier");
55MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 53MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
56MODULE_LICENSE("Dual BSD/GPL"); 54MODULE_LICENSE("Dual BSD/GPL");
@@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev)
145 143
146 netif_stop_queue(dev); 144 netif_stop_queue(dev);
147 145
146 clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
147
148 /* 148 /*
149 * Now flush workqueue to make sure a scheduled task doesn't 149 * Now flush workqueue to make sure a scheduled task doesn't
150 * bring our internal state back up. 150 * bring our internal state back up.
@@ -178,8 +178,18 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
178{ 178{
179 struct ipoib_dev_priv *priv = netdev_priv(dev); 179 struct ipoib_dev_priv *priv = netdev_priv(dev);
180 180
181 if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) 181 /* dev->mtu > 2K ==> connected mode */
182 if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
183 if (new_mtu > priv->mcast_mtu)
184 ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
185 priv->mcast_mtu);
186 dev->mtu = new_mtu;
187 return 0;
188 }
189
190 if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
182 return -EINVAL; 191 return -EINVAL;
192 }
183 193
184 priv->admin_mtu = new_mtu; 194 priv->admin_mtu = new_mtu;
185 195
@@ -414,6 +424,20 @@ static void path_rec_completion(int status,
414 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 424 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
415 sizeof(union ib_gid)); 425 sizeof(union ib_gid));
416 426
427 if (ipoib_cm_enabled(dev, neigh->neighbour)) {
428 if (!ipoib_cm_get(neigh))
429 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
430 path,
431 neigh));
432 if (!ipoib_cm_get(neigh)) {
433 list_del(&neigh->list);
434 if (neigh->ah)
435 ipoib_put_ah(neigh->ah);
436 ipoib_neigh_free(dev, neigh);
437 continue;
438 }
439 }
440
417 while ((skb = __skb_dequeue(&neigh->queue))) 441 while ((skb = __skb_dequeue(&neigh->queue)))
418 __skb_queue_tail(&skqueue, skb); 442 __skb_queue_tail(&skqueue, skb);
419 } 443 }
@@ -520,7 +544,25 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
520 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, 544 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
521 sizeof(union ib_gid)); 545 sizeof(union ib_gid));
522 546
523 ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); 547 if (ipoib_cm_enabled(dev, neigh->neighbour)) {
548 if (!ipoib_cm_get(neigh))
549 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
550 if (!ipoib_cm_get(neigh)) {
551 list_del(&neigh->list);
552 if (neigh->ah)
553 ipoib_put_ah(neigh->ah);
554 ipoib_neigh_free(dev, neigh);
555 goto err_drop;
556 }
557 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
558 __skb_queue_tail(&neigh->queue, skb);
559 else {
560 ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
561 skb_queue_len(&neigh->queue));
562 goto err_drop;
563 }
564 } else
565 ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
524 } else { 566 } else {
525 neigh->ah = NULL; 567 neigh->ah = NULL;
526 568
@@ -538,6 +580,7 @@ err_list:
538 580
539err_path: 581err_path:
540 ipoib_neigh_free(dev, neigh); 582 ipoib_neigh_free(dev, neigh);
583err_drop:
541 ++priv->stats.tx_dropped; 584 ++priv->stats.tx_dropped;
542 dev_kfree_skb_any(skb); 585 dev_kfree_skb_any(skb);
543 586
@@ -640,7 +683,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
640 683
641 neigh = *to_ipoib_neigh(skb->dst->neighbour); 684 neigh = *to_ipoib_neigh(skb->dst->neighbour);
642 685
643 if (likely(neigh->ah)) { 686 if (ipoib_cm_get(neigh)) {
687 if (ipoib_cm_up(neigh)) {
688 ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
689 goto out;
690 }
691 } else if (neigh->ah) {
644 if (unlikely(memcmp(&neigh->dgid.raw, 692 if (unlikely(memcmp(&neigh->dgid.raw,
645 skb->dst->neighbour->ha + 4, 693 skb->dst->neighbour->ha + 4,
646 sizeof(union ib_gid)))) { 694 sizeof(union ib_gid)))) {
@@ -805,6 +853,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
805 neigh->neighbour = neighbour; 853 neigh->neighbour = neighbour;
806 *to_ipoib_neigh(neighbour) = neigh; 854 *to_ipoib_neigh(neighbour) = neigh;
807 skb_queue_head_init(&neigh->queue); 855 skb_queue_head_init(&neigh->queue);
856 ipoib_cm_set(neigh, NULL);
808 857
809 return neigh; 858 return neigh;
810} 859}
@@ -818,6 +867,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
818 ++priv->stats.tx_dropped; 867 ++priv->stats.tx_dropped;
819 dev_kfree_skb_any(skb); 868 dev_kfree_skb_any(skb);
820 } 869 }
870 if (ipoib_cm_get(neigh))
871 ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
821 kfree(neigh); 872 kfree(neigh);
822} 873}
823 874
@@ -1080,6 +1131,8 @@ static struct net_device *ipoib_add_port(const char *format,
1080 1131
1081 ipoib_create_debug_files(priv->dev); 1132 ipoib_create_debug_files(priv->dev);
1082 1133
1134 if (ipoib_cm_add_mode_attr(priv->dev))
1135 goto sysfs_failed;
1083 if (ipoib_add_pkey_attr(priv->dev)) 1136 if (ipoib_add_pkey_attr(priv->dev))
1084 goto sysfs_failed; 1137 goto sysfs_failed;
1085 if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) 1138 if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index b04b72ca32ed..fea737f520fd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -597,7 +597,9 @@ void ipoib_mcast_join_task(struct work_struct *work)
597 597
598 priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) - 598 priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
599 IPOIB_ENCAP_LEN; 599 IPOIB_ENCAP_LEN;
600 dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); 600
601 if (!ipoib_cm_admin_enabled(dev))
602 dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
601 603
602 ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); 604 ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
603 605
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 7b717c648f72..3cb551b88756 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -168,35 +168,41 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
168 .qp_type = IB_QPT_UD 168 .qp_type = IB_QPT_UD
169 }; 169 };
170 170
171 int ret, size;
172
171 priv->pd = ib_alloc_pd(priv->ca); 173 priv->pd = ib_alloc_pd(priv->ca);
172 if (IS_ERR(priv->pd)) { 174 if (IS_ERR(priv->pd)) {
173 printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); 175 printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
174 return -ENODEV; 176 return -ENODEV;
175 } 177 }
176 178
177 priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, 179 priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
178 ipoib_sendq_size + ipoib_recvq_size + 1); 180 if (IS_ERR(priv->mr)) {
181 printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
182 goto out_free_pd;
183 }
184
185 size = ipoib_sendq_size + ipoib_recvq_size + 1;
186 ret = ipoib_cm_dev_init(dev);
187 if (!ret)
188 size += ipoib_recvq_size;
189
190 priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size);
179 if (IS_ERR(priv->cq)) { 191 if (IS_ERR(priv->cq)) {
180 printk(KERN_WARNING "%s: failed to create CQ\n", ca->name); 192 printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
181 goto out_free_pd; 193 goto out_free_mr;
182 } 194 }
183 195
184 if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP)) 196 if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
185 goto out_free_cq; 197 goto out_free_cq;
186 198
187 priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
188 if (IS_ERR(priv->mr)) {
189 printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
190 goto out_free_cq;
191 }
192
193 init_attr.send_cq = priv->cq; 199 init_attr.send_cq = priv->cq;
194 init_attr.recv_cq = priv->cq, 200 init_attr.recv_cq = priv->cq,
195 201
196 priv->qp = ib_create_qp(priv->pd, &init_attr); 202 priv->qp = ib_create_qp(priv->pd, &init_attr);
197 if (IS_ERR(priv->qp)) { 203 if (IS_ERR(priv->qp)) {
198 printk(KERN_WARNING "%s: failed to create QP\n", ca->name); 204 printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
199 goto out_free_mr; 205 goto out_free_cq;
200 } 206 }
201 207
202 priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; 208 priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
@@ -212,12 +218,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
212 218
213 return 0; 219 return 0;
214 220
215out_free_mr:
216 ib_dereg_mr(priv->mr);
217
218out_free_cq: 221out_free_cq:
219 ib_destroy_cq(priv->cq); 222 ib_destroy_cq(priv->cq);
220 223
224out_free_mr:
225 ib_dereg_mr(priv->mr);
226
221out_free_pd: 227out_free_pd:
222 ib_dealloc_pd(priv->pd); 228 ib_dealloc_pd(priv->pd);
223 return -ENODEV; 229 return -ENODEV;
@@ -235,12 +241,14 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
235 clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); 241 clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
236 } 242 }
237 243
238 if (ib_dereg_mr(priv->mr))
239 ipoib_warn(priv, "ib_dereg_mr failed\n");
240
241 if (ib_destroy_cq(priv->cq)) 244 if (ib_destroy_cq(priv->cq))
242 ipoib_warn(priv, "ib_cq_destroy failed\n"); 245 ipoib_warn(priv, "ib_cq_destroy failed\n");
243 246
247 ipoib_cm_dev_cleanup(dev);
248
249 if (ib_dereg_mr(priv->mr))
250 ipoib_warn(priv, "ib_dereg_mr failed\n");
251
244 if (ib_dealloc_pd(priv->pd)) 252 if (ib_dealloc_pd(priv->pd))
245 ipoib_warn(priv, "ib_dealloc_pd failed\n"); 253 ipoib_warn(priv, "ib_dealloc_pd failed\n");
246} 254}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 085eafe6667c..6762988439d1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -115,6 +115,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
115 115
116 ipoib_create_debug_files(priv->dev); 116 ipoib_create_debug_files(priv->dev);
117 117
118 if (ipoib_cm_add_mode_attr(priv->dev))
119 goto sysfs_failed;
118 if (ipoib_add_pkey_attr(priv->dev)) 120 if (ipoib_add_pkey_attr(priv->dev))
119 goto sysfs_failed; 121 goto sysfs_failed;
120 122