aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorShlomo Pongratz <shlomop@mellanox.com>2012-07-24 13:05:22 -0400
committerRoland Dreier <roland@purestorage.com>2012-07-30 10:46:50 -0400
commitb63b70d8774175b6f8393c495fe455f0fba55ce1 (patch)
tree8fe59ebd9e8d7b258e4e8727dd268d801236398d
parent5dedb9f3bd5bcb186313ea0c0cff8f2c525d4122 (diff)
IPoIB: Use a private hash table for path lookup in xmit path
Dave Miller <davem@davemloft.net> provided a detailed description of why the way IPoIB is using neighbours for its own ipoib_neigh struct is buggy: Any time an ipoib_neigh is changed, a sequence like the following is made: spin_lock_irqsave(&priv->lock, flags); /* * It's safe to call ipoib_put_ah() inside * priv->lock here, because we know that * path->ah will always hold one more reference, * so ipoib_put_ah() will never do more than * decrement the ref count. */ if (neigh->ah) ipoib_put_ah(neigh->ah); list_del(&neigh->list); ipoib_neigh_free(dev, neigh); spin_unlock_irqrestore(&priv->lock, flags); ipoib_path_lookup(skb, n, dev); This doesn't work, because you're leaving a stale pointer to the freed up ipoib_neigh in the special neigh->ha pointer cookie. Yes, it even fails with all the locking done to protect _changes_ to *ipoib_neigh(n), and with the code in ipoib_neigh_free() that NULLs out the pointer. The core issue is that read side calls to *to_ipoib_neigh(n) are not being synchronized at all, they are performed without any locking. So whether we hold the lock or not when making changes to *ipoib_neigh(n) you still can have threads see references to freed up ipoib_neigh objects. cpu 1 cpu 2 n = *ipoib_neigh() *ipoib_neigh() = NULL kfree(n) n->foo == OOPS [..] Perhaps the ipoib code can have a private path database it manages entirely itself, which holds all the necessary information and is looked up by some generic key which is available easily at transmit time and does not involve generic neighbour entries. See <http://marc.info/?l=linux-rdma&m=132812793105624&w=2> and <http://marc.info/?l=linux-rdma&w=2&r=1&s=allows+references+to+freed+memory&q=b> for the full discussion. This patch aims to solve the race conditions found in the IPoIB driver. The patch removes the connection between the core networking neighbour structure and the ipoib_neigh structure. 
In addition to avoiding the race described above, it allows us to handle SKBs carrying IP packets that don't have any associated neighbour. We add an ipoib_neigh hash table with N buckets where the key is the destination hardware address. The ipoib_neigh is fetched from the hash table instead of from the stashed location in the neighbour structure. The hash table uses both RCU and reference counting to guarantee that no ipoib_neigh instance is ever deleted while in use. Fetching the ipoib_neigh structure instance from the hash also makes the special code in ipoib_start_xmit that handles remote and local bonding failover redundant. Aged ipoib_neigh instances are deleted by a garbage collection task that runs every M seconds and deletes every ipoib_neigh instance that was idle for at least 2*M seconds. The deletion is safe since the ipoib_neigh instances are protected using RCU and reference count mechanisms. The number of buckets (N) and frequency of running the GC thread (M), are taken from the exported arp_tbl. Signed-off-by: Shlomo Pongratz <shlomop@mellanox.com> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: Roland Dreier <roland@purestorage.com>
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h56
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c16
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c646
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c57
4 files changed, 539 insertions, 236 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 86df632ea612..ca43901ed861 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -92,6 +92,8 @@ enum {
92 IPOIB_STOP_REAPER = 7, 92 IPOIB_STOP_REAPER = 7,
93 IPOIB_FLAG_ADMIN_CM = 9, 93 IPOIB_FLAG_ADMIN_CM = 9,
94 IPOIB_FLAG_UMCAST = 10, 94 IPOIB_FLAG_UMCAST = 10,
95 IPOIB_STOP_NEIGH_GC = 11,
96 IPOIB_NEIGH_TBL_FLUSH = 12,
95 97
96 IPOIB_MAX_BACKOFF_SECONDS = 16, 98 IPOIB_MAX_BACKOFF_SECONDS = 16,
97 99
@@ -260,6 +262,20 @@ struct ipoib_ethtool_st {
260 u16 max_coalesced_frames; 262 u16 max_coalesced_frames;
261}; 263};
262 264
265struct ipoib_neigh_hash {
266 struct ipoib_neigh __rcu **buckets;
267 struct rcu_head rcu;
268 u32 mask;
269 u32 size;
270};
271
272struct ipoib_neigh_table {
273 struct ipoib_neigh_hash __rcu *htbl;
274 rwlock_t rwlock;
275 atomic_t entries;
276 struct completion flushed;
277};
278
263/* 279/*
264 * Device private locking: network stack tx_lock protects members used 280 * Device private locking: network stack tx_lock protects members used
265 * in TX fast path, lock protects everything else. lock nests inside 281 * in TX fast path, lock protects everything else. lock nests inside
@@ -279,6 +295,8 @@ struct ipoib_dev_priv {
279 struct rb_root path_tree; 295 struct rb_root path_tree;
280 struct list_head path_list; 296 struct list_head path_list;
281 297
298 struct ipoib_neigh_table ntbl;
299
282 struct ipoib_mcast *broadcast; 300 struct ipoib_mcast *broadcast;
283 struct list_head multicast_list; 301 struct list_head multicast_list;
284 struct rb_root multicast_tree; 302 struct rb_root multicast_tree;
@@ -291,7 +309,7 @@ struct ipoib_dev_priv {
291 struct work_struct flush_heavy; 309 struct work_struct flush_heavy;
292 struct work_struct restart_task; 310 struct work_struct restart_task;
293 struct delayed_work ah_reap_task; 311 struct delayed_work ah_reap_task;
294 312 struct delayed_work neigh_reap_task;
295 struct ib_device *ca; 313 struct ib_device *ca;
296 u8 port; 314 u8 port;
297 u16 pkey; 315 u16 pkey;
@@ -377,13 +395,16 @@ struct ipoib_neigh {
377#ifdef CONFIG_INFINIBAND_IPOIB_CM 395#ifdef CONFIG_INFINIBAND_IPOIB_CM
378 struct ipoib_cm_tx *cm; 396 struct ipoib_cm_tx *cm;
379#endif 397#endif
380 union ib_gid dgid; 398 u8 daddr[INFINIBAND_ALEN];
381 struct sk_buff_head queue; 399 struct sk_buff_head queue;
382 400
383 struct neighbour *neighbour;
384 struct net_device *dev; 401 struct net_device *dev;
385 402
386 struct list_head list; 403 struct list_head list;
404 struct ipoib_neigh __rcu *hnext;
405 struct rcu_head rcu;
406 atomic_t refcnt;
407 unsigned long alive;
387}; 408};
388 409
389#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) 410#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN)
@@ -394,21 +415,17 @@ static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
394 return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; 415 return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
395} 416}
396 417
397/* 418void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
398 * We stash a pointer to our private neighbour information after our 419static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
399 * hardware address in neigh->ha. The ALIGN() expression here makes
400 * sure that this pointer is stored aligned so that an unaligned
401 * load is not needed to dereference it.
402 */
403static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
404{ 420{
405 return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) + 421 if (atomic_dec_and_test(&neigh->refcnt))
406 INFINIBAND_ALEN, sizeof(void *)); 422 ipoib_neigh_dtor(neigh);
407} 423}
408 424struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr);
409struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, 425struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
410 struct net_device *dev); 426 struct net_device *dev);
411void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); 427void ipoib_neigh_free(struct ipoib_neigh *neigh);
428void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid);
412 429
413extern struct workqueue_struct *ipoib_workqueue; 430extern struct workqueue_struct *ipoib_workqueue;
414 431
@@ -425,7 +442,6 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah)
425{ 442{
426 kref_put(&ah->ref, ipoib_free_ah); 443 kref_put(&ah->ref, ipoib_free_ah);
427} 444}
428
429int ipoib_open(struct net_device *dev); 445int ipoib_open(struct net_device *dev);
430int ipoib_add_pkey_attr(struct net_device *dev); 446int ipoib_add_pkey_attr(struct net_device *dev);
431int ipoib_add_umcast_attr(struct net_device *dev); 447int ipoib_add_umcast_attr(struct net_device *dev);
@@ -455,7 +471,7 @@ void ipoib_dev_cleanup(struct net_device *dev);
455 471
456void ipoib_mcast_join_task(struct work_struct *work); 472void ipoib_mcast_join_task(struct work_struct *work);
457void ipoib_mcast_carrier_on_task(struct work_struct *work); 473void ipoib_mcast_carrier_on_task(struct work_struct *work);
458void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); 474void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
459 475
460void ipoib_mcast_restart_task(struct work_struct *work); 476void ipoib_mcast_restart_task(struct work_struct *work);
461int ipoib_mcast_start_thread(struct net_device *dev); 477int ipoib_mcast_start_thread(struct net_device *dev);
@@ -517,10 +533,10 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev)
517 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); 533 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
518} 534}
519 535
520static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) 536static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
521{ 537{
522 struct ipoib_dev_priv *priv = netdev_priv(dev); 538 struct ipoib_dev_priv *priv = netdev_priv(dev);
523 return IPOIB_CM_SUPPORTED(n->ha) && 539 return IPOIB_CM_SUPPORTED(hwaddr) &&
524 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); 540 test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
525} 541}
526 542
@@ -575,7 +591,7 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev)
575{ 591{
576 return 0; 592 return 0;
577} 593}
578static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) 594static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
579 595
580{ 596{
581 return 0; 597 return 0;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 6d66ab0dd92a..95ecf4eadf5f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -811,9 +811,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
811 if (neigh) { 811 if (neigh) {
812 neigh->cm = NULL; 812 neigh->cm = NULL;
813 list_del(&neigh->list); 813 list_del(&neigh->list);
814 if (neigh->ah) 814 ipoib_neigh_free(neigh);
815 ipoib_put_ah(neigh->ah);
816 ipoib_neigh_free(dev, neigh);
817 815
818 tx->neigh = NULL; 816 tx->neigh = NULL;
819 } 817 }
@@ -1230,9 +1228,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
1230 if (neigh) { 1228 if (neigh) {
1231 neigh->cm = NULL; 1229 neigh->cm = NULL;
1232 list_del(&neigh->list); 1230 list_del(&neigh->list);
1233 if (neigh->ah) 1231 ipoib_neigh_free(neigh);
1234 ipoib_put_ah(neigh->ah);
1235 ipoib_neigh_free(dev, neigh);
1236 1232
1237 tx->neigh = NULL; 1233 tx->neigh = NULL;
1238 } 1234 }
@@ -1279,7 +1275,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
1279 list_move(&tx->list, &priv->cm.reap_list); 1275 list_move(&tx->list, &priv->cm.reap_list);
1280 queue_work(ipoib_workqueue, &priv->cm.reap_task); 1276 queue_work(ipoib_workqueue, &priv->cm.reap_task);
1281 ipoib_dbg(priv, "Reap connection for gid %pI6\n", 1277 ipoib_dbg(priv, "Reap connection for gid %pI6\n",
1282 tx->neigh->dgid.raw); 1278 tx->neigh->daddr + 4);
1283 tx->neigh = NULL; 1279 tx->neigh = NULL;
1284 } 1280 }
1285} 1281}
@@ -1304,7 +1300,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)
1304 p = list_entry(priv->cm.start_list.next, typeof(*p), list); 1300 p = list_entry(priv->cm.start_list.next, typeof(*p), list);
1305 list_del_init(&p->list); 1301 list_del_init(&p->list);
1306 neigh = p->neigh; 1302 neigh = p->neigh;
1307 qpn = IPOIB_QPN(neigh->neighbour->ha); 1303 qpn = IPOIB_QPN(neigh->daddr);
1308 memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); 1304 memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
1309 1305
1310 spin_unlock_irqrestore(&priv->lock, flags); 1306 spin_unlock_irqrestore(&priv->lock, flags);
@@ -1320,9 +1316,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)
1320 if (neigh) { 1316 if (neigh) {
1321 neigh->cm = NULL; 1317 neigh->cm = NULL;
1322 list_del(&neigh->list); 1318 list_del(&neigh->list);
1323 if (neigh->ah) 1319 ipoib_neigh_free(neigh);
1324 ipoib_put_ah(neigh->ah);
1325 ipoib_neigh_free(dev, neigh);
1326 } 1320 }
1327 list_del(&p->list); 1321 list_del(&p->list);
1328 kfree(p); 1322 kfree(p);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index bbee4b2d7a13..97920b77a5d0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -46,7 +46,8 @@
46#include <linux/ip.h> 46#include <linux/ip.h>
47#include <linux/in.h> 47#include <linux/in.h>
48 48
49#include <net/dst.h> 49#include <linux/jhash.h>
50#include <net/arp.h>
50 51
51MODULE_AUTHOR("Roland Dreier"); 52MODULE_AUTHOR("Roland Dreier");
52MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 53MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
@@ -84,6 +85,7 @@ struct ib_sa_client ipoib_sa_client;
84 85
85static void ipoib_add_one(struct ib_device *device); 86static void ipoib_add_one(struct ib_device *device);
86static void ipoib_remove_one(struct ib_device *device); 87static void ipoib_remove_one(struct ib_device *device);
88static void ipoib_neigh_reclaim(struct rcu_head *rp);
87 89
88static struct ib_client ipoib_client = { 90static struct ib_client ipoib_client = {
89 .name = "ipoib", 91 .name = "ipoib",
@@ -264,30 +266,15 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path)
264 266
265static void path_free(struct net_device *dev, struct ipoib_path *path) 267static void path_free(struct net_device *dev, struct ipoib_path *path)
266{ 268{
267 struct ipoib_dev_priv *priv = netdev_priv(dev);
268 struct ipoib_neigh *neigh, *tn;
269 struct sk_buff *skb; 269 struct sk_buff *skb;
270 unsigned long flags;
271 270
272 while ((skb = __skb_dequeue(&path->queue))) 271 while ((skb = __skb_dequeue(&path->queue)))
273 dev_kfree_skb_irq(skb); 272 dev_kfree_skb_irq(skb);
274 273
275 spin_lock_irqsave(&priv->lock, flags); 274 ipoib_dbg(netdev_priv(dev), "path_free\n");
276
277 list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
278 /*
279 * It's safe to call ipoib_put_ah() inside priv->lock
280 * here, because we know that path->ah will always
281 * hold one more reference, so ipoib_put_ah() will
282 * never do more than decrement the ref count.
283 */
284 if (neigh->ah)
285 ipoib_put_ah(neigh->ah);
286
287 ipoib_neigh_free(dev, neigh);
288 }
289 275
290 spin_unlock_irqrestore(&priv->lock, flags); 276 /* remove all neigh connected to this path */
277 ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
291 278
292 if (path->ah) 279 if (path->ah)
293 ipoib_put_ah(path->ah); 280 ipoib_put_ah(path->ah);
@@ -458,19 +445,15 @@ static void path_rec_completion(int status,
458 } 445 }
459 kref_get(&path->ah->ref); 446 kref_get(&path->ah->ref);
460 neigh->ah = path->ah; 447 neigh->ah = path->ah;
461 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
462 sizeof(union ib_gid));
463 448
464 if (ipoib_cm_enabled(dev, neigh->neighbour)) { 449 if (ipoib_cm_enabled(dev, neigh->daddr)) {
465 if (!ipoib_cm_get(neigh)) 450 if (!ipoib_cm_get(neigh))
466 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, 451 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
467 path, 452 path,
468 neigh)); 453 neigh));
469 if (!ipoib_cm_get(neigh)) { 454 if (!ipoib_cm_get(neigh)) {
470 list_del(&neigh->list); 455 list_del(&neigh->list);
471 if (neigh->ah) 456 ipoib_neigh_free(neigh);
472 ipoib_put_ah(neigh->ah);
473 ipoib_neigh_free(dev, neigh);
474 continue; 457 continue;
475 } 458 }
476 } 459 }
@@ -555,15 +538,15 @@ static int path_rec_start(struct net_device *dev,
555 return 0; 538 return 0;
556} 539}
557 540
558/* called with rcu_read_lock */ 541static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
559static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_device *dev) 542 struct net_device *dev)
560{ 543{
561 struct ipoib_dev_priv *priv = netdev_priv(dev); 544 struct ipoib_dev_priv *priv = netdev_priv(dev);
562 struct ipoib_path *path; 545 struct ipoib_path *path;
563 struct ipoib_neigh *neigh; 546 struct ipoib_neigh *neigh;
564 unsigned long flags; 547 unsigned long flags;
565 548
566 neigh = ipoib_neigh_alloc(n, skb->dev); 549 neigh = ipoib_neigh_alloc(daddr, dev);
567 if (!neigh) { 550 if (!neigh) {
568 ++dev->stats.tx_dropped; 551 ++dev->stats.tx_dropped;
569 dev_kfree_skb_any(skb); 552 dev_kfree_skb_any(skb);
@@ -572,9 +555,9 @@ static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_
572 555
573 spin_lock_irqsave(&priv->lock, flags); 556 spin_lock_irqsave(&priv->lock, flags);
574 557
575 path = __path_find(dev, n->ha + 4); 558 path = __path_find(dev, daddr + 4);
576 if (!path) { 559 if (!path) {
577 path = path_rec_create(dev, n->ha + 4); 560 path = path_rec_create(dev, daddr + 4);
578 if (!path) 561 if (!path)
579 goto err_path; 562 goto err_path;
580 563
@@ -586,17 +569,13 @@ static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_
586 if (path->ah) { 569 if (path->ah) {
587 kref_get(&path->ah->ref); 570 kref_get(&path->ah->ref);
588 neigh->ah = path->ah; 571 neigh->ah = path->ah;
589 memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
590 sizeof(union ib_gid));
591 572
592 if (ipoib_cm_enabled(dev, neigh->neighbour)) { 573 if (ipoib_cm_enabled(dev, neigh->daddr)) {
593 if (!ipoib_cm_get(neigh)) 574 if (!ipoib_cm_get(neigh))
594 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); 575 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
595 if (!ipoib_cm_get(neigh)) { 576 if (!ipoib_cm_get(neigh)) {
596 list_del(&neigh->list); 577 list_del(&neigh->list);
597 if (neigh->ah) 578 ipoib_neigh_free(neigh);
598 ipoib_put_ah(neigh->ah);
599 ipoib_neigh_free(dev, neigh);
600 goto err_drop; 579 goto err_drop;
601 } 580 }
602 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) 581 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
@@ -608,7 +587,8 @@ static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_
608 } 587 }
609 } else { 588 } else {
610 spin_unlock_irqrestore(&priv->lock, flags); 589 spin_unlock_irqrestore(&priv->lock, flags);
611 ipoib_send(dev, skb, path->ah, IPOIB_QPN(n->ha)); 590 ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
591 ipoib_neigh_put(neigh);
612 return; 592 return;
613 } 593 }
614 } else { 594 } else {
@@ -621,35 +601,20 @@ static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_
621 } 601 }
622 602
623 spin_unlock_irqrestore(&priv->lock, flags); 603 spin_unlock_irqrestore(&priv->lock, flags);
604 ipoib_neigh_put(neigh);
624 return; 605 return;
625 606
626err_list: 607err_list:
627 list_del(&neigh->list); 608 list_del(&neigh->list);
628 609
629err_path: 610err_path:
630 ipoib_neigh_free(dev, neigh); 611 ipoib_neigh_free(neigh);
631err_drop: 612err_drop:
632 ++dev->stats.tx_dropped; 613 ++dev->stats.tx_dropped;
633 dev_kfree_skb_any(skb); 614 dev_kfree_skb_any(skb);
634 615
635 spin_unlock_irqrestore(&priv->lock, flags); 616 spin_unlock_irqrestore(&priv->lock, flags);
636} 617 ipoib_neigh_put(neigh);
637
638/* called with rcu_read_lock */
639static void ipoib_path_lookup(struct sk_buff *skb, struct neighbour *n, struct net_device *dev)
640{
641 struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
642
643 /* Look up path record for unicasts */
644 if (n->ha[4] != 0xff) {
645 neigh_add_path(skb, n, dev);
646 return;
647 }
648
649 /* Add in the P_Key for multicasts */
650 n->ha[8] = (priv->pkey >> 8) & 0xff;
651 n->ha[9] = priv->pkey & 0xff;
652 ipoib_mcast_send(dev, n->ha + 4, skb);
653} 618}
654 619
655static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, 620static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
@@ -710,96 +675,80 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
710{ 675{
711 struct ipoib_dev_priv *priv = netdev_priv(dev); 676 struct ipoib_dev_priv *priv = netdev_priv(dev);
712 struct ipoib_neigh *neigh; 677 struct ipoib_neigh *neigh;
713 struct neighbour *n = NULL; 678 struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
679 struct ipoib_header *header;
714 unsigned long flags; 680 unsigned long flags;
715 681
716 rcu_read_lock(); 682 header = (struct ipoib_header *) skb->data;
717 if (likely(skb_dst(skb))) { 683
718 n = dst_neigh_lookup_skb(skb_dst(skb), skb); 684 if (unlikely(cb->hwaddr[4] == 0xff)) {
719 if (!n) { 685 /* multicast, arrange "if" according to probability */
686 if ((header->proto != htons(ETH_P_IP)) &&
687 (header->proto != htons(ETH_P_IPV6)) &&
688 (header->proto != htons(ETH_P_ARP)) &&
689 (header->proto != htons(ETH_P_RARP))) {
690 /* ethertype not supported by IPoIB */
720 ++dev->stats.tx_dropped; 691 ++dev->stats.tx_dropped;
721 dev_kfree_skb_any(skb); 692 dev_kfree_skb_any(skb);
722 goto unlock; 693 return NETDEV_TX_OK;
723 } 694 }
695 /* Add in the P_Key for multicast*/
696 cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
697 cb->hwaddr[9] = priv->pkey & 0xff;
698
699 neigh = ipoib_neigh_get(dev, cb->hwaddr);
700 if (likely(neigh))
701 goto send_using_neigh;
702 ipoib_mcast_send(dev, cb->hwaddr, skb);
703 return NETDEV_TX_OK;
724 } 704 }
725 if (likely(n)) {
726 if (unlikely(!*to_ipoib_neigh(n))) {
727 ipoib_path_lookup(skb, n, dev);
728 goto unlock;
729 }
730
731 neigh = *to_ipoib_neigh(n);
732 705
733 if (unlikely((memcmp(&neigh->dgid.raw, 706 /* unicast, arrange "switch" according to probability */
734 n->ha + 4, 707 switch (header->proto) {
735 sizeof(union ib_gid))) || 708 case htons(ETH_P_IP):
736 (neigh->dev != dev))) { 709 case htons(ETH_P_IPV6):
737 spin_lock_irqsave(&priv->lock, flags); 710 neigh = ipoib_neigh_get(dev, cb->hwaddr);
738 /* 711 if (unlikely(!neigh)) {
739 * It's safe to call ipoib_put_ah() inside 712 neigh_add_path(skb, cb->hwaddr, dev);
740 * priv->lock here, because we know that 713 return NETDEV_TX_OK;
741 * path->ah will always hold one more reference,
742 * so ipoib_put_ah() will never do more than
743 * decrement the ref count.
744 */
745 if (neigh->ah)
746 ipoib_put_ah(neigh->ah);
747 list_del(&neigh->list);
748 ipoib_neigh_free(dev, neigh);
749 spin_unlock_irqrestore(&priv->lock, flags);
750 ipoib_path_lookup(skb, n, dev);
751 goto unlock;
752 } 714 }
715 break;
716 case htons(ETH_P_ARP):
717 case htons(ETH_P_RARP):
718 /* for unicast ARP and RARP should always perform path find */
719 unicast_arp_send(skb, dev, cb);
720 return NETDEV_TX_OK;
721 default:
722 /* ethertype not supported by IPoIB */
723 ++dev->stats.tx_dropped;
724 dev_kfree_skb_any(skb);
725 return NETDEV_TX_OK;
726 }
753 727
754 if (ipoib_cm_get(neigh)) { 728send_using_neigh:
755 if (ipoib_cm_up(neigh)) { 729 /* note we now hold a ref to neigh */
756 ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); 730 if (ipoib_cm_get(neigh)) {
757 goto unlock; 731 if (ipoib_cm_up(neigh)) {
758 } 732 ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
759 } else if (neigh->ah) { 733 goto unref;
760 ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(n->ha));
761 goto unlock;
762 } 734 }
735 } else if (neigh->ah) {
736 ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
737 goto unref;
738 }
763 739
764 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 740 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
765 spin_lock_irqsave(&priv->lock, flags); 741 spin_lock_irqsave(&priv->lock, flags);
766 __skb_queue_tail(&neigh->queue, skb); 742 __skb_queue_tail(&neigh->queue, skb);
767 spin_unlock_irqrestore(&priv->lock, flags); 743 spin_unlock_irqrestore(&priv->lock, flags);
768 } else {
769 ++dev->stats.tx_dropped;
770 dev_kfree_skb_any(skb);
771 }
772 } else { 744 } else {
773 struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; 745 ++dev->stats.tx_dropped;
774 746 dev_kfree_skb_any(skb);
775 if (cb->hwaddr[4] == 0xff) { 747 }
776 /* Add in the P_Key for multicast*/
777 cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
778 cb->hwaddr[9] = priv->pkey & 0xff;
779 748
780 ipoib_mcast_send(dev, cb->hwaddr + 4, skb); 749unref:
781 } else { 750 ipoib_neigh_put(neigh);
782 /* unicast GID -- should be ARP or RARP reply */
783
784 if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
785 (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
786 ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n",
787 skb_dst(skb) ? "neigh" : "dst",
788 be16_to_cpup((__be16 *) skb->data),
789 IPOIB_QPN(cb->hwaddr),
790 cb->hwaddr + 4);
791 dev_kfree_skb_any(skb);
792 ++dev->stats.tx_dropped;
793 goto unlock;
794 }
795 751
796 unicast_arp_send(skb, dev, cb);
797 }
798 }
799unlock:
800 if (n)
801 neigh_release(n);
802 rcu_read_unlock();
803 return NETDEV_TX_OK; 752 return NETDEV_TX_OK;
804} 753}
805 754
@@ -821,6 +770,7 @@ static int ipoib_hard_header(struct sk_buff *skb,
821 const void *daddr, const void *saddr, unsigned len) 770 const void *daddr, const void *saddr, unsigned len)
822{ 771{
823 struct ipoib_header *header; 772 struct ipoib_header *header;
773 struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
824 774
825 header = (struct ipoib_header *) skb_push(skb, sizeof *header); 775 header = (struct ipoib_header *) skb_push(skb, sizeof *header);
826 776
@@ -828,14 +778,11 @@ static int ipoib_hard_header(struct sk_buff *skb,
828 header->reserved = 0; 778 header->reserved = 0;
829 779
830 /* 780 /*
831 * If we don't have a dst_entry structure, stuff the 781 * we don't rely on dst_entry structure, always stuff the
832 * destination address into skb->cb so we can figure out where 782 * destination address into skb->cb so we can figure out where
833 * to send the packet later. 783 * to send the packet later.
834 */ 784 */
835 if (!skb_dst(skb)) { 785 memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
836 struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
837 memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
838 }
839 786
840 return 0; 787 return 0;
841} 788}
@@ -852,86 +799,438 @@ static void ipoib_set_mcast_list(struct net_device *dev)
852 queue_work(ipoib_workqueue, &priv->restart_task); 799 queue_work(ipoib_workqueue, &priv->restart_task);
853} 800}
854 801
855static void ipoib_neigh_cleanup(struct neighbour *n) 802static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
856{ 803{
857 struct ipoib_neigh *neigh; 804 /*
858 struct ipoib_dev_priv *priv = netdev_priv(n->dev); 805 * Use only the address parts that contributes to spreading
806 * The subnet prefix is not used as one can not connect to
807 * same remote port (GUID) using the same remote QPN via two
808 * different subnets.
809 */
810 /* qpn octets[1:4) & port GUID octets[12:20) */
811 u32 *daddr_32 = (u32 *) daddr;
812 u32 hv;
813
814 hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0);
815 return hv & htbl->mask;
816}
817
818struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
819{
820 struct ipoib_dev_priv *priv = netdev_priv(dev);
821 struct ipoib_neigh_table *ntbl = &priv->ntbl;
822 struct ipoib_neigh_hash *htbl;
823 struct ipoib_neigh *neigh = NULL;
824 u32 hash_val;
825
826 rcu_read_lock_bh();
827
828 htbl = rcu_dereference_bh(ntbl->htbl);
829
830 if (!htbl)
831 goto out_unlock;
832
833 hash_val = ipoib_addr_hash(htbl, daddr);
834 for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
835 neigh != NULL;
836 neigh = rcu_dereference_bh(neigh->hnext)) {
837 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
838 /* found, take one ref on behalf of the caller */
839 if (!atomic_inc_not_zero(&neigh->refcnt)) {
840 /* deleted */
841 neigh = NULL;
842 goto out_unlock;
843 }
844 neigh->alive = jiffies;
845 goto out_unlock;
846 }
847 }
848
849out_unlock:
850 rcu_read_unlock_bh();
851 return neigh;
852}
853
854static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
855{
856 struct ipoib_neigh_table *ntbl = &priv->ntbl;
857 struct ipoib_neigh_hash *htbl;
858 unsigned long neigh_obsolete;
859 unsigned long dt;
859 unsigned long flags; 860 unsigned long flags;
860 struct ipoib_ah *ah = NULL; 861 int i;
861 862
862 neigh = *to_ipoib_neigh(n); 863 if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
863 if (neigh)
864 priv = netdev_priv(neigh->dev);
865 else
866 return; 864 return;
867 ipoib_dbg(priv,
868 "neigh_cleanup for %06x %pI6\n",
869 IPOIB_QPN(n->ha),
870 n->ha + 4);
871 865
872 spin_lock_irqsave(&priv->lock, flags); 866 write_lock_bh(&ntbl->rwlock);
867
868 htbl = rcu_dereference_protected(ntbl->htbl,
869 lockdep_is_held(&ntbl->rwlock));
870
871 if (!htbl)
872 goto out_unlock;
873
874 /* neigh is obsolete if it was idle for two GC periods */
875 dt = 2 * arp_tbl.gc_interval;
876 neigh_obsolete = jiffies - dt;
877 /* handle possible race condition */
878 if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
879 goto out_unlock;
880
881 for (i = 0; i < htbl->size; i++) {
882 struct ipoib_neigh *neigh;
883 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
884
885 while ((neigh = rcu_dereference_protected(*np,
886 lockdep_is_held(&ntbl->rwlock))) != NULL) {
887 /* was the neigh idle for two GC periods */
888 if (time_after(neigh_obsolete, neigh->alive)) {
889 rcu_assign_pointer(*np,
890 rcu_dereference_protected(neigh->hnext,
891 lockdep_is_held(&ntbl->rwlock)));
892 /* remove from path/mc list */
893 spin_lock_irqsave(&priv->lock, flags);
894 list_del(&neigh->list);
895 spin_unlock_irqrestore(&priv->lock, flags);
896 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
897 } else {
898 np = &neigh->hnext;
899 }
873 900
874 if (neigh->ah) 901 }
875 ah = neigh->ah; 902 }
876 list_del(&neigh->list);
877 ipoib_neigh_free(n->dev, neigh);
878 903
879 spin_unlock_irqrestore(&priv->lock, flags); 904out_unlock:
905 write_unlock_bh(&ntbl->rwlock);
906}
880 907
881 if (ah) 908static void ipoib_reap_neigh(struct work_struct *work)
882 ipoib_put_ah(ah); 909{
910 struct ipoib_dev_priv *priv =
911 container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
912
913 __ipoib_reap_neigh(priv);
914
915 if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
916 queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
917 arp_tbl.gc_interval);
883} 918}
884 919
885struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, 920
921static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
886 struct net_device *dev) 922 struct net_device *dev)
887{ 923{
888 struct ipoib_neigh *neigh; 924 struct ipoib_neigh *neigh;
889 925
890 neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); 926 neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
891 if (!neigh) 927 if (!neigh)
892 return NULL; 928 return NULL;
893 929
894 neigh->neighbour = neighbour;
895 neigh->dev = dev; 930 neigh->dev = dev;
896 memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); 931 memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
897 *to_ipoib_neigh(neighbour) = neigh;
898 skb_queue_head_init(&neigh->queue); 932 skb_queue_head_init(&neigh->queue);
933 INIT_LIST_HEAD(&neigh->list);
899 ipoib_cm_set(neigh, NULL); 934 ipoib_cm_set(neigh, NULL);
935 /* one ref on behalf of the caller */
936 atomic_set(&neigh->refcnt, 1);
937
938 return neigh;
939}
940
941struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
942 struct net_device *dev)
943{
944 struct ipoib_dev_priv *priv = netdev_priv(dev);
945 struct ipoib_neigh_table *ntbl = &priv->ntbl;
946 struct ipoib_neigh_hash *htbl;
947 struct ipoib_neigh *neigh;
948 u32 hash_val;
949
950 write_lock_bh(&ntbl->rwlock);
951
952 htbl = rcu_dereference_protected(ntbl->htbl,
953 lockdep_is_held(&ntbl->rwlock));
954 if (!htbl) {
955 neigh = NULL;
956 goto out_unlock;
957 }
958
959 /* need to add a new neigh, but maybe some other thread succeeded?
960 * recalc hash, maybe hash resize took place so we do a search
961 */
962 hash_val = ipoib_addr_hash(htbl, daddr);
963 for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
964 lockdep_is_held(&ntbl->rwlock));
965 neigh != NULL;
966 neigh = rcu_dereference_protected(neigh->hnext,
967 lockdep_is_held(&ntbl->rwlock))) {
968 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
969 /* found, take one ref on behalf of the caller */
970 if (!atomic_inc_not_zero(&neigh->refcnt)) {
971 /* deleted */
972 neigh = NULL;
973 break;
974 }
975 neigh->alive = jiffies;
976 goto out_unlock;
977 }
978 }
979
980 neigh = ipoib_neigh_ctor(daddr, dev);
981 if (!neigh)
982 goto out_unlock;
983
984 /* one ref on behalf of the hash table */
985 atomic_inc(&neigh->refcnt);
986 neigh->alive = jiffies;
987 /* put in hash */
988 rcu_assign_pointer(neigh->hnext,
989 rcu_dereference_protected(htbl->buckets[hash_val],
990 lockdep_is_held(&ntbl->rwlock)));
991 rcu_assign_pointer(htbl->buckets[hash_val], neigh);
992 atomic_inc(&ntbl->entries);
993
994out_unlock:
995 write_unlock_bh(&ntbl->rwlock);
900 996
901 return neigh; 997 return neigh;
902} 998}
903 999
904void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) 1000void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
905{ 1001{
1002 /* neigh reference count was dropprd to zero */
1003 struct net_device *dev = neigh->dev;
1004 struct ipoib_dev_priv *priv = netdev_priv(dev);
906 struct sk_buff *skb; 1005 struct sk_buff *skb;
907 *to_ipoib_neigh(neigh->neighbour) = NULL; 1006 if (neigh->ah)
1007 ipoib_put_ah(neigh->ah);
908 while ((skb = __skb_dequeue(&neigh->queue))) { 1008 while ((skb = __skb_dequeue(&neigh->queue))) {
909 ++dev->stats.tx_dropped; 1009 ++dev->stats.tx_dropped;
910 dev_kfree_skb_any(skb); 1010 dev_kfree_skb_any(skb);
911 } 1011 }
912 if (ipoib_cm_get(neigh)) 1012 if (ipoib_cm_get(neigh))
913 ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); 1013 ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1014 ipoib_dbg(netdev_priv(dev),
1015 "neigh free for %06x %pI6\n",
1016 IPOIB_QPN(neigh->daddr),
1017 neigh->daddr + 4);
914 kfree(neigh); 1018 kfree(neigh);
1019 if (atomic_dec_and_test(&priv->ntbl.entries)) {
1020 if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
1021 complete(&priv->ntbl.flushed);
1022 }
1023}
1024
1025static void ipoib_neigh_reclaim(struct rcu_head *rp)
1026{
1027 /* Called as a result of removal from hash table */
1028 struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
1029 /* note TX context may hold another ref */
1030 ipoib_neigh_put(neigh);
915} 1031}
916 1032
917static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) 1033void ipoib_neigh_free(struct ipoib_neigh *neigh)
918{ 1034{
919 parms->neigh_cleanup = ipoib_neigh_cleanup; 1035 struct net_device *dev = neigh->dev;
1036 struct ipoib_dev_priv *priv = netdev_priv(dev);
1037 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1038 struct ipoib_neigh_hash *htbl;
1039 struct ipoib_neigh __rcu **np;
1040 struct ipoib_neigh *n;
1041 u32 hash_val;
1042
1043 write_lock_bh(&ntbl->rwlock);
1044
1045 htbl = rcu_dereference_protected(ntbl->htbl,
1046 lockdep_is_held(&ntbl->rwlock));
1047 if (!htbl)
1048 goto out_unlock;
1049
1050 hash_val = ipoib_addr_hash(htbl, neigh->daddr);
1051 np = &htbl->buckets[hash_val];
1052 for (n = rcu_dereference_protected(*np,
1053 lockdep_is_held(&ntbl->rwlock));
1054 n != NULL;
1055 n = rcu_dereference_protected(neigh->hnext,
1056 lockdep_is_held(&ntbl->rwlock))) {
1057 if (n == neigh) {
1058 /* found */
1059 rcu_assign_pointer(*np,
1060 rcu_dereference_protected(neigh->hnext,
1061 lockdep_is_held(&ntbl->rwlock)));
1062 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1063 goto out_unlock;
1064 } else {
1065 np = &n->hnext;
1066 }
1067 }
1068
1069out_unlock:
1070 write_unlock_bh(&ntbl->rwlock);
1071
1072}
1073
1074static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1075{
1076 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1077 struct ipoib_neigh_hash *htbl;
1078 struct ipoib_neigh **buckets;
1079 u32 size;
1080
1081 clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1082 ntbl->htbl = NULL;
1083 rwlock_init(&ntbl->rwlock);
1084 htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
1085 if (!htbl)
1086 return -ENOMEM;
1087 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1088 size = roundup_pow_of_two(arp_tbl.gc_thresh3);
1089 buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
1090 if (!buckets) {
1091 kfree(htbl);
1092 return -ENOMEM;
1093 }
1094 htbl->size = size;
1095 htbl->mask = (size - 1);
1096 htbl->buckets = buckets;
1097 ntbl->htbl = htbl;
1098 atomic_set(&ntbl->entries, 0);
1099
1100 /* start garbage collection */
1101 clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1102 queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
1103 arp_tbl.gc_interval);
920 1104
921 return 0; 1105 return 0;
922} 1106}
923 1107
1108static void neigh_hash_free_rcu(struct rcu_head *head)
1109{
1110 struct ipoib_neigh_hash *htbl = container_of(head,
1111 struct ipoib_neigh_hash,
1112 rcu);
1113 struct ipoib_neigh __rcu **buckets = htbl->buckets;
1114
1115 kfree(buckets);
1116 kfree(htbl);
1117}
1118
1119void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
1120{
1121 struct ipoib_dev_priv *priv = netdev_priv(dev);
1122 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1123 struct ipoib_neigh_hash *htbl;
1124 unsigned long flags;
1125 int i;
1126
1127 /* remove all neigh connected to a given path or mcast */
1128 write_lock_bh(&ntbl->rwlock);
1129
1130 htbl = rcu_dereference_protected(ntbl->htbl,
1131 lockdep_is_held(&ntbl->rwlock));
1132
1133 if (!htbl)
1134 goto out_unlock;
1135
1136 for (i = 0; i < htbl->size; i++) {
1137 struct ipoib_neigh *neigh;
1138 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1139
1140 while ((neigh = rcu_dereference_protected(*np,
1141 lockdep_is_held(&ntbl->rwlock))) != NULL) {
1142 /* delete neighs belong to this parent */
1143 if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
1144 rcu_assign_pointer(*np,
1145 rcu_dereference_protected(neigh->hnext,
1146 lockdep_is_held(&ntbl->rwlock)));
1147 /* remove from parent list */
1148 spin_lock_irqsave(&priv->lock, flags);
1149 list_del(&neigh->list);
1150 spin_unlock_irqrestore(&priv->lock, flags);
1151 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1152 } else {
1153 np = &neigh->hnext;
1154 }
1155
1156 }
1157 }
1158out_unlock:
1159 write_unlock_bh(&ntbl->rwlock);
1160}
1161
1162static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
1163{
1164 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1165 struct ipoib_neigh_hash *htbl;
1166 unsigned long flags;
1167 int i;
1168
1169 write_lock_bh(&ntbl->rwlock);
1170
1171 htbl = rcu_dereference_protected(ntbl->htbl,
1172 lockdep_is_held(&ntbl->rwlock));
1173 if (!htbl)
1174 goto out_unlock;
1175
1176 for (i = 0; i < htbl->size; i++) {
1177 struct ipoib_neigh *neigh;
1178 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1179
1180 while ((neigh = rcu_dereference_protected(*np,
1181 lockdep_is_held(&ntbl->rwlock))) != NULL) {
1182 rcu_assign_pointer(*np,
1183 rcu_dereference_protected(neigh->hnext,
1184 lockdep_is_held(&ntbl->rwlock)));
1185 /* remove from path/mc list */
1186 spin_lock_irqsave(&priv->lock, flags);
1187 list_del(&neigh->list);
1188 spin_unlock_irqrestore(&priv->lock, flags);
1189 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1190 }
1191 }
1192
1193 rcu_assign_pointer(ntbl->htbl, NULL);
1194 call_rcu(&htbl->rcu, neigh_hash_free_rcu);
1195
1196out_unlock:
1197 write_unlock_bh(&ntbl->rwlock);
1198}
1199
1200static void ipoib_neigh_hash_uninit(struct net_device *dev)
1201{
1202 struct ipoib_dev_priv *priv = netdev_priv(dev);
1203 int stopped;
1204
1205 ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1206 init_completion(&priv->ntbl.flushed);
1207 set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1208
1209 /* Stop GC if called at init fail need to cancel work */
1210 stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1211 if (!stopped)
1212 cancel_delayed_work(&priv->neigh_reap_task);
1213
1214 if (atomic_read(&priv->ntbl.entries)) {
1215 ipoib_flush_neighs(priv);
1216 wait_for_completion(&priv->ntbl.flushed);
1217 }
1218}
1219
1220
924int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) 1221int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
925{ 1222{
926 struct ipoib_dev_priv *priv = netdev_priv(dev); 1223 struct ipoib_dev_priv *priv = netdev_priv(dev);
927 1224
1225 if (ipoib_neigh_hash_init(priv) < 0)
1226 goto out;
928 /* Allocate RX/TX "rings" to hold queued skbs */ 1227 /* Allocate RX/TX "rings" to hold queued skbs */
929 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, 1228 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
930 GFP_KERNEL); 1229 GFP_KERNEL);
931 if (!priv->rx_ring) { 1230 if (!priv->rx_ring) {
932 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", 1231 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
933 ca->name, ipoib_recvq_size); 1232 ca->name, ipoib_recvq_size);
934 goto out; 1233 goto out_neigh_hash_cleanup;
935 } 1234 }
936 1235
937 priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); 1236 priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
@@ -954,6 +1253,8 @@ out_tx_ring_cleanup:
954out_rx_ring_cleanup: 1253out_rx_ring_cleanup:
955 kfree(priv->rx_ring); 1254 kfree(priv->rx_ring);
956 1255
1256out_neigh_hash_cleanup:
1257 ipoib_neigh_hash_uninit(dev);
957out: 1258out:
958 return -ENOMEM; 1259 return -ENOMEM;
959} 1260}
@@ -966,6 +1267,9 @@ void ipoib_dev_cleanup(struct net_device *dev)
966 1267
967 /* Delete any child interfaces first */ 1268 /* Delete any child interfaces first */
968 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { 1269 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1270 /* Stop GC on child */
1271 set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
1272 cancel_delayed_work(&cpriv->neigh_reap_task);
969 unregister_netdev(cpriv->dev); 1273 unregister_netdev(cpriv->dev);
970 ipoib_dev_cleanup(cpriv->dev); 1274 ipoib_dev_cleanup(cpriv->dev);
971 free_netdev(cpriv->dev); 1275 free_netdev(cpriv->dev);
@@ -978,6 +1282,8 @@ void ipoib_dev_cleanup(struct net_device *dev)
978 1282
979 priv->rx_ring = NULL; 1283 priv->rx_ring = NULL;
980 priv->tx_ring = NULL; 1284 priv->tx_ring = NULL;
1285
1286 ipoib_neigh_hash_uninit(dev);
981} 1287}
982 1288
983static const struct header_ops ipoib_header_ops = { 1289static const struct header_ops ipoib_header_ops = {
@@ -992,7 +1298,6 @@ static const struct net_device_ops ipoib_netdev_ops = {
992 .ndo_start_xmit = ipoib_start_xmit, 1298 .ndo_start_xmit = ipoib_start_xmit,
993 .ndo_tx_timeout = ipoib_timeout, 1299 .ndo_tx_timeout = ipoib_timeout,
994 .ndo_set_rx_mode = ipoib_set_mcast_list, 1300 .ndo_set_rx_mode = ipoib_set_mcast_list,
995 .ndo_neigh_setup = ipoib_neigh_setup_dev,
996}; 1301};
997 1302
998static void ipoib_setup(struct net_device *dev) 1303static void ipoib_setup(struct net_device *dev)
@@ -1041,6 +1346,7 @@ static void ipoib_setup(struct net_device *dev)
1041 INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); 1346 INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
1042 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); 1347 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
1043 INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); 1348 INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1349 INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
1044} 1350}
1045 1351
1046struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) 1352struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
@@ -1281,6 +1587,9 @@ sysfs_failed:
1281 1587
1282register_failed: 1588register_failed:
1283 ib_unregister_event_handler(&priv->event_handler); 1589 ib_unregister_event_handler(&priv->event_handler);
1590 /* Stop GC if started before flush */
1591 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1592 cancel_delayed_work(&priv->neigh_reap_task);
1284 flush_workqueue(ipoib_workqueue); 1593 flush_workqueue(ipoib_workqueue);
1285 1594
1286event_failed: 1595event_failed:
@@ -1347,6 +1656,9 @@ static void ipoib_remove_one(struct ib_device *device)
1347 dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); 1656 dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
1348 rtnl_unlock(); 1657 rtnl_unlock();
1349 1658
1659 /* Stop GC */
1660 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1661 cancel_delayed_work(&priv->neigh_reap_task);
1350 flush_workqueue(ipoib_workqueue); 1662 flush_workqueue(ipoib_workqueue);
1351 1663
1352 unregister_netdev(priv->dev); 1664 unregister_netdev(priv->dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 7cecb16d3d48..13f4aa7593c8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -69,28 +69,13 @@ struct ipoib_mcast_iter {
69static void ipoib_mcast_free(struct ipoib_mcast *mcast) 69static void ipoib_mcast_free(struct ipoib_mcast *mcast)
70{ 70{
71 struct net_device *dev = mcast->dev; 71 struct net_device *dev = mcast->dev;
72 struct ipoib_dev_priv *priv = netdev_priv(dev);
73 struct ipoib_neigh *neigh, *tmp;
74 int tx_dropped = 0; 72 int tx_dropped = 0;
75 73
76 ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", 74 ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n",
77 mcast->mcmember.mgid.raw); 75 mcast->mcmember.mgid.raw);
78 76
79 spin_lock_irq(&priv->lock); 77 /* remove all neigh connected to this mcast */
80 78 ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw);
81 list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
82 /*
83 * It's safe to call ipoib_put_ah() inside priv->lock
84 * here, because we know that mcast->ah will always
85 * hold one more reference, so ipoib_put_ah() will
86 * never do more than decrement the ref count.
87 */
88 if (neigh->ah)
89 ipoib_put_ah(neigh->ah);
90 ipoib_neigh_free(dev, neigh);
91 }
92
93 spin_unlock_irq(&priv->lock);
94 79
95 if (mcast->ah) 80 if (mcast->ah)
96 ipoib_put_ah(mcast->ah); 81 ipoib_put_ah(mcast->ah);
@@ -655,17 +640,12 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
655 return 0; 640 return 0;
656} 641}
657 642
658void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) 643void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
659{ 644{
660 struct ipoib_dev_priv *priv = netdev_priv(dev); 645 struct ipoib_dev_priv *priv = netdev_priv(dev);
661 struct dst_entry *dst = skb_dst(skb);
662 struct ipoib_mcast *mcast; 646 struct ipoib_mcast *mcast;
663 struct neighbour *n;
664 unsigned long flags; 647 unsigned long flags;
665 648 void *mgid = daddr + 4;
666 n = NULL;
667 if (dst)
668 n = dst_neigh_lookup_skb(dst, skb);
669 649
670 spin_lock_irqsave(&priv->lock, flags); 650 spin_lock_irqsave(&priv->lock, flags);
671 651
@@ -721,28 +701,29 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
721 701
722out: 702out:
723 if (mcast && mcast->ah) { 703 if (mcast && mcast->ah) {
724 if (n) { 704 struct ipoib_neigh *neigh;
725 if (!*to_ipoib_neigh(n)) { 705
726 struct ipoib_neigh *neigh; 706 spin_unlock_irqrestore(&priv->lock, flags);
727 707 neigh = ipoib_neigh_get(dev, daddr);
728 neigh = ipoib_neigh_alloc(n, skb->dev); 708 spin_lock_irqsave(&priv->lock, flags);
729 if (neigh) { 709 if (!neigh) {
730 kref_get(&mcast->ah->ref); 710 spin_unlock_irqrestore(&priv->lock, flags);
731 neigh->ah = mcast->ah; 711 neigh = ipoib_neigh_alloc(daddr, dev);
732 list_add_tail(&neigh->list, 712 spin_lock_irqsave(&priv->lock, flags);
733 &mcast->neigh_list); 713 if (neigh) {
734 } 714 kref_get(&mcast->ah->ref);
715 neigh->ah = mcast->ah;
716 list_add_tail(&neigh->list, &mcast->neigh_list);
735 } 717 }
736 neigh_release(n);
737 } 718 }
738 spin_unlock_irqrestore(&priv->lock, flags); 719 spin_unlock_irqrestore(&priv->lock, flags);
739 ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); 720 ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
721 if (neigh)
722 ipoib_neigh_put(neigh);
740 return; 723 return;
741 } 724 }
742 725
743unlock: 726unlock:
744 if (n)
745 neigh_release(n);
746 spin_unlock_irqrestore(&priv->lock, flags); 727 spin_unlock_irqrestore(&priv->lock, flags);
747} 728}
748 729