diff options
author | Moni Shoua <monis@Voltaire.COM> | 2008-07-15 02:48:49 -0400 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2008-07-15 02:48:49 -0400 |
commit | ee1e2c82c245a5fb2864e9dbcdaab3390fde3fcc (patch) | |
tree | 2bd6686dcee9524352c1afce3cb772373ec83d5f /drivers/infiniband/ulp/ipoib/ipoib_main.c | |
parent | 038919f29682b00ea95506e959210fc72d1aaf64 (diff) |
IPoIB: Refresh paths instead of flushing them on SM change events
The patch tries to solve the problem of the device going down and paths being
flushed on an SM change event. The method is to mark the paths as candidates for
refresh (by setting the new valid flag to 0), and wait for an ARP
probe to trigger a new path record query.
The solution requires a different and less intrusive handling of SM
change event. For that, the second argument of the flush function
changes its meaning from a boolean flag to a level. In most cases, SM
failover doesn't cause LID change so traffic won't stop. In the rare
cases of LID change, the remote host (the one that hadn't changed its
LID) will lose connectivity until paths are refreshed. This is no
worse than the current state. In fact, preventing the device from
going down saves packets that otherwise would be lost.
Signed-off-by: Moni Levy <monil@voltaire.com>
Signed-off-by: Moni Shoua <monis@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/ulp/ipoib/ipoib_main.c')
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_main.c | 44 |
1 files changed, 40 insertions, 4 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index fead88f7fb17..b3fd7e8333cf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c | |||
@@ -357,6 +357,23 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, | |||
357 | 357 | ||
358 | #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ | 358 | #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ |
359 | 359 | ||
360 | void ipoib_mark_paths_invalid(struct net_device *dev) | ||
361 | { | ||
362 | struct ipoib_dev_priv *priv = netdev_priv(dev); | ||
363 | struct ipoib_path *path, *tp; | ||
364 | |||
365 | spin_lock_irq(&priv->lock); | ||
366 | |||
367 | list_for_each_entry_safe(path, tp, &priv->path_list, list) { | ||
368 | ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n", | ||
369 | be16_to_cpu(path->pathrec.dlid), | ||
370 | IPOIB_GID_ARG(path->pathrec.dgid)); | ||
371 | path->valid = 0; | ||
372 | } | ||
373 | |||
374 | spin_unlock_irq(&priv->lock); | ||
375 | } | ||
376 | |||
360 | void ipoib_flush_paths(struct net_device *dev) | 377 | void ipoib_flush_paths(struct net_device *dev) |
361 | { | 378 | { |
362 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 379 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
@@ -393,6 +410,7 @@ static void path_rec_completion(int status, | |||
393 | struct net_device *dev = path->dev; | 410 | struct net_device *dev = path->dev; |
394 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 411 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
395 | struct ipoib_ah *ah = NULL; | 412 | struct ipoib_ah *ah = NULL; |
413 | struct ipoib_ah *old_ah; | ||
396 | struct ipoib_neigh *neigh, *tn; | 414 | struct ipoib_neigh *neigh, *tn; |
397 | struct sk_buff_head skqueue; | 415 | struct sk_buff_head skqueue; |
398 | struct sk_buff *skb; | 416 | struct sk_buff *skb; |
@@ -416,6 +434,7 @@ static void path_rec_completion(int status, | |||
416 | 434 | ||
417 | spin_lock_irqsave(&priv->lock, flags); | 435 | spin_lock_irqsave(&priv->lock, flags); |
418 | 436 | ||
437 | old_ah = path->ah; | ||
419 | path->ah = ah; | 438 | path->ah = ah; |
420 | 439 | ||
421 | if (ah) { | 440 | if (ah) { |
@@ -428,6 +447,17 @@ static void path_rec_completion(int status, | |||
428 | __skb_queue_tail(&skqueue, skb); | 447 | __skb_queue_tail(&skqueue, skb); |
429 | 448 | ||
430 | list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { | 449 | list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { |
450 | if (neigh->ah) { | ||
451 | WARN_ON(neigh->ah != old_ah); | ||
452 | /* | ||
453 | * Dropping the ah reference inside | ||
454 | * priv->lock is safe here, because we | ||
455 | * will hold one more reference from | ||
456 | * the original value of path->ah (ie | ||
457 | * old_ah). | ||
458 | */ | ||
459 | ipoib_put_ah(neigh->ah); | ||
460 | } | ||
431 | kref_get(&path->ah->ref); | 461 | kref_get(&path->ah->ref); |
432 | neigh->ah = path->ah; | 462 | neigh->ah = path->ah; |
433 | memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, | 463 | memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, |
@@ -450,6 +480,7 @@ static void path_rec_completion(int status, | |||
450 | while ((skb = __skb_dequeue(&neigh->queue))) | 480 | while ((skb = __skb_dequeue(&neigh->queue))) |
451 | __skb_queue_tail(&skqueue, skb); | 481 | __skb_queue_tail(&skqueue, skb); |
452 | } | 482 | } |
483 | path->valid = 1; | ||
453 | } | 484 | } |
454 | 485 | ||
455 | path->query = NULL; | 486 | path->query = NULL; |
@@ -457,6 +488,9 @@ static void path_rec_completion(int status, | |||
457 | 488 | ||
458 | spin_unlock_irqrestore(&priv->lock, flags); | 489 | spin_unlock_irqrestore(&priv->lock, flags); |
459 | 490 | ||
491 | if (old_ah) | ||
492 | ipoib_put_ah(old_ah); | ||
493 | |||
460 | while ((skb = __skb_dequeue(&skqueue))) { | 494 | while ((skb = __skb_dequeue(&skqueue))) { |
461 | skb->dev = dev; | 495 | skb->dev = dev; |
462 | if (dev_queue_xmit(skb)) | 496 | if (dev_queue_xmit(skb)) |
@@ -630,8 +664,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, | |||
630 | spin_lock(&priv->lock); | 664 | spin_lock(&priv->lock); |
631 | 665 | ||
632 | path = __path_find(dev, phdr->hwaddr + 4); | 666 | path = __path_find(dev, phdr->hwaddr + 4); |
633 | if (!path) { | 667 | if (!path || !path->valid) { |
634 | path = path_rec_create(dev, phdr->hwaddr + 4); | 668 | if (!path) |
669 | path = path_rec_create(dev, phdr->hwaddr + 4); | ||
635 | if (path) { | 670 | if (path) { |
636 | /* put pseudoheader back on for next time */ | 671 | /* put pseudoheader back on for next time */ |
637 | skb_push(skb, sizeof *phdr); | 672 | skb_push(skb, sizeof *phdr); |
@@ -1046,9 +1081,10 @@ static void ipoib_setup(struct net_device *dev) | |||
1046 | INIT_LIST_HEAD(&priv->multicast_list); | 1081 | INIT_LIST_HEAD(&priv->multicast_list); |
1047 | 1082 | ||
1048 | INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); | 1083 | INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); |
1049 | INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event); | ||
1050 | INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); | 1084 | INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); |
1051 | INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush); | 1085 | INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); |
1086 | INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); | ||
1087 | INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); | ||
1052 | INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); | 1088 | INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); |
1053 | INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); | 1089 | INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); |
1054 | } | 1090 | } |