author	Moni Shoua <monis@Voltaire.COM>	2008-07-15 02:48:49 -0400
committer	Roland Dreier <rolandd@cisco.com>	2008-07-15 02:48:49 -0400
commit	ee1e2c82c245a5fb2864e9dbcdaab3390fde3fcc (patch)
tree	2bd6686dcee9524352c1afce3cb772373ec83d5f
parent	038919f29682b00ea95506e959210fc72d1aaf64 (diff)
IPoIB: Refresh paths instead of flushing them on SM change events
The patch tries to solve the problem of the device going down and paths being flushed on an SM change event. The method is to mark the paths as candidates for refresh (by setting the new valid flag to 0), and to wait for an ARP probe to trigger a new path record query.

The solution requires a different and less intrusive handling of the SM change event. For that, the second argument of the flush function changes its meaning from a boolean flag to a level. In most cases, SM failover doesn't cause a LID change, so traffic won't stop. In the rare cases of a LID change, the remote host (the one that hadn't changed its LID) will lose connectivity until paths are refreshed. This is no worse than the current state. In fact, preventing the device from going down saves packets that would otherwise be lost.

Signed-off-by: Moni Levy <monil@voltaire.com>
Signed-off-by: Moni Shoua <monis@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
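As a quick orientation before the diff: the patch replaces the old boolean pkey_event argument with a three-level flush and routes each port event to one of the levels. Below is a standalone userspace sketch (not part of the patch) of that dispatch; the EV_* names are illustrative stand-ins for the kernel's IB_EVENT_* constants, and the level comments paraphrase the code further down.

/*
 * Sketch only: mirrors the event-to-level mapping that the patched
 * ipoib_event() implements. Compiles with any C compiler; nothing
 * here touches kernel APIs.
 */
#include <stdio.h>

enum ipoib_flush_level {
	IPOIB_FLUSH_LIGHT,	/* mark paths invalid, flush multicast */
	IPOIB_FLUSH_NORMAL,	/* additionally take the IB device down/up */
	IPOIB_FLUSH_HEAVY,	/* additionally stop and reopen the QP */
};

/* Stand-ins for the IB_EVENT_* values the driver really checks. */
enum ib_event_kind {
	EV_SM_CHANGE,
	EV_CLIENT_REREGISTER,
	EV_PORT_ERR,
	EV_PORT_ACTIVE,
	EV_LID_CHANGE,
	EV_PKEY_CHANGE,
};

static enum ipoib_flush_level flush_level_for(enum ib_event_kind ev)
{
	switch (ev) {
	case EV_SM_CHANGE:
	case EV_CLIENT_REREGISTER:
		/* SM failover usually keeps LIDs: refresh paths lazily */
		return IPOIB_FLUSH_LIGHT;
	case EV_PORT_ERR:
	case EV_PORT_ACTIVE:
	case EV_LID_CHANGE:
		return IPOIB_FLUSH_NORMAL;
	case EV_PKEY_CHANGE:
		return IPOIB_FLUSH_HEAVY;
	}
	return IPOIB_FLUSH_NORMAL;	/* unreachable with the enum above */
}

int main(void)
{
	printf("SM change    -> level %d\n", flush_level_for(EV_SM_CHANGE));
	printf("LID change   -> level %d\n", flush_level_for(EV_LID_CHANGE));
	printf("P_Key change -> level %d\n", flush_level_for(EV_PKEY_CHANGE));
	return 0;
}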
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib.h	17
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib_ib.c	42
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib_main.c	44
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib_verbs.c	18
4 files changed, 91 insertions, 30 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 2c522572e3c5..bb19587c5eaf 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -54,6 +54,12 @@
 
 /* constants */
 
+enum ipoib_flush_level {
+	IPOIB_FLUSH_LIGHT,
+	IPOIB_FLUSH_NORMAL,
+	IPOIB_FLUSH_HEAVY
+};
+
 enum {
 	IPOIB_ENCAP_LEN = 4,
 
@@ -284,10 +290,11 @@ struct ipoib_dev_priv {
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
-	struct work_struct flush_task;
+	struct work_struct flush_light;
+	struct work_struct flush_normal;
+	struct work_struct flush_heavy;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
-	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
 	u8 port;
@@ -369,6 +376,7 @@ struct ipoib_path {
 
 	struct rb_node rb_node;
 	struct list_head list;
+	int valid;
 };
 
 struct ipoib_neigh {
@@ -433,11 +441,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(struct work_struct *work);
 
+void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 5d50e5261eed..66cafa20c246 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -902,7 +902,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 }
 
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+				 enum ipoib_flush_level level)
 {
 	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
@@ -915,7 +916,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * the parent is down.
 	 */
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		__ipoib_ib_dev_flush(cpriv, pkey_event);
+		__ipoib_ib_dev_flush(cpriv, level);
 
 	mutex_unlock(&priv->vlan_mutex);
 
@@ -929,7 +930,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		return;
 	}
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
@@ -947,11 +948,15 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 			priv->pkey_index = new_index;
 	}
 
-	ipoib_dbg(priv, "flushing\n");
+	if (level == IPOIB_FLUSH_LIGHT) {
+		ipoib_mark_paths_invalid(dev);
+		ipoib_mcast_dev_flush(dev);
+	}
 
-	ipoib_ib_dev_down(dev, 0);
+	if (level >= IPOIB_FLUSH_NORMAL)
+		ipoib_ib_dev_down(dev, 0);
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		ipoib_ib_dev_stop(dev, 0);
 		ipoib_ib_dev_open(dev);
 	}
@@ -961,27 +966,34 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * we get here, don't bring it back up if it's not configured up
 	 */
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
-		ipoib_ib_dev_up(dev);
+		if (level >= IPOIB_FLUSH_NORMAL)
+			ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_light);
+
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+		container_of(work, struct ipoib_dev_priv, flush_normal);
 
-	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 0);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
 }
 
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+		container_of(work, struct ipoib_dev_priv, flush_heavy);
 
-	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 1);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index fead88f7fb17..b3fd7e8333cf 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -357,6 +357,23 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 
 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 
+void ipoib_mark_paths_invalid(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+
+	spin_lock_irq(&priv->lock);
+
+	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+		ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n",
+			be16_to_cpu(path->pathrec.dlid),
+			IPOIB_GID_ARG(path->pathrec.dgid));
+		path->valid = 0;
+	}
+
+	spin_unlock_irq(&priv->lock);
+}
+
 void ipoib_flush_paths(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -393,6 +410,7 @@ static void path_rec_completion(int status,
 	struct net_device *dev = path->dev;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_ah *ah = NULL;
+	struct ipoib_ah *old_ah;
 	struct ipoib_neigh *neigh, *tn;
 	struct sk_buff_head skqueue;
 	struct sk_buff *skb;
@@ -416,6 +434,7 @@ static void path_rec_completion(int status,
 
 	spin_lock_irqsave(&priv->lock, flags);
 
+	old_ah = path->ah;
 	path->ah = ah;
 
 	if (ah) {
@@ -428,6 +447,17 @@ static void path_rec_completion(int status,
 			__skb_queue_tail(&skqueue, skb);
 
 		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+			if (neigh->ah) {
+				WARN_ON(neigh->ah != old_ah);
+				/*
+				 * Dropping the ah reference inside
+				 * priv->lock is safe here, because we
+				 * will hold one more reference from
+				 * the original value of path->ah (ie
+				 * old_ah).
+				 */
+				ipoib_put_ah(neigh->ah);
+			}
 			kref_get(&path->ah->ref);
 			neigh->ah = path->ah;
 			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
@@ -450,6 +480,7 @@ static void path_rec_completion(int status,
 			while ((skb = __skb_dequeue(&neigh->queue)))
 				__skb_queue_tail(&skqueue, skb);
 		}
+		path->valid = 1;
 	}
 
 	path->query = NULL;
@@ -457,6 +488,9 @@ static void path_rec_completion(int status,
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
+	if (old_ah)
+		ipoib_put_ah(old_ah);
+
 	while ((skb = __skb_dequeue(&skqueue))) {
 		skb->dev = dev;
 		if (dev_queue_xmit(skb))
@@ -630,8 +664,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_lock(&priv->lock);
 
 	path = __path_find(dev, phdr->hwaddr + 4);
-	if (!path) {
-		path = path_rec_create(dev, phdr->hwaddr + 4);
+	if (!path || !path->valid) {
+		if (!path)
+			path = path_rec_create(dev, phdr->hwaddr + 4);
 		if (path) {
 			/* put pseudoheader back on for next time */
 			skb_push(skb, sizeof *phdr);
@@ -1046,9 +1081,10 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush);
+	INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
+	INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
+	INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 7b8fa36f509b..96f9aa79cbbe 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -290,15 +290,17 @@ void ipoib_event(struct ib_event_handler *handler,
 	if (record->element.port_num != priv->port)
 		return;
 
-	if (record->event == IB_EVENT_PORT_ERR ||
-	    record->event == IB_EVENT_PORT_ACTIVE ||
-	    record->event == IB_EVENT_LID_CHANGE ||
-	    record->event == IB_EVENT_SM_CHANGE ||
+	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+		  record->device->name, record->element.port_num);
+
+	if (record->event == IB_EVENT_SM_CHANGE ||
 	    record->event == IB_EVENT_CLIENT_REREGISTER) {
-		ipoib_dbg(priv, "Port state change event\n");
-		queue_work(ipoib_workqueue, &priv->flush_task);
+		queue_work(ipoib_workqueue, &priv->flush_light);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_normal);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
-		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
-		queue_work(ipoib_workqueue, &priv->pkey_event_task);
+		queue_work(ipoib_workqueue, &priv->flush_heavy);
 	}
 }
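A final note on the send-path effect of the valid flag: under the light flush, an invalidated path keeps serving packets through its cached address handle while a fresh path record query runs, which is what avoids the packet loss described in the commit message. A standalone sketch of that pattern follows, with illustrative types rather than the driver's real structures.

/*
 * Sketch only: models the lazy revalidation added to
 * unicast_arp_send(). struct path_entry is an illustrative stand-in
 * for struct ipoib_path; ah is a token for the address handle.
 */
#include <stdbool.h>
#include <stdio.h>

struct path_entry {
	int  ah;		/* cached address handle (0 = none yet) */
	bool valid;		/* cleared by the light flush */
	bool query_pending;	/* a refresh query is in flight */
};

static void start_path_query(struct path_entry *p)
{
	/* the driver issues a new path record query here */
	p->query_pending = true;
}

static void send_packet(struct path_entry *p)
{
	if (!p->valid && !p->query_pending)
		start_path_query(p);	/* refresh in the background */

	if (p->ah)
		printf("tx via cached ah=%d (valid=%d)\n", p->ah, p->valid);
	else
		printf("no ah yet: queue packet until the query completes\n");
}

int main(void)
{
	struct path_entry p = { .ah = 42, .valid = true };

	send_packet(&p);	/* steady state: cached path is used */
	p.valid = false;	/* SM change event: light flush */
	send_packet(&p);	/* old ah still used; refresh kicked off */
	return 0;
}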