aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDoug Ledford <dledford@redhat.com>2015-02-21 19:27:05 -0500
committerDoug Ledford <dledford@redhat.com>2015-04-15 16:06:18 -0400
commit69911416d87d6673c48d23a9fbc060e85f41fc73 (patch)
treeb58e383e3a4cb3932039d61078f3ab28692d0224
parentefc82eeeae4ece716091d8540079b7f276ca1ad5 (diff)
IB/ipoib: fix MCAST_FLAG_BUSY usage
Commit a9c8ba5884 ("IPoIB: Fix usage of uninitialized multicast objects") added a new flag MCAST_JOIN_STARTED, but was not very strict in how it was used. We didn't always initialize the completion struct before we set the flag, and we didn't always call complete on the completion struct from all paths that complete it. And when we did complete it, sometimes we continued to touch the mcast entry after the completion, opening us up to possible use after free issues. This made it less than totally effective, and certainly made its use confusing. And in the flush function we would use the presence of this flag to signal that we should wait on the completion struct, but we never cleared this flag, ever. In order to make things clearer and aid in resolving the rtnl deadlock bug I've been chasing, I cleaned this up a bit. 1) Remove the MCAST_JOIN_STARTED flag entirely 2) Change MCAST_FLAG_BUSY so it now only means a join is in-flight 3) Test mcast->mc directly to see if we have completed ib_sa_join_multicast (using IS_ERR_OR_NULL) 4) Make sure that before setting MCAST_FLAG_BUSY we always initialize the mcast->done completion struct 5) Make sure that before calling complete(&mcast->done), we always clear the MCAST_FLAG_BUSY bit 6) Take the mcast_mutex before we call ib_sa_join_multicast and also take the mutex in our join callback. This forces ib_sa_join_multicast to return and set mcast->mc before we process the callback. This way, our callback can safely clear mcast->mc if there is an error on the join and we will do the right thing as a result in mcast_dev_flush. 7) Because we need the mutex to synchronize mcast->mc, we can no longer call mcast_sendonly_join directly from mcast_send and instead must add sendonly join processing to the mcast_join_task 8) Make MCAST_RUN mean that we have a working mcast subsystem, not that we have a running task. We know when we need to reschedule our join task thread and don't need a flag to tell us. 
9) Add a helper for rescheduling the join task thread A number of different races are resolved with these changes. These races existed with the old MCAST_FLAG_BUSY usage, the MCAST_JOIN_STARTED flag was an attempt to address them, and while it helped, a determined effort could still trip things up. One race looks something like this: Thread 1 Thread 2 ib_sa_join_multicast (as part of running restart mcast task) alloc member call callback ifconfig ib0 down wait_for_completion callback call completes wait_for_completion in mcast_dev_flush completes mcast->mc is IS_ERR_OR_NULL so we skip ib_sa_leave_multicast return from callback return from ib_sa_join_multicast set mcast->mc = return from ib_sa_join_multicast We now have a permanently unbalanced join/leave issue that trips up the refcounting in core/multicast.c Another looks like this: Thread 1 Thread 2 Thread 3 ib_sa_join_multicast ifconfig ib0 down priv->broadcast = NULL join_complete wait_for_completion mcast->mc is not yet set, so don't clear return from ib_sa_join_multicast and set mcast->mc complete return -EAGAIN (making mcast->mc invalid) call ib_sa_multicast_leave on invalid mcast->mc, hang forever By holding the mutex around ib_sa_join_multicast and taking the mutex early in the callback, we force mcast->mc to be valid at the time we run the callback. This allows us to clear mcast->mc if there is an error and the join is going to fail. We do this before we complete the mcast. In this way, mcast_dev_flush always sees consistent state with regard to mcast->mc membership at the time that the wait_for_completion() returns. Signed-off-by: Doug Ledford <dledford@redhat.com>
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h11
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c355
2 files changed, 238 insertions, 128 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 9ef432ae72e8..c79dcd5ee8ad 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -98,9 +98,15 @@ enum {
98 98
99 IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ 99 IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */
100 IPOIB_MCAST_FLAG_SENDONLY = 1, 100 IPOIB_MCAST_FLAG_SENDONLY = 1,
101 IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ 101 /*
102 * For IPOIB_MCAST_FLAG_BUSY
103 * When set, in flight join and mcast->mc is unreliable
104 * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or
105 * haven't started yet
106 * When clear and mcast->mc is valid pointer, join was successful
107 */
108 IPOIB_MCAST_FLAG_BUSY = 2,
102 IPOIB_MCAST_FLAG_ATTACHED = 3, 109 IPOIB_MCAST_FLAG_ATTACHED = 3,
103 IPOIB_MCAST_JOIN_STARTED = 4,
104 110
105 MAX_SEND_CQE = 16, 111 MAX_SEND_CQE = 16,
106 IPOIB_CM_COPYBREAK = 256, 112 IPOIB_CM_COPYBREAK = 256,
@@ -148,6 +154,7 @@ struct ipoib_mcast {
148 154
149 unsigned long created; 155 unsigned long created;
150 unsigned long backoff; 156 unsigned long backoff;
157 unsigned long delay_until;
151 158
152 unsigned long flags; 159 unsigned long flags;
153 unsigned char logcount; 160 unsigned char logcount;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index bb1b69904f96..277e7ac7c4db 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -66,6 +66,48 @@ struct ipoib_mcast_iter {
66 unsigned int send_only; 66 unsigned int send_only;
67}; 67};
68 68
69/*
70 * This should be called with the mcast_mutex held
71 */
72static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
73 struct ipoib_mcast *mcast,
74 bool delay)
75{
76 if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
77 return;
78
79 /*
80 * We will be scheduling *something*, so cancel whatever is
81 * currently scheduled first
82 */
83 cancel_delayed_work(&priv->mcast_task);
84 if (mcast && delay) {
85 /*
86 * We had a failure and want to schedule a retry later
87 */
88 mcast->backoff *= 2;
89 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
90 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
91 mcast->delay_until = jiffies + (mcast->backoff * HZ);
92 /*
93 * Mark this mcast for its delay, but restart the
94 * task immediately. The join task will make sure to
95 * clear out all entries without delays, and then
96 * schedule itself to run again when the earliest
97 * delay expires
98 */
99 queue_delayed_work(priv->wq, &priv->mcast_task, 0);
100 } else if (delay) {
101 /*
102 * Special case of retrying after a failure to
103 * allocate the broadcast multicast group, wait
104 * 1 second and try again
105 */
106 queue_delayed_work(priv->wq, &priv->mcast_task, HZ);
107 } else
108 queue_delayed_work(priv->wq, &priv->mcast_task, 0);
109}
110
69static void ipoib_mcast_free(struct ipoib_mcast *mcast) 111static void ipoib_mcast_free(struct ipoib_mcast *mcast)
70{ 112{
71 struct net_device *dev = mcast->dev; 113 struct net_device *dev = mcast->dev;
@@ -103,6 +145,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
103 145
104 mcast->dev = dev; 146 mcast->dev = dev;
105 mcast->created = jiffies; 147 mcast->created = jiffies;
148 mcast->delay_until = jiffies;
106 mcast->backoff = 1; 149 mcast->backoff = 1;
107 150
108 INIT_LIST_HEAD(&mcast->list); 151 INIT_LIST_HEAD(&mcast->list);
@@ -270,17 +313,31 @@ ipoib_mcast_sendonly_join_complete(int status,
270{ 313{
271 struct ipoib_mcast *mcast = multicast->context; 314 struct ipoib_mcast *mcast = multicast->context;
272 struct net_device *dev = mcast->dev; 315 struct net_device *dev = mcast->dev;
316 struct ipoib_dev_priv *priv = netdev_priv(dev);
317
318 /*
319 * We have to take the mutex to force mcast_sendonly_join to
320 * return from ib_sa_multicast_join and set mcast->mc to a
321 * valid value. Otherwise we were racing with ourselves in
322 * that we might fail here, but get a valid return from
323 * ib_sa_multicast_join after we had cleared mcast->mc here,
324 * resulting in mis-matched joins and leaves and a deadlock
325 */
326 mutex_lock(&mcast_mutex);
273 327
274 /* We trap for port events ourselves. */ 328 /* We trap for port events ourselves. */
275 if (status == -ENETRESET) 329 if (status == -ENETRESET) {
276 return 0; 330 status = 0;
331 goto out;
332 }
277 333
278 if (!status) 334 if (!status)
279 status = ipoib_mcast_join_finish(mcast, &multicast->rec); 335 status = ipoib_mcast_join_finish(mcast, &multicast->rec);
280 336
281 if (status) { 337 if (status) {
282 if (mcast->logcount++ < 20) 338 if (mcast->logcount++ < 20)
283 ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", 339 ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast "
340 "join failed for %pI6, status %d\n",
284 mcast->mcmember.mgid.raw, status); 341 mcast->mcmember.mgid.raw, status);
285 342
286 /* Flush out any queued packets */ 343 /* Flush out any queued packets */
@@ -290,11 +347,18 @@ ipoib_mcast_sendonly_join_complete(int status,
290 dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); 347 dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
291 } 348 }
292 netif_tx_unlock_bh(dev); 349 netif_tx_unlock_bh(dev);
293 350 __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
294 /* Clear the busy flag so we try again */ 351 } else {
295 status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, 352 mcast->backoff = 1;
296 &mcast->flags); 353 mcast->delay_until = jiffies;
354 __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
297 } 355 }
356out:
357 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
358 if (status)
359 mcast->mc = NULL;
360 complete(&mcast->done);
361 mutex_unlock(&mcast_mutex);
298 return status; 362 return status;
299} 363}
300 364
@@ -312,19 +376,18 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
312 int ret = 0; 376 int ret = 0;
313 377
314 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { 378 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
315 ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); 379 ipoib_dbg_mcast(priv, "device shutting down, no sendonly "
380 "multicast joins\n");
381 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
382 complete(&mcast->done);
316 return -ENODEV; 383 return -ENODEV;
317 } 384 }
318 385
319 if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
320 ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
321 return -EBUSY;
322 }
323
324 rec.mgid = mcast->mcmember.mgid; 386 rec.mgid = mcast->mcmember.mgid;
325 rec.port_gid = priv->local_gid; 387 rec.port_gid = priv->local_gid;
326 rec.pkey = cpu_to_be16(priv->pkey); 388 rec.pkey = cpu_to_be16(priv->pkey);
327 389
390 mutex_lock(&mcast_mutex);
328 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, 391 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
329 priv->port, &rec, 392 priv->port, &rec,
330 IB_SA_MCMEMBER_REC_MGID | 393 IB_SA_MCMEMBER_REC_MGID |
@@ -337,12 +400,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
337 if (IS_ERR(mcast->mc)) { 400 if (IS_ERR(mcast->mc)) {
338 ret = PTR_ERR(mcast->mc); 401 ret = PTR_ERR(mcast->mc);
339 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 402 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
340 ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", 403 ipoib_warn(priv, "ib_sa_join_multicast for sendonly join "
341 ret); 404 "failed (ret = %d)\n", ret);
405 complete(&mcast->done);
342 } else { 406 } else {
343 ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", 407 ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting "
344 mcast->mcmember.mgid.raw); 408 "sendonly join\n", mcast->mcmember.mgid.raw);
345 } 409 }
410 mutex_unlock(&mcast_mutex);
346 411
347 return ret; 412 return ret;
348} 413}
@@ -390,6 +455,16 @@ static int ipoib_mcast_join_complete(int status,
390 ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", 455 ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
391 mcast->mcmember.mgid.raw, status); 456 mcast->mcmember.mgid.raw, status);
392 457
458 /*
459 * We have to take the mutex to force mcast_join to
460 * return from ib_sa_multicast_join and set mcast->mc to a
461 * valid value. Otherwise we were racing with ourselves in
462 * that we might fail here, but get a valid return from
463 * ib_sa_multicast_join after we had cleared mcast->mc here,
464 * resulting in mis-matched joins and leaves and a deadlock
465 */
466 mutex_lock(&mcast_mutex);
467
393 /* We trap for port events ourselves. */ 468 /* We trap for port events ourselves. */
394 if (status == -ENETRESET) { 469 if (status == -ENETRESET) {
395 status = 0; 470 status = 0;
@@ -401,10 +476,8 @@ static int ipoib_mcast_join_complete(int status,
401 476
402 if (!status) { 477 if (!status) {
403 mcast->backoff = 1; 478 mcast->backoff = 1;
404 mutex_lock(&mcast_mutex); 479 mcast->delay_until = jiffies;
405 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) 480 __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
406 queue_delayed_work(priv->wq, &priv->mcast_task, 0);
407 mutex_unlock(&mcast_mutex);
408 481
409 /* 482 /*
410 * Defer carrier on work to priv->wq to avoid a 483 * Defer carrier on work to priv->wq to avoid a
@@ -412,37 +485,26 @@ static int ipoib_mcast_join_complete(int status,
412 */ 485 */
413 if (mcast == priv->broadcast) 486 if (mcast == priv->broadcast)
414 queue_work(priv->wq, &priv->carrier_on_task); 487 queue_work(priv->wq, &priv->carrier_on_task);
415 488 } else {
416 status = 0; 489 if (mcast->logcount++ < 20) {
417 goto out; 490 if (status == -ETIMEDOUT || status == -EAGAIN) {
418 } 491 ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
419 492 mcast->mcmember.mgid.raw, status);
420 if (mcast->logcount++ < 20) { 493 } else {
421 if (status == -ETIMEDOUT || status == -EAGAIN) { 494 ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
422 ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", 495 mcast->mcmember.mgid.raw, status);
423 mcast->mcmember.mgid.raw, status); 496 }
424 } else {
425 ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
426 mcast->mcmember.mgid.raw, status);
427 } 497 }
428 }
429
430 mcast->backoff *= 2;
431 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
432 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
433 498
434 /* Clear the busy flag so we try again */ 499 /* Requeue this join task with a backoff delay */
435 status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 500 __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
436 501 }
437 mutex_lock(&mcast_mutex);
438 spin_lock_irq(&priv->lock);
439 if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
440 queue_delayed_work(priv->wq, &priv->mcast_task,
441 mcast->backoff * HZ);
442 spin_unlock_irq(&priv->lock);
443 mutex_unlock(&mcast_mutex);
444out: 502out:
503 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
504 if (status)
505 mcast->mc = NULL;
445 complete(&mcast->done); 506 complete(&mcast->done);
507 mutex_unlock(&mcast_mutex);
446 return status; 508 return status;
447} 509}
448 510
@@ -491,29 +553,18 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
491 rec.hop_limit = priv->broadcast->mcmember.hop_limit; 553 rec.hop_limit = priv->broadcast->mcmember.hop_limit;
492 } 554 }
493 555
494 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 556 mutex_lock(&mcast_mutex);
495 init_completion(&mcast->done);
496 set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
497
498 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, 557 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
499 &rec, comp_mask, GFP_KERNEL, 558 &rec, comp_mask, GFP_KERNEL,
500 ipoib_mcast_join_complete, mcast); 559 ipoib_mcast_join_complete, mcast);
501 if (IS_ERR(mcast->mc)) { 560 if (IS_ERR(mcast->mc)) {
502 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 561 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
503 complete(&mcast->done);
504 ret = PTR_ERR(mcast->mc); 562 ret = PTR_ERR(mcast->mc);
505 ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); 563 ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
506 564 __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
507 mcast->backoff *= 2; 565 complete(&mcast->done);
508 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
509 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
510
511 mutex_lock(&mcast_mutex);
512 if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
513 queue_delayed_work(priv->wq, &priv->mcast_task,
514 mcast->backoff * HZ);
515 mutex_unlock(&mcast_mutex);
516 } 566 }
567 mutex_unlock(&mcast_mutex);
517} 568}
518 569
519void ipoib_mcast_join_task(struct work_struct *work) 570void ipoib_mcast_join_task(struct work_struct *work)
@@ -522,6 +573,9 @@ void ipoib_mcast_join_task(struct work_struct *work)
522 container_of(work, struct ipoib_dev_priv, mcast_task.work); 573 container_of(work, struct ipoib_dev_priv, mcast_task.work);
523 struct net_device *dev = priv->dev; 574 struct net_device *dev = priv->dev;
524 struct ib_port_attr port_attr; 575 struct ib_port_attr port_attr;
576 unsigned long delay_until = 0;
577 struct ipoib_mcast *mcast = NULL;
578 int create = 1;
525 579
526 if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) 580 if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
527 return; 581 return;
@@ -539,64 +593,102 @@ void ipoib_mcast_join_task(struct work_struct *work)
539 else 593 else
540 memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); 594 memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
541 595
596 /*
597 * We have to hold the mutex to keep from racing with the join
598 * completion threads on setting flags on mcasts, and we have
599 * to hold the priv->lock because dev_flush will remove entries
600 * out from underneath us, so at a minimum we need the lock
601 * through the time that we do the for_each loop of the mcast
602 * list or else dev_flush can make us oops.
603 */
604 mutex_lock(&mcast_mutex);
605 spin_lock_irq(&priv->lock);
606 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
607 goto out;
608
542 if (!priv->broadcast) { 609 if (!priv->broadcast) {
543 struct ipoib_mcast *broadcast; 610 struct ipoib_mcast *broadcast;
544 611
545 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) 612 broadcast = ipoib_mcast_alloc(dev, 0);
546 return;
547
548 broadcast = ipoib_mcast_alloc(dev, 1);
549 if (!broadcast) { 613 if (!broadcast) {
550 ipoib_warn(priv, "failed to allocate broadcast group\n"); 614 ipoib_warn(priv, "failed to allocate broadcast group\n");
551 mutex_lock(&mcast_mutex); 615 /*
552 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) 616 * Restart us after a 1 second delay to retry
553 queue_delayed_work(priv->wq, &priv->mcast_task, 617 * creating our broadcast group and attaching to
554 HZ); 618 * it. Until this succeeds, this ipoib dev is
555 mutex_unlock(&mcast_mutex); 619 * completely stalled (multicast wise).
556 return; 620 */
621 __ipoib_mcast_schedule_join_thread(priv, NULL, 1);
622 goto out;
557 } 623 }
558 624
559 spin_lock_irq(&priv->lock);
560 memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, 625 memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
561 sizeof (union ib_gid)); 626 sizeof (union ib_gid));
562 priv->broadcast = broadcast; 627 priv->broadcast = broadcast;
563 628
564 __ipoib_mcast_add(dev, priv->broadcast); 629 __ipoib_mcast_add(dev, priv->broadcast);
565 spin_unlock_irq(&priv->lock);
566 } 630 }
567 631
568 if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { 632 if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
569 if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) 633 if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
570 ipoib_mcast_join(dev, priv->broadcast, 0); 634 !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
571 return; 635 mcast = priv->broadcast;
636 create = 0;
637 if (mcast->backoff > 1 &&
638 time_before(jiffies, mcast->delay_until)) {
639 delay_until = mcast->delay_until;
640 mcast = NULL;
641 }
642 }
643 goto out;
572 } 644 }
573 645
574 while (1) { 646 /*
575 struct ipoib_mcast *mcast = NULL; 647 * We'll never get here until the broadcast group is both allocated
576 648 * and attached
577 spin_lock_irq(&priv->lock); 649 */
578 list_for_each_entry(mcast, &priv->multicast_list, list) { 650 list_for_each_entry(mcast, &priv->multicast_list, list) {
579 if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) 651 if (IS_ERR_OR_NULL(mcast->mc) &&
580 && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) 652 !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
581 && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { 653 !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
654 if (mcast->backoff == 1 ||
655 time_after_eq(jiffies, mcast->delay_until))
582 /* Found the next unjoined group */ 656 /* Found the next unjoined group */
583 break; 657 break;
584 } 658 else if (!delay_until ||
659 time_before(mcast->delay_until, delay_until))
660 delay_until = mcast->delay_until;
585 } 661 }
586 spin_unlock_irq(&priv->lock);
587
588 if (&mcast->list == &priv->multicast_list) {
589 /* All done */
590 break;
591 }
592
593 ipoib_mcast_join(dev, mcast, 1);
594 return;
595 } 662 }
596 663
597 ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); 664 if (&mcast->list == &priv->multicast_list) {
665 /*
666 * All done, unless we have delayed work from
667 * backoff retransmissions, but we will get
668 * restarted when the time is right, so we are
669 * done for now
670 */
671 mcast = NULL;
672 ipoib_dbg_mcast(priv, "successfully joined all "
673 "multicast groups\n");
674 }
598 675
599 clear_bit(IPOIB_MCAST_RUN, &priv->flags); 676out:
677 if (mcast) {
678 init_completion(&mcast->done);
679 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
680 }
681 spin_unlock_irq(&priv->lock);
682 mutex_unlock(&mcast_mutex);
683 if (mcast) {
684 if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
685 ipoib_mcast_sendonly_join(mcast);
686 else
687 ipoib_mcast_join(dev, mcast, create);
688 }
689 if (delay_until)
690 queue_delayed_work(priv->wq, &priv->mcast_task,
691 delay_until - jiffies);
600} 692}
601 693
602int ipoib_mcast_start_thread(struct net_device *dev) 694int ipoib_mcast_start_thread(struct net_device *dev)
@@ -606,8 +698,8 @@ int ipoib_mcast_start_thread(struct net_device *dev)
606 ipoib_dbg_mcast(priv, "starting multicast thread\n"); 698 ipoib_dbg_mcast(priv, "starting multicast thread\n");
607 699
608 mutex_lock(&mcast_mutex); 700 mutex_lock(&mcast_mutex);
609 if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) 701 set_bit(IPOIB_MCAST_RUN, &priv->flags);
610 queue_delayed_work(priv->wq, &priv->mcast_task, 0); 702 __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
611 mutex_unlock(&mcast_mutex); 703 mutex_unlock(&mcast_mutex);
612 704
613 return 0; 705 return 0;
@@ -635,7 +727,12 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
635 int ret = 0; 727 int ret = 0;
636 728
637 if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) 729 if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
730 ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
731
732 if (!IS_ERR_OR_NULL(mcast->mc))
638 ib_sa_free_multicast(mcast->mc); 733 ib_sa_free_multicast(mcast->mc);
734 else
735 ipoib_dbg(priv, "ipoib_mcast_leave with mcast->mc invalid\n");
639 736
640 if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { 737 if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
641 ipoib_dbg_mcast(priv, "leaving MGID %pI6\n", 738 ipoib_dbg_mcast(priv, "leaving MGID %pI6\n",
@@ -646,7 +743,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
646 be16_to_cpu(mcast->mcmember.mlid)); 743 be16_to_cpu(mcast->mcmember.mlid));
647 if (ret) 744 if (ret)
648 ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); 745 ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
649 } 746 } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
747 ipoib_dbg(priv, "leaving with no mcmember but not a "
748 "SENDONLY join\n");
650 749
651 return 0; 750 return 0;
652} 751}
@@ -687,6 +786,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
687 memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); 786 memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
688 __ipoib_mcast_add(dev, mcast); 787 __ipoib_mcast_add(dev, mcast);
689 list_add_tail(&mcast->list, &priv->multicast_list); 788 list_add_tail(&mcast->list, &priv->multicast_list);
789 __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
690 } 790 }
691 791
692 if (!mcast->ah) { 792 if (!mcast->ah) {
@@ -696,13 +796,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
696 ++dev->stats.tx_dropped; 796 ++dev->stats.tx_dropped;
697 dev_kfree_skb_any(skb); 797 dev_kfree_skb_any(skb);
698 } 798 }
699
700 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
701 ipoib_dbg_mcast(priv, "no address vector, "
702 "but multicast join already started\n");
703 else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
704 ipoib_mcast_sendonly_join(mcast);
705
706 /* 799 /*
707 * If lookup completes between here and out:, don't 800 * If lookup completes between here and out:, don't
708 * want to send packet twice. 801 * want to send packet twice.
@@ -761,9 +854,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
761 854
762 spin_unlock_irqrestore(&priv->lock, flags); 855 spin_unlock_irqrestore(&priv->lock, flags);
763 856
764 /* seperate between the wait to the leave*/ 857 /*
858 * make sure the in-flight joins have finished before we attempt
859 * to leave
860 */
765 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) 861 list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
766 if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) 862 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
767 wait_for_completion(&mcast->done); 863 wait_for_completion(&mcast->done);
768 864
769 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { 865 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
@@ -794,20 +890,14 @@ void ipoib_mcast_restart_task(struct work_struct *work)
794 unsigned long flags; 890 unsigned long flags;
795 struct ib_sa_mcmember_rec rec; 891 struct ib_sa_mcmember_rec rec;
796 892
797 ipoib_dbg_mcast(priv, "restarting multicast task\n"); 893 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
894 /*
895 * shortcut...on shutdown flush is called next, just
896 * let it do all the work
897 */
898 return;
798 899
799 /* 900 ipoib_dbg_mcast(priv, "restarting multicast task\n");
800 * We're running on the priv->wq right now, so we can't call
801 * mcast_stop_thread as it wants to flush the wq and that
802 * will deadlock. We don't actually *need* to stop the
803 * thread here anyway, so just clear the run flag, cancel
804 * any delayed work, do our work, remove the old entries,
805 * then restart the thread.
806 */
807 mutex_lock(&mcast_mutex);
808 clear_bit(IPOIB_MCAST_RUN, &priv->flags);
809 cancel_delayed_work(&priv->mcast_task);
810 mutex_unlock(&mcast_mutex);
811 901
812 local_irq_save(flags); 902 local_irq_save(flags);
813 netif_addr_lock(dev); 903 netif_addr_lock(dev);
@@ -893,14 +983,27 @@ void ipoib_mcast_restart_task(struct work_struct *work)
893 netif_addr_unlock(dev); 983 netif_addr_unlock(dev);
894 local_irq_restore(flags); 984 local_irq_restore(flags);
895 985
896 /* We have to cancel outside of the spinlock */ 986 /*
987 * make sure the in-flight joins have finished before we attempt
988 * to leave
989 */
990 list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
991 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
992 wait_for_completion(&mcast->done);
993
897 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { 994 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
898 ipoib_mcast_leave(mcast->dev, mcast); 995 ipoib_mcast_leave(mcast->dev, mcast);
899 ipoib_mcast_free(mcast); 996 ipoib_mcast_free(mcast);
900 } 997 }
901 998
902 if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) 999 /*
903 ipoib_mcast_start_thread(dev); 1000 * Double check that we are still up
1001 */
1002 if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
1003 spin_lock_irqsave(&priv->lock, flags);
1004 __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
1005 spin_unlock_irqrestore(&priv->lock, flags);
1006 }
904} 1007}
905 1008
906#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 1009#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG