aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSean Hefty <sean.hefty@intel.com>2007-02-15 20:00:17 -0500
committerRoland Dreier <rolandd@cisco.com>2007-02-16 17:20:02 -0500
commitfaec2f7b96b555055d0aa6cc6b83a537270bed52 (patch)
tree0916cb780a1a5d5fe2ac98364917c79f25d57fcf
parent8a2e65f87c66ab1e720f49378750cdd800f9e9cf (diff)
IB/sa: Track multicast join/leave requests
The IB SA tracks multicast join/leave requests on a per port basis and does not do any reference counting: if two users of the same port join the same group, and one leaves that group, then the SA will remove the port from the group even though there is one user who wants to stay a member left. Therefore, in order to support multiple users of the same multicast group from the same port, we need to perform reference counting locally. To do this, add a multicast submodule to ib_sa to perform reference counting of multicast join/leave operations. Modify ib_ipoib (the only in-kernel user of multicast) to use the new interface. Signed-off-by: Roland Dreier <rolandd@cisco.com>
-rw-r--r--drivers/infiniband/core/Makefile2
-rw-r--r--drivers/infiniband/core/multicast.c837
-rw-r--r--drivers/infiniband/core/sa.h66
-rw-r--r--drivers/infiniband/core/sa_query.c29
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c195
-rw-r--r--include/rdma/ib_addr.h6
-rw-r--r--include/rdma/ib_sa.h159
7 files changed, 1065 insertions, 229 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 50fb1cd447b7..189e5d4b9b17 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
12 12
13ib_mad-y := mad.o smi.o agent.o mad_rmpp.o 13ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
14 14
15ib_sa-y := sa_query.o 15ib_sa-y := sa_query.o multicast.o
16 16
17ib_cm-y := cm.o 17ib_cm-y := cm.o
18 18
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
new file mode 100644
index 000000000000..4a579b3a1c90
--- /dev/null
+++ b/drivers/infiniband/core/multicast.c
@@ -0,0 +1,837 @@
1/*
2 * Copyright (c) 2006 Intel Corporation.  All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <linux/completion.h>
34#include <linux/dma-mapping.h>
35#include <linux/err.h>
36#include <linux/interrupt.h>
37#include <linux/pci.h>
38#include <linux/bitops.h>
39#include <linux/random.h>
40
41#include <rdma/ib_cache.h>
42#include "sa.h"
43
44static void mcast_add_one(struct ib_device *device);
45static void mcast_remove_one(struct ib_device *device);
46
47static struct ib_client mcast_client = {
48 .name = "ib_multicast",
49 .add = mcast_add_one,
50 .remove = mcast_remove_one
51};
52
53static struct ib_sa_client sa_client;
54static struct workqueue_struct *mcast_wq;
55static union ib_gid mgid0;
56
57struct mcast_device;
58
59struct mcast_port {
60 struct mcast_device *dev;
61 spinlock_t lock;
62 struct rb_root table;
63 atomic_t refcount;
64 struct completion comp;
65 u8 port_num;
66};
67
68struct mcast_device {
69 struct ib_device *device;
70 struct ib_event_handler event_handler;
71 int start_port;
72 int end_port;
73 struct mcast_port port[0];
74};
75
/* States used for both groups (mcast_group) and members (mcast_member). */
enum mcast_state {
	MCAST_IDLE,	/* group: no work queued or outstanding */
	MCAST_JOINING,	/* member: initial state set by ib_sa_join_multicast() */
	MCAST_MEMBER,	/* member: set by join_group() once it counts as joined */
	MCAST_BUSY,	/* group: work has been queued or is being processed */
	MCAST_ERROR	/* set after a port event; see process_group_error() */
};
83
84struct mcast_member;
85
86struct mcast_group {
87 struct ib_sa_mcmember_rec rec;
88 struct rb_node node;
89 struct mcast_port *port;
90 spinlock_t lock;
91 struct work_struct work;
92 struct list_head pending_list;
93 struct list_head active_list;
94 struct mcast_member *last_join;
95 int members[3];
96 atomic_t refcount;
97 enum mcast_state state;
98 struct ib_sa_query *query;
99 int query_id;
100};
101
102struct mcast_member {
103 struct ib_sa_multicast multicast;
104 struct ib_sa_client *client;
105 struct mcast_group *group;
106 struct list_head list;
107 enum mcast_state state;
108 atomic_t refcount;
109 struct completion comp;
110};
111
112static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
113 void *context);
114static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
115 void *context);
116
117static struct mcast_group *mcast_find(struct mcast_port *port,
118 union ib_gid *mgid)
119{
120 struct rb_node *node = port->table.rb_node;
121 struct mcast_group *group;
122 int ret;
123
124 while (node) {
125 group = rb_entry(node, struct mcast_group, node);
126 ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
127 if (!ret)
128 return group;
129
130 if (ret < 0)
131 node = node->rb_left;
132 else
133 node = node->rb_right;
134 }
135 return NULL;
136}
137
138static struct mcast_group *mcast_insert(struct mcast_port *port,
139 struct mcast_group *group,
140 int allow_duplicates)
141{
142 struct rb_node **link = &port->table.rb_node;
143 struct rb_node *parent = NULL;
144 struct mcast_group *cur_group;
145 int ret;
146
147 while (*link) {
148 parent = *link;
149 cur_group = rb_entry(parent, struct mcast_group, node);
150
151 ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
152 sizeof group->rec.mgid);
153 if (ret < 0)
154 link = &(*link)->rb_left;
155 else if (ret > 0)
156 link = &(*link)->rb_right;
157 else if (allow_duplicates)
158 link = &(*link)->rb_left;
159 else
160 return cur_group;
161 }
162 rb_link_node(&group->node, parent, link);
163 rb_insert_color(&group->node, &port->table);
164 return NULL;
165}
166
167static void deref_port(struct mcast_port *port)
168{
169 if (atomic_dec_and_test(&port->refcount))
170 complete(&port->comp);
171}
172
173static void release_group(struct mcast_group *group)
174{
175 struct mcast_port *port = group->port;
176 unsigned long flags;
177
178 spin_lock_irqsave(&port->lock, flags);
179 if (atomic_dec_and_test(&group->refcount)) {
180 rb_erase(&group->node, &port->table);
181 spin_unlock_irqrestore(&port->lock, flags);
182 kfree(group);
183 deref_port(port);
184 } else
185 spin_unlock_irqrestore(&port->lock, flags);
186}
187
188static void deref_member(struct mcast_member *member)
189{
190 if (atomic_dec_and_test(&member->refcount))
191 complete(&member->comp);
192}
193
194static void queue_join(struct mcast_member *member)
195{
196 struct mcast_group *group = member->group;
197 unsigned long flags;
198
199 spin_lock_irqsave(&group->lock, flags);
200 list_add(&member->list, &group->pending_list);
201 if (group->state == MCAST_IDLE) {
202 group->state = MCAST_BUSY;
203 atomic_inc(&group->refcount);
204 queue_work(mcast_wq, &group->work);
205 }
206 spin_unlock_irqrestore(&group->lock, flags);
207}
208
209/*
210 * A multicast group has three types of members: full member, non member, and
211 * send only member. We need to keep track of the number of members of each
212 * type based on their join state. Adjust the number of members the belong to
213 * the specified join states.
214 */
215static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
216{
217 int i;
218
219 for (i = 0; i < 3; i++, join_state >>= 1)
220 if (join_state & 0x1)
221 group->members[i] += inc;
222}
223
224/*
225 * If a multicast group has zero members left for a particular join state, but
226 * the group is still a member with the SA, we need to leave that join state.
227 * Determine which join states we still belong to, but that do not have any
228 * active members.
229 */
230static u8 get_leave_state(struct mcast_group *group)
231{
232 u8 leave_state = 0;
233 int i;
234
235 for (i = 0; i < 3; i++)
236 if (!group->members[i])
237 leave_state |= (0x1 << i);
238
239 return leave_state & group->rec.join_state;
240}
241
242static int check_selector(ib_sa_comp_mask comp_mask,
243 ib_sa_comp_mask selector_mask,
244 ib_sa_comp_mask value_mask,
245 u8 selector, u8 src_value, u8 dst_value)
246{
247 int err;
248
249 if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
250 return 0;
251
252 switch (selector) {
253 case IB_SA_GT:
254 err = (src_value <= dst_value);
255 break;
256 case IB_SA_LT:
257 err = (src_value >= dst_value);
258 break;
259 case IB_SA_EQ:
260 err = (src_value != dst_value);
261 break;
262 default:
263 err = 0;
264 break;
265 }
266
267 return err;
268}
269
270static int cmp_rec(struct ib_sa_mcmember_rec *src,
271 struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
272{
273 /* MGID must already match */
274
275 if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
276 memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
277 return -EINVAL;
278 if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
279 return -EINVAL;
280 if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
281 return -EINVAL;
282 if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
283 IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
284 src->mtu, dst->mtu))
285 return -EINVAL;
286 if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
287 src->traffic_class != dst->traffic_class)
288 return -EINVAL;
289 if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
290 return -EINVAL;
291 if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
292 IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
293 src->rate, dst->rate))
294 return -EINVAL;
295 if (check_selector(comp_mask,
296 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
297 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
298 dst->packet_life_time_selector,
299 src->packet_life_time, dst->packet_life_time))
300 return -EINVAL;
301 if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
302 return -EINVAL;
303 if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
304 src->flow_label != dst->flow_label)
305 return -EINVAL;
306 if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
307 src->hop_limit != dst->hop_limit)
308 return -EINVAL;
309 if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
310 return -EINVAL;
311
312 /* join_state checked separately, proxy_join ignored */
313
314 return 0;
315}
316
317static int send_join(struct mcast_group *group, struct mcast_member *member)
318{
319 struct mcast_port *port = group->port;
320 int ret;
321
322 group->last_join = member;
323 ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
324 port->port_num, IB_MGMT_METHOD_SET,
325 &member->multicast.rec,
326 member->multicast.comp_mask,
327 3000, GFP_KERNEL, join_handler, group,
328 &group->query);
329 if (ret >= 0) {
330 group->query_id = ret;
331 ret = 0;
332 }
333 return ret;
334}
335
336static int send_leave(struct mcast_group *group, u8 leave_state)
337{
338 struct mcast_port *port = group->port;
339 struct ib_sa_mcmember_rec rec;
340 int ret;
341
342 rec = group->rec;
343 rec.join_state = leave_state;
344
345 ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
346 port->port_num, IB_SA_METHOD_DELETE, &rec,
347 IB_SA_MCMEMBER_REC_MGID |
348 IB_SA_MCMEMBER_REC_PORT_GID |
349 IB_SA_MCMEMBER_REC_JOIN_STATE,
350 3000, GFP_KERNEL, leave_handler,
351 group, &group->query);
352 if (ret >= 0) {
353 group->query_id = ret;
354 ret = 0;
355 }
356 return ret;
357}
358
359static void join_group(struct mcast_group *group, struct mcast_member *member,
360 u8 join_state)
361{
362 member->state = MCAST_MEMBER;
363 adjust_membership(group, join_state, 1);
364 group->rec.join_state |= join_state;
365 member->multicast.rec = group->rec;
366 member->multicast.rec.join_state = join_state;
367 list_move(&member->list, &group->active_list);
368}
369
370static int fail_join(struct mcast_group *group, struct mcast_member *member,
371 int status)
372{
373 spin_lock_irq(&group->lock);
374 list_del_init(&member->list);
375 spin_unlock_irq(&group->lock);
376 return member->multicast.callback(status, &member->multicast);
377}
378
379static void process_group_error(struct mcast_group *group)
380{
381 struct mcast_member *member;
382 int ret;
383
384 spin_lock_irq(&group->lock);
385 while (!list_empty(&group->active_list)) {
386 member = list_entry(group->active_list.next,
387 struct mcast_member, list);
388 atomic_inc(&member->refcount);
389 list_del_init(&member->list);
390 adjust_membership(group, member->multicast.rec.join_state, -1);
391 member->state = MCAST_ERROR;
392 spin_unlock_irq(&group->lock);
393
394 ret = member->multicast.callback(-ENETRESET,
395 &member->multicast);
396 deref_member(member);
397 if (ret)
398 ib_sa_free_multicast(&member->multicast);
399 spin_lock_irq(&group->lock);
400 }
401
402 group->rec.join_state = 0;
403 group->state = MCAST_BUSY;
404 spin_unlock_irq(&group->lock);
405}
406
407static void mcast_work_handler(struct work_struct *work)
408{
409 struct mcast_group *group;
410 struct mcast_member *member;
411 struct ib_sa_multicast *multicast;
412 int status, ret;
413 u8 join_state;
414
415 group = container_of(work, typeof(*group), work);
416retest:
417 spin_lock_irq(&group->lock);
418 while (!list_empty(&group->pending_list) ||
419 (group->state == MCAST_ERROR)) {
420
421 if (group->state == MCAST_ERROR) {
422 spin_unlock_irq(&group->lock);
423 process_group_error(group);
424 goto retest;
425 }
426
427 member = list_entry(group->pending_list.next,
428 struct mcast_member, list);
429 multicast = &member->multicast;
430 join_state = multicast->rec.join_state;
431 atomic_inc(&member->refcount);
432
433 if (join_state == (group->rec.join_state & join_state)) {
434 status = cmp_rec(&group->rec, &multicast->rec,
435 multicast->comp_mask);
436 if (!status)
437 join_group(group, member, join_state);
438 else
439 list_del_init(&member->list);
440 spin_unlock_irq(&group->lock);
441 ret = multicast->callback(status, multicast);
442 } else {
443 spin_unlock_irq(&group->lock);
444 status = send_join(group, member);
445 if (!status) {
446 deref_member(member);
447 return;
448 }
449 ret = fail_join(group, member, status);
450 }
451
452 deref_member(member);
453 if (ret)
454 ib_sa_free_multicast(&member->multicast);
455 spin_lock_irq(&group->lock);
456 }
457
458 join_state = get_leave_state(group);
459 if (join_state) {
460 group->rec.join_state &= ~join_state;
461 spin_unlock_irq(&group->lock);
462 if (send_leave(group, join_state))
463 goto retest;
464 } else {
465 group->state = MCAST_IDLE;
466 spin_unlock_irq(&group->lock);
467 release_group(group);
468 }
469}
470
471/*
472 * Fail a join request if it is still active - at the head of the pending queue.
473 */
474static void process_join_error(struct mcast_group *group, int status)
475{
476 struct mcast_member *member;
477 int ret;
478
479 spin_lock_irq(&group->lock);
480 member = list_entry(group->pending_list.next,
481 struct mcast_member, list);
482 if (group->last_join == member) {
483 atomic_inc(&member->refcount);
484 list_del_init(&member->list);
485 spin_unlock_irq(&group->lock);
486 ret = member->multicast.callback(status, &member->multicast);
487 deref_member(member);
488 if (ret)
489 ib_sa_free_multicast(&member->multicast);
490 } else
491 spin_unlock_irq(&group->lock);
492}
493
494static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
495 void *context)
496{
497 struct mcast_group *group = context;
498
499 if (status)
500 process_join_error(group, status);
501 else {
502 spin_lock_irq(&group->port->lock);
503 group->rec = *rec;
504 if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
505 rb_erase(&group->node, &group->port->table);
506 mcast_insert(group->port, group, 1);
507 }
508 spin_unlock_irq(&group->port->lock);
509 }
510 mcast_work_handler(&group->work);
511}
512
513static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
514 void *context)
515{
516 struct mcast_group *group = context;
517
518 mcast_work_handler(&group->work);
519}
520
521static struct mcast_group *acquire_group(struct mcast_port *port,
522 union ib_gid *mgid, gfp_t gfp_mask)
523{
524 struct mcast_group *group, *cur_group;
525 unsigned long flags;
526 int is_mgid0;
527
528 is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
529 if (!is_mgid0) {
530 spin_lock_irqsave(&port->lock, flags);
531 group = mcast_find(port, mgid);
532 if (group)
533 goto found;
534 spin_unlock_irqrestore(&port->lock, flags);
535 }
536
537 group = kzalloc(sizeof *group, gfp_mask);
538 if (!group)
539 return NULL;
540
541 group->port = port;
542 group->rec.mgid = *mgid;
543 INIT_LIST_HEAD(&group->pending_list);
544 INIT_LIST_HEAD(&group->active_list);
545 INIT_WORK(&group->work, mcast_work_handler);
546 spin_lock_init(&group->lock);
547
548 spin_lock_irqsave(&port->lock, flags);
549 cur_group = mcast_insert(port, group, is_mgid0);
550 if (cur_group) {
551 kfree(group);
552 group = cur_group;
553 } else
554 atomic_inc(&port->refcount);
555found:
556 atomic_inc(&group->refcount);
557 spin_unlock_irqrestore(&port->lock, flags);
558 return group;
559}
560
561/*
562 * We serialize all join requests to a single group to make our lives much
563 * easier. Otherwise, two users could try to join the same group
564 * simultaneously, with different configurations, one could leave while the
565 * join is in progress, etc., which makes locking around error recovery
566 * difficult.
567 */
568struct ib_sa_multicast *
569ib_sa_join_multicast(struct ib_sa_client *client,
570 struct ib_device *device, u8 port_num,
571 struct ib_sa_mcmember_rec *rec,
572 ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
573 int (*callback)(int status,
574 struct ib_sa_multicast *multicast),
575 void *context)
576{
577 struct mcast_device *dev;
578 struct mcast_member *member;
579 struct ib_sa_multicast *multicast;
580 int ret;
581
582 dev = ib_get_client_data(device, &mcast_client);
583 if (!dev)
584 return ERR_PTR(-ENODEV);
585
586 member = kmalloc(sizeof *member, gfp_mask);
587 if (!member)
588 return ERR_PTR(-ENOMEM);
589
590 ib_sa_client_get(client);
591 member->client = client;
592 member->multicast.rec = *rec;
593 member->multicast.comp_mask = comp_mask;
594 member->multicast.callback = callback;
595 member->multicast.context = context;
596 init_completion(&member->comp);
597 atomic_set(&member->refcount, 1);
598 member->state = MCAST_JOINING;
599
600 member->group = acquire_group(&dev->port[port_num - dev->start_port],
601 &rec->mgid, gfp_mask);
602 if (!member->group) {
603 ret = -ENOMEM;
604 goto err;
605 }
606
607 /*
608 * The user will get the multicast structure in their callback. They
609 * could then free the multicast structure before we can return from
610 * this routine. So we save the pointer to return before queuing
611 * any callback.
612 */
613 multicast = &member->multicast;
614 queue_join(member);
615 return multicast;
616
617err:
618 ib_sa_client_put(client);
619 kfree(member);
620 return ERR_PTR(ret);
621}
622EXPORT_SYMBOL(ib_sa_join_multicast);
623
624void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
625{
626 struct mcast_member *member;
627 struct mcast_group *group;
628
629 member = container_of(multicast, struct mcast_member, multicast);
630 group = member->group;
631
632 spin_lock_irq(&group->lock);
633 if (member->state == MCAST_MEMBER)
634 adjust_membership(group, multicast->rec.join_state, -1);
635
636 list_del_init(&member->list);
637
638 if (group->state == MCAST_IDLE) {
639 group->state = MCAST_BUSY;
640 spin_unlock_irq(&group->lock);
641 /* Continue to hold reference on group until callback */
642 queue_work(mcast_wq, &group->work);
643 } else {
644 spin_unlock_irq(&group->lock);
645 release_group(group);
646 }
647
648 deref_member(member);
649 wait_for_completion(&member->comp);
650 ib_sa_client_put(member->client);
651 kfree(member);
652}
653EXPORT_SYMBOL(ib_sa_free_multicast);
654
655int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
656 union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
657{
658 struct mcast_device *dev;
659 struct mcast_port *port;
660 struct mcast_group *group;
661 unsigned long flags;
662 int ret = 0;
663
664 dev = ib_get_client_data(device, &mcast_client);
665 if (!dev)
666 return -ENODEV;
667
668 port = &dev->port[port_num - dev->start_port];
669 spin_lock_irqsave(&port->lock, flags);
670 group = mcast_find(port, mgid);
671 if (group)
672 *rec = group->rec;
673 else
674 ret = -EADDRNOTAVAIL;
675 spin_unlock_irqrestore(&port->lock, flags);
676
677 return ret;
678}
679EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
680
681int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
682 struct ib_sa_mcmember_rec *rec,
683 struct ib_ah_attr *ah_attr)
684{
685 int ret;
686 u16 gid_index;
687 u8 p;
688
689 ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
690 if (ret)
691 return ret;
692
693 memset(ah_attr, 0, sizeof *ah_attr);
694 ah_attr->dlid = be16_to_cpu(rec->mlid);
695 ah_attr->sl = rec->sl;
696 ah_attr->port_num = port_num;
697 ah_attr->static_rate = rec->rate;
698
699 ah_attr->ah_flags = IB_AH_GRH;
700 ah_attr->grh.dgid = rec->mgid;
701
702 ah_attr->grh.sgid_index = (u8) gid_index;
703 ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
704 ah_attr->grh.hop_limit = rec->hop_limit;
705 ah_attr->grh.traffic_class = rec->traffic_class;
706
707 return 0;
708}
709EXPORT_SYMBOL(ib_init_ah_from_mcmember);
710
711static void mcast_groups_lost(struct mcast_port *port)
712{
713 struct mcast_group *group;
714 struct rb_node *node;
715 unsigned long flags;
716
717 spin_lock_irqsave(&port->lock, flags);
718 for (node = rb_first(&port->table); node; node = rb_next(node)) {
719 group = rb_entry(node, struct mcast_group, node);
720 spin_lock(&group->lock);
721 if (group->state == MCAST_IDLE) {
722 atomic_inc(&group->refcount);
723 queue_work(mcast_wq, &group->work);
724 }
725 group->state = MCAST_ERROR;
726 spin_unlock(&group->lock);
727 }
728 spin_unlock_irqrestore(&port->lock, flags);
729}
730
731static void mcast_event_handler(struct ib_event_handler *handler,
732 struct ib_event *event)
733{
734 struct mcast_device *dev;
735
736 dev = container_of(handler, struct mcast_device, event_handler);
737
738 switch (event->event) {
739 case IB_EVENT_PORT_ERR:
740 case IB_EVENT_LID_CHANGE:
741 case IB_EVENT_SM_CHANGE:
742 case IB_EVENT_CLIENT_REREGISTER:
743 mcast_groups_lost(&dev->port[event->element.port_num -
744 dev->start_port]);
745 break;
746 default:
747 break;
748 }
749}
750
751static void mcast_add_one(struct ib_device *device)
752{
753 struct mcast_device *dev;
754 struct mcast_port *port;
755 int i;
756
757 if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
758 return;
759
760 dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
761 GFP_KERNEL);
762 if (!dev)
763 return;
764
765 if (device->node_type == RDMA_NODE_IB_SWITCH)
766 dev->start_port = dev->end_port = 0;
767 else {
768 dev->start_port = 1;
769 dev->end_port = device->phys_port_cnt;
770 }
771
772 for (i = 0; i <= dev->end_port - dev->start_port; i++) {
773 port = &dev->port[i];
774 port->dev = dev;
775 port->port_num = dev->start_port + i;
776 spin_lock_init(&port->lock);
777 port->table = RB_ROOT;
778 init_completion(&port->comp);
779 atomic_set(&port->refcount, 1);
780 }
781
782 dev->device = device;
783 ib_set_client_data(device, &mcast_client, dev);
784
785 INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
786 ib_register_event_handler(&dev->event_handler);
787}
788
789static void mcast_remove_one(struct ib_device *device)
790{
791 struct mcast_device *dev;
792 struct mcast_port *port;
793 int i;
794
795 dev = ib_get_client_data(device, &mcast_client);
796 if (!dev)
797 return;
798
799 ib_unregister_event_handler(&dev->event_handler);
800 flush_workqueue(mcast_wq);
801
802 for (i = 0; i <= dev->end_port - dev->start_port; i++) {
803 port = &dev->port[i];
804 deref_port(port);
805 wait_for_completion(&port->comp);
806 }
807
808 kfree(dev);
809}
810
811int mcast_init(void)
812{
813 int ret;
814
815 mcast_wq = create_singlethread_workqueue("ib_mcast");
816 if (!mcast_wq)
817 return -ENOMEM;
818
819 ib_sa_register_client(&sa_client);
820
821 ret = ib_register_client(&mcast_client);
822 if (ret)
823 goto err;
824 return 0;
825
826err:
827 ib_sa_unregister_client(&sa_client);
828 destroy_workqueue(mcast_wq);
829 return ret;
830}
831
832void mcast_cleanup(void)
833{
834 ib_unregister_client(&mcast_client);
835 ib_sa_unregister_client(&sa_client);
836 destroy_workqueue(mcast_wq);
837}
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
new file mode 100644
index 000000000000..24c93fd320fb
--- /dev/null
+++ b/drivers/infiniband/core/sa.h
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2004 Topspin Communications. All rights reserved.
3 * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
4 * Copyright (c) 2006 Intel Corporation. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
14 * conditions are met:
15 *
16 * - Redistributions of source code must retain the above
17 * copyright notice, this list of conditions and the following
18 * disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials
23 * provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 */
34
35#ifndef SA_H
36#define SA_H
37
38#include <rdma/ib_sa.h>
39
40static inline void ib_sa_client_get(struct ib_sa_client *client)
41{
42 atomic_inc(&client->users);
43}
44
45static inline void ib_sa_client_put(struct ib_sa_client *client)
46{
47 if (atomic_dec_and_test(&client->users))
48 complete(&client->comp);
49}
50
51int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
52 struct ib_device *device, u8 port_num,
53 u8 method,
54 struct ib_sa_mcmember_rec *rec,
55 ib_sa_comp_mask comp_mask,
56 int timeout_ms, gfp_t gfp_mask,
57 void (*callback)(int status,
58 struct ib_sa_mcmember_rec *resp,
59 void *context),
60 void *context,
61 struct ib_sa_query **sa_query);
62
63int mcast_init(void);
64void mcast_cleanup(void);
65
66#endif /* SA_H */
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index e45afba75341..d7d4a5309ba9 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -47,8 +47,8 @@
47#include <linux/workqueue.h> 47#include <linux/workqueue.h>
48 48
49#include <rdma/ib_pack.h> 49#include <rdma/ib_pack.h>
50#include <rdma/ib_sa.h>
51#include <rdma/ib_cache.h> 50#include <rdma/ib_cache.h>
51#include "sa.h"
52 52
53MODULE_AUTHOR("Roland Dreier"); 53MODULE_AUTHOR("Roland Dreier");
54MODULE_DESCRIPTION("InfiniBand subnet administration query support"); 54MODULE_DESCRIPTION("InfiniBand subnet administration query support");
@@ -425,17 +425,6 @@ void ib_sa_register_client(struct ib_sa_client *client)
425} 425}
426EXPORT_SYMBOL(ib_sa_register_client); 426EXPORT_SYMBOL(ib_sa_register_client);
427 427
428static inline void ib_sa_client_get(struct ib_sa_client *client)
429{
430 atomic_inc(&client->users);
431}
432
433static inline void ib_sa_client_put(struct ib_sa_client *client)
434{
435 if (atomic_dec_and_test(&client->users))
436 complete(&client->comp);
437}
438
439void ib_sa_unregister_client(struct ib_sa_client *client) 428void ib_sa_unregister_client(struct ib_sa_client *client)
440{ 429{
441 ib_sa_client_put(client); 430 ib_sa_client_put(client);
@@ -901,7 +890,6 @@ err1:
901 kfree(query); 890 kfree(query);
902 return ret; 891 return ret;
903} 892}
904EXPORT_SYMBOL(ib_sa_mcmember_rec_query);
905 893
906static void send_handler(struct ib_mad_agent *agent, 894static void send_handler(struct ib_mad_agent *agent,
907 struct ib_mad_send_wc *mad_send_wc) 895 struct ib_mad_send_wc *mad_send_wc)
@@ -1053,14 +1041,27 @@ static int __init ib_sa_init(void)
1053 get_random_bytes(&tid, sizeof tid); 1041 get_random_bytes(&tid, sizeof tid);
1054 1042
1055 ret = ib_register_client(&sa_client); 1043 ret = ib_register_client(&sa_client);
1056 if (ret) 1044 if (ret) {
1057 printk(KERN_ERR "Couldn't register ib_sa client\n"); 1045 printk(KERN_ERR "Couldn't register ib_sa client\n");
1046 goto err1;
1047 }
1048
1049 ret = mcast_init();
1050 if (ret) {
1051 printk(KERN_ERR "Couldn't initialize multicast handling\n");
1052 goto err2;
1053 }
1058 1054
1055 return 0;
1056err2:
1057 ib_unregister_client(&sa_client);
1058err1:
1059 return ret; 1059 return ret;
1060} 1060}
1061 1061
1062static void __exit ib_sa_cleanup(void) 1062static void __exit ib_sa_cleanup(void)
1063{ 1063{
1064 mcast_cleanup();
1064 ib_unregister_client(&sa_client); 1065 ib_unregister_client(&sa_client);
1065 idr_destroy(&query_idr); 1066 idr_destroy(&query_idr);
1066} 1067}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index fea737f520fd..b303ce6bc21e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -60,14 +60,11 @@ static DEFINE_MUTEX(mcast_mutex);
60/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ 60/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
61struct ipoib_mcast { 61struct ipoib_mcast {
62 struct ib_sa_mcmember_rec mcmember; 62 struct ib_sa_mcmember_rec mcmember;
63 struct ib_sa_multicast *mc;
63 struct ipoib_ah *ah; 64 struct ipoib_ah *ah;
64 65
65 struct rb_node rb_node; 66 struct rb_node rb_node;
66 struct list_head list; 67 struct list_head list;
67 struct completion done;
68
69 int query_id;
70 struct ib_sa_query *query;
71 68
72 unsigned long created; 69 unsigned long created;
73 unsigned long backoff; 70 unsigned long backoff;
@@ -299,18 +296,22 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
299 return 0; 296 return 0;
300} 297}
301 298
302static void 299static int
303ipoib_mcast_sendonly_join_complete(int status, 300ipoib_mcast_sendonly_join_complete(int status,
304 struct ib_sa_mcmember_rec *mcmember, 301 struct ib_sa_multicast *multicast)
305 void *mcast_ptr)
306{ 302{
307 struct ipoib_mcast *mcast = mcast_ptr; 303 struct ipoib_mcast *mcast = multicast->context;
308 struct net_device *dev = mcast->dev; 304 struct net_device *dev = mcast->dev;
309 struct ipoib_dev_priv *priv = netdev_priv(dev); 305 struct ipoib_dev_priv *priv = netdev_priv(dev);
310 306
307 /* We trap for port events ourselves. */
308 if (status == -ENETRESET)
309 return 0;
310
311 if (!status) 311 if (!status)
312 ipoib_mcast_join_finish(mcast, mcmember); 312 status = ipoib_mcast_join_finish(mcast, &multicast->rec);
313 else { 313
314 if (status) {
314 if (mcast->logcount++ < 20) 315 if (mcast->logcount++ < 20)
315 ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for " 316 ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
316 IPOIB_GID_FMT ", status %d\n", 317 IPOIB_GID_FMT ", status %d\n",
@@ -325,11 +326,10 @@ ipoib_mcast_sendonly_join_complete(int status,
325 spin_unlock_irq(&priv->tx_lock); 326 spin_unlock_irq(&priv->tx_lock);
326 327
327 /* Clear the busy flag so we try again */ 328 /* Clear the busy flag so we try again */
328 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 329 status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
329 mcast->query = NULL; 330 &mcast->flags);
330 } 331 }
331 332 return status;
332 complete(&mcast->done);
333} 333}
334 334
335static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) 335static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
@@ -359,35 +359,33 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
359 rec.port_gid = priv->local_gid; 359 rec.port_gid = priv->local_gid;
360 rec.pkey = cpu_to_be16(priv->pkey); 360 rec.pkey = cpu_to_be16(priv->pkey);
361 361
362 init_completion(&mcast->done); 362 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
363 363 priv->port, &rec,
364 ret = ib_sa_mcmember_rec_set(&ipoib_sa_client, priv->ca, priv->port, &rec, 364 IB_SA_MCMEMBER_REC_MGID |
365 IB_SA_MCMEMBER_REC_MGID | 365 IB_SA_MCMEMBER_REC_PORT_GID |
366 IB_SA_MCMEMBER_REC_PORT_GID | 366 IB_SA_MCMEMBER_REC_PKEY |
367 IB_SA_MCMEMBER_REC_PKEY | 367 IB_SA_MCMEMBER_REC_JOIN_STATE,
368 IB_SA_MCMEMBER_REC_JOIN_STATE, 368 GFP_ATOMIC,
369 1000, GFP_ATOMIC, 369 ipoib_mcast_sendonly_join_complete,
370 ipoib_mcast_sendonly_join_complete, 370 mcast);
371 mcast, &mcast->query); 371 if (IS_ERR(mcast->mc)) {
372 if (ret < 0) { 372 ret = PTR_ERR(mcast->mc);
373 ipoib_warn(priv, "ib_sa_mcmember_rec_set failed (ret = %d)\n", 373 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
374 ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
374 ret); 375 ret);
375 } else { 376 } else {
376 ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT 377 ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT
377 ", starting join\n", 378 ", starting join\n",
378 IPOIB_GID_ARG(mcast->mcmember.mgid)); 379 IPOIB_GID_ARG(mcast->mcmember.mgid));
379
380 mcast->query_id = ret;
381 } 380 }
382 381
383 return ret; 382 return ret;
384} 383}
385 384
386static void ipoib_mcast_join_complete(int status, 385static int ipoib_mcast_join_complete(int status,
387 struct ib_sa_mcmember_rec *mcmember, 386 struct ib_sa_multicast *multicast)
388 void *mcast_ptr)
389{ 387{
390 struct ipoib_mcast *mcast = mcast_ptr; 388 struct ipoib_mcast *mcast = multicast->context;
391 struct net_device *dev = mcast->dev; 389 struct net_device *dev = mcast->dev;
392 struct ipoib_dev_priv *priv = netdev_priv(dev); 390 struct ipoib_dev_priv *priv = netdev_priv(dev);
393 391
@@ -395,24 +393,25 @@ static void ipoib_mcast_join_complete(int status,
395 " (status %d)\n", 393 " (status %d)\n",
396 IPOIB_GID_ARG(mcast->mcmember.mgid), status); 394 IPOIB_GID_ARG(mcast->mcmember.mgid), status);
397 395
398 if (!status && !ipoib_mcast_join_finish(mcast, mcmember)) { 396 /* We trap for port events ourselves. */
397 if (status == -ENETRESET)
398 return 0;
399
400 if (!status)
401 status = ipoib_mcast_join_finish(mcast, &multicast->rec);
402
403 if (!status) {
399 mcast->backoff = 1; 404 mcast->backoff = 1;
400 mutex_lock(&mcast_mutex); 405 mutex_lock(&mcast_mutex);
401 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) 406 if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
402 queue_delayed_work(ipoib_workqueue, 407 queue_delayed_work(ipoib_workqueue,
403 &priv->mcast_task, 0); 408 &priv->mcast_task, 0);
404 mutex_unlock(&mcast_mutex); 409 mutex_unlock(&mcast_mutex);
405 complete(&mcast->done); 410 return 0;
406 return;
407 }
408
409 if (status == -EINTR) {
410 complete(&mcast->done);
411 return;
412 } 411 }
413 412
414 if (status && mcast->logcount++ < 20) { 413 if (mcast->logcount++ < 20) {
415 if (status == -ETIMEDOUT || status == -EINTR) { 414 if (status == -ETIMEDOUT) {
416 ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT 415 ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT
417 ", status %d\n", 416 ", status %d\n",
418 IPOIB_GID_ARG(mcast->mcmember.mgid), 417 IPOIB_GID_ARG(mcast->mcmember.mgid),
@@ -429,24 +428,18 @@ static void ipoib_mcast_join_complete(int status,
429 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) 428 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
430 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; 429 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
431 430
432 mutex_lock(&mcast_mutex); 431 /* Clear the busy flag so we try again */
432 status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
433 433
434 mutex_lock(&mcast_mutex);
434 spin_lock_irq(&priv->lock); 435 spin_lock_irq(&priv->lock);
435 mcast->query = NULL; 436 if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
436 437 queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
437 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) { 438 mcast->backoff * HZ);
438 if (status == -ETIMEDOUT)
439 queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
440 0);
441 else
442 queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
443 mcast->backoff * HZ);
444 } else
445 complete(&mcast->done);
446 spin_unlock_irq(&priv->lock); 439 spin_unlock_irq(&priv->lock);
447 mutex_unlock(&mcast_mutex); 440 mutex_unlock(&mcast_mutex);
448 441
449 return; 442 return status;
450} 443}
451 444
452static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, 445static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
@@ -495,15 +488,14 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
495 rec.hop_limit = priv->broadcast->mcmember.hop_limit; 488 rec.hop_limit = priv->broadcast->mcmember.hop_limit;
496 } 489 }
497 490
498 init_completion(&mcast->done); 491 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
499 492 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
500 ret = ib_sa_mcmember_rec_set(&ipoib_sa_client, priv->ca, priv->port, 493 &rec, comp_mask, GFP_KERNEL,
501 &rec, comp_mask, mcast->backoff * 1000, 494 ipoib_mcast_join_complete, mcast);
502 GFP_ATOMIC, ipoib_mcast_join_complete, 495 if (IS_ERR(mcast->mc)) {
503 mcast, &mcast->query); 496 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
504 497 ret = PTR_ERR(mcast->mc);
505 if (ret < 0) { 498 ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
506 ipoib_warn(priv, "ib_sa_mcmember_rec_set failed, status %d\n", ret);
507 499
508 mcast->backoff *= 2; 500 mcast->backoff *= 2;
509 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) 501 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
@@ -515,8 +507,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
515 &priv->mcast_task, 507 &priv->mcast_task,
516 mcast->backoff * HZ); 508 mcast->backoff * HZ);
517 mutex_unlock(&mcast_mutex); 509 mutex_unlock(&mcast_mutex);
518 } else 510 }
519 mcast->query_id = ret;
520} 511}
521 512
522void ipoib_mcast_join_task(struct work_struct *work) 513void ipoib_mcast_join_task(struct work_struct *work)
@@ -541,7 +532,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
541 priv->local_rate = attr.active_speed * 532 priv->local_rate = attr.active_speed *
542 ib_width_enum_to_int(attr.active_width); 533 ib_width_enum_to_int(attr.active_width);
543 } else 534 } else
544 ipoib_warn(priv, "ib_query_port failed\n"); 535 ipoib_warn(priv, "ib_query_port failed\n");
545 } 536 }
546 537
547 if (!priv->broadcast) { 538 if (!priv->broadcast) {
@@ -568,7 +559,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
568 } 559 }
569 560
570 if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { 561 if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
571 ipoib_mcast_join(dev, priv->broadcast, 0); 562 if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
563 ipoib_mcast_join(dev, priv->broadcast, 0);
572 return; 564 return;
573 } 565 }
574 566
@@ -625,26 +617,9 @@ int ipoib_mcast_start_thread(struct net_device *dev)
625 return 0; 617 return 0;
626} 618}
627 619
628static void wait_for_mcast_join(struct ipoib_dev_priv *priv,
629 struct ipoib_mcast *mcast)
630{
631 spin_lock_irq(&priv->lock);
632 if (mcast && mcast->query) {
633 ib_sa_cancel_query(mcast->query_id, mcast->query);
634 mcast->query = NULL;
635 spin_unlock_irq(&priv->lock);
636 ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
637 IPOIB_GID_ARG(mcast->mcmember.mgid));
638 wait_for_completion(&mcast->done);
639 }
640 else
641 spin_unlock_irq(&priv->lock);
642}
643
644int ipoib_mcast_stop_thread(struct net_device *dev, int flush) 620int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
645{ 621{
646 struct ipoib_dev_priv *priv = netdev_priv(dev); 622 struct ipoib_dev_priv *priv = netdev_priv(dev);
647 struct ipoib_mcast *mcast;
648 623
649 ipoib_dbg_mcast(priv, "stopping multicast thread\n"); 624 ipoib_dbg_mcast(priv, "stopping multicast thread\n");
650 625
@@ -660,52 +635,27 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
660 if (flush) 635 if (flush)
661 flush_workqueue(ipoib_workqueue); 636 flush_workqueue(ipoib_workqueue);
662 637
663 wait_for_mcast_join(priv, priv->broadcast);
664
665 list_for_each_entry(mcast, &priv->multicast_list, list)
666 wait_for_mcast_join(priv, mcast);
667
668 return 0; 638 return 0;
669} 639}
670 640
671static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) 641static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
672{ 642{
673 struct ipoib_dev_priv *priv = netdev_priv(dev); 643 struct ipoib_dev_priv *priv = netdev_priv(dev);
674 struct ib_sa_mcmember_rec rec = {
675 .join_state = 1
676 };
677 int ret = 0; 644 int ret = 0;
678 645
679 if (!test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) 646 if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
680 return 0; 647 ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
681 648 IPOIB_GID_ARG(mcast->mcmember.mgid));
682 ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
683 IPOIB_GID_ARG(mcast->mcmember.mgid));
684
685 rec.mgid = mcast->mcmember.mgid;
686 rec.port_gid = priv->local_gid;
687 rec.pkey = cpu_to_be16(priv->pkey);
688 649
689 /* Remove ourselves from the multicast group */ 650 /* Remove ourselves from the multicast group */
690 ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid), 651 ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
691 &mcast->mcmember.mgid); 652 &mcast->mcmember.mgid);
692 if (ret) 653 if (ret)
693 ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret); 654 ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
655 }
694 656
695 /* 657 if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
696 * Just make one shot at leaving and don't wait for a reply; 658 ib_sa_free_multicast(mcast->mc);
697 * if we fail, too bad.
698 */
699 ret = ib_sa_mcmember_rec_delete(&ipoib_sa_client, priv->ca, priv->port, &rec,
700 IB_SA_MCMEMBER_REC_MGID |
701 IB_SA_MCMEMBER_REC_PORT_GID |
702 IB_SA_MCMEMBER_REC_PKEY |
703 IB_SA_MCMEMBER_REC_JOIN_STATE,
704 0, GFP_ATOMIC, NULL,
705 mcast, &mcast->query);
706 if (ret < 0)
707 ipoib_warn(priv, "ib_sa_mcmember_rec_delete failed "
708 "for leave (result = %d)\n", ret);
709 659
710 return 0; 660 return 0;
711} 661}
@@ -758,7 +708,7 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
758 dev_kfree_skb_any(skb); 708 dev_kfree_skb_any(skb);
759 } 709 }
760 710
761 if (mcast->query) 711 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
762 ipoib_dbg_mcast(priv, "no address vector, " 712 ipoib_dbg_mcast(priv, "no address vector, "
763 "but multicast join already started\n"); 713 "but multicast join already started\n");
764 else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) 714 else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
@@ -916,7 +866,6 @@ void ipoib_mcast_restart_task(struct work_struct *work)
916 866
917 /* We have to cancel outside of the spinlock */ 867 /* We have to cancel outside of the spinlock */
918 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { 868 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
919 wait_for_mcast_join(priv, mcast);
920 ipoib_mcast_leave(mcast->dev, mcast); 869 ipoib_mcast_leave(mcast->dev, mcast);
921 ipoib_mcast_free(mcast); 870 ipoib_mcast_free(mcast);
922 } 871 }
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index c094e5012862..c36750ff6ae8 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -110,6 +110,12 @@ static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey)
110 dev_addr->broadcast[9] = (unsigned char) pkey; 110 dev_addr->broadcast[9] = (unsigned char) pkey;
111} 111}
112 112
113static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr,
114 union ib_gid *gid)
115{
116 memcpy(gid, dev_addr->broadcast + 4, sizeof *gid);
117}
118
113static inline void ib_addr_get_sgid(struct rdma_dev_addr *dev_addr, 119static inline void ib_addr_get_sgid(struct rdma_dev_addr *dev_addr,
114 union ib_gid *gid) 120 union ib_gid *gid)
115{ 121{
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 97715b0c20b6..5e26b2f53f86 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -285,18 +285,6 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
285 void *context, 285 void *context,
286 struct ib_sa_query **query); 286 struct ib_sa_query **query);
287 287
288int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
289 struct ib_device *device, u8 port_num,
290 u8 method,
291 struct ib_sa_mcmember_rec *rec,
292 ib_sa_comp_mask comp_mask,
293 int timeout_ms, gfp_t gfp_mask,
294 void (*callback)(int status,
295 struct ib_sa_mcmember_rec *resp,
296 void *context),
297 void *context,
298 struct ib_sa_query **query);
299
300int ib_sa_service_rec_query(struct ib_sa_client *client, 288int ib_sa_service_rec_query(struct ib_sa_client *client,
301 struct ib_device *device, u8 port_num, 289 struct ib_device *device, u8 port_num,
302 u8 method, 290 u8 method,
@@ -309,93 +297,82 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
309 void *context, 297 void *context,
310 struct ib_sa_query **sa_query); 298 struct ib_sa_query **sa_query);
311 299
300struct ib_sa_multicast {
301 struct ib_sa_mcmember_rec rec;
302 ib_sa_comp_mask comp_mask;
303 int (*callback)(int status,
304 struct ib_sa_multicast *multicast);
305 void *context;
306};
307
312/** 308/**
313 * ib_sa_mcmember_rec_set - Start an MCMember set query 309 * ib_sa_join_multicast - Initiates a join request to the specified multicast
314 * @client:SA client 310 * group.
315 * @device:device to send query on 311 * @client: SA client
316 * @port_num: port number to send query on 312 * @device: Device associated with the multicast group.
317 * @rec:MCMember Record to send in query 313 * @port_num: Port on the specified device to associate with the multicast
318 * @comp_mask:component mask to send in query 314 * group.
319 * @timeout_ms:time to wait for response 315 * @rec: SA multicast member record specifying group attributes.
320 * @gfp_mask:GFP mask to use for internal allocations 316 * @comp_mask: Component mask indicating which group attributes of %rec are
321 * @callback:function called when query completes, times out or is 317 * valid.
322 * canceled 318 * @gfp_mask: GFP mask for memory allocations.
323 * @context:opaque user context passed to callback 319 * @callback: User callback invoked once the join operation completes.
324 * @sa_query:query context, used to cancel query 320 * @context: User specified context stored with the ib_sa_multicast structure.
325 * 321 *
326 * Send an MCMember Set query to the SA (eg to join a multicast 322 * This call initiates a multicast join request with the SA for the specified
327 * group). The callback function will be called when the query 323 * multicast group. If the join operation is started successfully, it returns
328 * completes (or fails); status is 0 for a successful response, -EINTR 324 * an ib_sa_multicast structure that is used to track the multicast operation.
329 * if the query is canceled, -ETIMEDOUT is the query timed out, or 325 * Users must free this structure by calling ib_sa_free_multicast, even if the
330 * -EIO if an error occurred sending the query. The resp parameter of 326 * join operation later fails. (The callback status is non-zero.)
331 * the callback is only valid if status is 0.
332 * 327 *
333 * If the return value of ib_sa_mcmember_rec_set() is negative, it is 328 * If the join operation fails, status will be non-zero, with the following
334 * an error code. Otherwise it is a query ID that can be used to 329 * failures possible:
335 * cancel the query. 330 * -ETIMEDOUT: The request timed out.
331 * -EIO: An error occurred sending the query.
332 * -EINVAL: The MCMemberRecord values differed from the existing group's.
333 * -ENETRESET: Indicates that a fatal error has occurred on the multicast
334 * group, and the user must rejoin the group to continue using it.
336 */ 335 */
337static inline int 336struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
338ib_sa_mcmember_rec_set(struct ib_sa_client *client, 337 struct ib_device *device, u8 port_num,
339 struct ib_device *device, u8 port_num, 338 struct ib_sa_mcmember_rec *rec,
340 struct ib_sa_mcmember_rec *rec, 339 ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
341 ib_sa_comp_mask comp_mask, 340 int (*callback)(int status,
342 int timeout_ms, gfp_t gfp_mask, 341 struct ib_sa_multicast
343 void (*callback)(int status, 342 *multicast),
344 struct ib_sa_mcmember_rec *resp, 343 void *context);
345 void *context),
346 void *context,
347 struct ib_sa_query **query)
348{
349 return ib_sa_mcmember_rec_query(client, device, port_num,
350 IB_MGMT_METHOD_SET,
351 rec, comp_mask,
352 timeout_ms, gfp_mask, callback,
353 context, query);
354}
355 344
356/** 345/**
357 * ib_sa_mcmember_rec_delete - Start an MCMember delete query 346 * ib_sa_free_multicast - Frees the multicast tracking structure, and releases
358 * @client:SA client 347 * any reference on the multicast group.
359 * @device:device to send query on 348 * @multicast: Multicast tracking structure allocated by ib_sa_join_multicast.
360 * @port_num: port number to send query on
361 * @rec:MCMember Record to send in query
362 * @comp_mask:component mask to send in query
363 * @timeout_ms:time to wait for response
364 * @gfp_mask:GFP mask to use for internal allocations
365 * @callback:function called when query completes, times out or is
366 * canceled
367 * @context:opaque user context passed to callback
368 * @sa_query:query context, used to cancel query
369 *
370 * Send an MCMember Delete query to the SA (eg to leave a multicast
371 * group). The callback function will be called when the query
372 * completes (or fails); status is 0 for a successful response, -EINTR
373 * if the query is canceled, -ETIMEDOUT is the query timed out, or
374 * -EIO if an error occurred sending the query. The resp parameter of
375 * the callback is only valid if status is 0.
376 * 349 *
377 * If the return value of ib_sa_mcmember_rec_delete() is negative, it 350 * This call blocks until the multicast identifier is destroyed. It may
378 * is an error code. Otherwise it is a query ID that can be used to 351 * not be called from within the multicast callback; however, returning a non-
379 * cancel the query. 352 * zero value from the callback will result in destroying the multicast
353 * tracking structure.
354 */
355void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
356
357/**
358 * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and
359 * returns it if found.
360 * @device: Device associated with the multicast group.
361 * @port_num: Port on the specified device to associate with the multicast
362 * group.
363 * @mgid: MGID of multicast group.
364 * @rec: Location to copy SA multicast member record.
380 */ 365 */
381static inline int 366int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
382ib_sa_mcmember_rec_delete(struct ib_sa_client *client, 367 union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
383 struct ib_device *device, u8 port_num, 368
384 struct ib_sa_mcmember_rec *rec, 369/**
385 ib_sa_comp_mask comp_mask, 370 * ib_init_ah_from_mcmember - Initialize address handle attributes based on
386 int timeout_ms, gfp_t gfp_mask, 371 * an SA multicast member record.
387 void (*callback)(int status, 372 */
388 struct ib_sa_mcmember_rec *resp, 373int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
389 void *context), 374 struct ib_sa_mcmember_rec *rec,
390 void *context, 375 struct ib_ah_attr *ah_attr);
391 struct ib_sa_query **query)
392{
393 return ib_sa_mcmember_rec_query(client, device, port_num,
394 IB_SA_METHOD_DELETE,
395 rec, comp_mask,
396 timeout_ms, gfp_mask, callback,
397 context, query);
398}
399 376
400/** 377/**
401 * ib_init_ah_from_path - Initialize address handle attributes based on an SA 378 * ib_init_ah_from_path - Initialize address handle attributes based on an SA