author    Oren Duer <oren@mellanox.co.il>    2012-08-03 04:40:46 -0400
committer Roland Dreier <roland@purestorage.com>    2012-09-30 23:33:35 -0400
commit    b9c5d6a643589ad39064f652938baa698f0e884a (patch)
tree      aa7d5599f03e9c0c01945d06d944f11e294c8e92
parent    0a9a01884d447c216eff75f8f274a0a3e82c7cee (diff)
IB/mlx4: Add multicast group (MCG) paravirtualization for SR-IOV
MCG paravirtualization support includes:
- Creating multicast groups by VFs, and keeping accounting of them
- Leaving multicast groups by VFs
- Updating SM only with real changes in the overall picture of MCGs status
- Creation of MGID=0 groups (let SM choose MGID)

Note that the MCG module maintains its own internal MCG object reference
counts.  The reason for this is that the IB core is used to track only the
multicast group joins generated by the PF it runs over.  The PF IB core
layer is unaware of slaves, so it cannot be used to keep track of MCG
joins they generate.

Signed-off-by: Oren Duer <oren@mellanox.co.il>
Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <roland@purestorage.com>
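The accounting described above can be illustrated with a short stand-alone sketch. The program below is not part of the patch: it is a simplified userspace model (hypothetical names such as mcast_group_model, vf_join and vf_leave) of the idea implemented in mcg.c by adjust_membership(), join_group(), leave_group() and get_leave_state(): per-VF join-state bits are counted so that only aggregate changes are pushed to the SM.

/*
 * Illustrative userspace sketch, not driver code: models how the MCG module
 * accounts for per-VF join states so the SM only sees aggregate changes.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_VFS 80

struct mcast_group_model {
	uint8_t func_join_state[MAX_VFS]; /* per-VF join bits */
	int members[3];                   /* how many VFs hold each join bit */
	uint8_t sm_join_state;            /* what has been reported to the SM */
};

/* add or remove one VF's join bits from the per-bit counters */
static void adjust_membership(struct mcast_group_model *g, uint8_t bits, int inc)
{
	for (int i = 0; i < 3; i++, bits >>= 1)
		if (bits & 1)
			g->members[i] += inc;
}

/* a VF join needs a wire join only if it adds a bit the SM doesn't know about */
static int vf_join(struct mcast_group_model *g, int vf, uint8_t join_mask)
{
	uint8_t new_bits = join_mask & ~g->func_join_state[vf];

	adjust_membership(g, new_bits, 1);
	g->func_join_state[vf] |= new_bits;
	if ((g->sm_join_state & join_mask) == join_mask)
		return 0;              /* SM already covers this; answer the VF locally */
	g->sm_join_state |= join_mask; /* would trigger a SET MCMemberRecord */
	return 1;
}

/* a VF leave is answered immediately; a wire leave is needed only for bits
 * whose counter drops to zero */
static uint8_t vf_leave(struct mcast_group_model *g, int vf, uint8_t leave_mask)
{
	uint8_t leave_to_sm = 0;

	adjust_membership(g, leave_mask, -1);
	g->func_join_state[vf] &= ~leave_mask;
	for (int i = 0; i < 3; i++)
		if (!g->members[i] && (g->sm_join_state & (1 << i)))
			leave_to_sm |= 1 << i;
	g->sm_join_state &= ~leave_to_sm;
	return leave_to_sm;            /* bits that would go in a DELETE to the SM */
}

int main(void)
{
	struct mcast_group_model g = { 0 };

	printf("VF0 join 0x1 -> wire join needed: %d\n", vf_join(&g, 0, 0x1));
	printf("VF1 join 0x1 -> wire join needed: %d\n", vf_join(&g, 1, 0x1));
	printf("VF0 leave 0x1 -> leave bits to SM: 0x%x\n", vf_leave(&g, 0, 0x1));
	printf("VF1 leave 0x1 -> leave bits to SM: 0x%x\n", vf_leave(&g, 1, 0x1));
	return 0;
}

Compiled with any C99 compiler, the example shows the second join and the first leave being absorbed locally, while the first join and the last leave would go out to the SM.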
-rw-r--r--  drivers/infiniband/hw/mlx4/Makefile  |    2
-rw-r--r--  drivers/infiniband/hw/mlx4/mad.c     |   60
-rw-r--r--  drivers/infiniband/hw/mlx4/main.c    |   18
-rw-r--r--  drivers/infiniband/hw/mlx4/mcg.c     | 1187
-rw-r--r--  drivers/infiniband/hw/mlx4/mlx4_ib.h |   30
5 files changed, 1285 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
index 70f09c7826da..20d627d1f046 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)	+= mlx4_ib.o
 
-mlx4_ib-y :=	ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
+mlx4_ib-y :=	ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index ba2580693f79..29ed3b43e4a6 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -75,6 +75,14 @@ struct mlx4_rcv_tunnel_mad {
 	struct ib_mad mad;
 } __packed;
 
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+	return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+		cpu_to_be64(0xff00000000000000LL);
+}
+
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
 		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
 		 void *in_mad, void *response_mad)
@@ -209,8 +217,7 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
 					     pinfo->neighbormtu_mastersmsl & 0xf);
 
 		if (pinfo->clientrereg_resv_subnetto & 0x80)
-			mlx4_ib_dispatch_event(dev, port_num,
-					       IB_EVENT_CLIENT_REREGISTER);
+			handle_client_rereg_event(dev, port_num);
 
 		if (prev_lid != lid)
 			mlx4_ib_dispatch_event(dev, port_num,
@@ -308,7 +315,17 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma
 static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave,
 							struct ib_sa_mad *sa_mad)
 {
-	return 0;
+	int ret = 0;
+
+	/* dispatch to different sa handlers */
+	switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+	case IB_SA_ATTR_MC_MEMBER_REC:
+		ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+		break;
+	default:
+		break;
+	}
+	return ret;
 }
 
 int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
@@ -768,6 +785,16 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
 	}
 }
 
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+	/* re-configure the mcg's */
+	if (mlx4_is_master(dev->dev)) {
+		if (!dev->sriov.is_going_down)
+			mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0);
+	}
+	mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
 void handle_port_mgmt_change_event(struct work_struct *work)
 {
 	struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
@@ -797,8 +824,7 @@ void handle_port_mgmt_change_event(struct work_struct *work)
 		mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
 
 		if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
-			mlx4_ib_dispatch_event(dev, port,
-					       IB_EVENT_CLIENT_REREGISTER);
+			handle_client_rereg_event(dev, port);
 		break;
 
 	case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
@@ -868,7 +894,17 @@ static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
 static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
 		int slave, struct ib_sa_mad *sa_mad)
 {
-	return 0;
+	int ret = 0;
+
+	/* dispatch to different sa handlers */
+	switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+	case IB_SA_ATTR_MC_MEMBER_REC:
+		ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad);
+		break;
+	default:
+		break;
+	}
+	return ret;
 }
 
 static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
@@ -1590,6 +1626,7 @@ static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave,
 	int ret = 0;
 
 	if (!do_init) {
+		clean_vf_mcast(&dev->sriov.demux[port - 1], slave);
 		/* for master, destroy real sqp resources */
 		if (slave == mlx4_master_func_num(dev->dev))
 			destroy_pv_resources(dev, slave, port,
@@ -1643,10 +1680,16 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
 		ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);
 		if (ret) {
 			ret = -ENOMEM;
-			goto err_wq;
+			goto err_mcg;
 		}
 	}
 
+	ret = mlx4_ib_mcg_port_init(ctx);
+	if (ret) {
+		pr_err("Failed initializing mcg para-virt (%d)\n", ret);
+		goto err_mcg;
+	}
+
 	snprintf(name, sizeof name, "mlx4_ibt%d", port);
 	ctx->wq = create_singlethread_workqueue(name);
 	if (!ctx->wq) {
@@ -1670,6 +1713,8 @@ err_udwq:
 	ctx->wq = NULL;
 
 err_wq:
+	mlx4_ib_mcg_port_cleanup(ctx, 1);
+err_mcg:
 	for (i = 0; i < dev->dev->caps.sqp_demux; i++)
 		free_pv_object(dev, i, port);
 	kfree(ctx->tun);
@@ -1705,6 +1750,7 @@ static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
 	int i;
 	if (ctx) {
 		struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+		mlx4_ib_mcg_port_cleanup(ctx, 1);
 		for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
 			if (!ctx->tun[i])
 				continue;
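The mad.c hunk above also introduces mlx4_ib_get_new_demux_tid(), which stamps SA requests generated by the PF demux layer with a running counter and forces the most significant TID byte to 0xff; since slave numbers stay below MAX_VFS (80), this presumably keeps demux-originated TIDs in a namespace of their own, and mcg.c later matches or rewrites that top byte when responses come back. The following is an illustrative userspace model only (it assumes glibc's <endian.h> helpers and is not driver code):

/* Illustrative model of the demux TID layout: counter in the low bytes,
 * most significant byte reserved (0xff) for PF-generated requests. */
#include <stdio.h>
#include <stdint.h>
#include <endian.h>   /* htobe64 / be64toh, glibc */

static uint64_t tid_counter;

static uint64_t get_new_demux_tid(void)   /* network byte order, like __be64 */
{
	return htobe64(++tid_counter) | htobe64(0xff00000000000000ULL);
}

int main(void)
{
	uint64_t tid = get_new_demux_tid();
	uint8_t top = (uint8_t)(be64toh(tid) >> 56);

	printf("tid (host order) = 0x%016llx, top byte = 0x%02x\n",
	       (unsigned long long)be64toh(tid), top);

	/* what mlx4_ib_mcg_demux_handler does for the MGID=0 case: overwrite
	 * the first (most significant) byte with the slave number before
	 * matching against the stored last_req_tid */
	*(uint8_t *)&tid = 5;
	printf("after stamping slave 5 into the first byte: 0x%016llx\n",
	       (unsigned long long)be64toh(tid));
	return 0;
}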
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 45a6cc04036b..b959fe4665dd 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1628,18 +1628,28 @@ static int __init mlx4_ib_init(void)
 	if (!wq)
 		return -ENOMEM;
 
+	err = mlx4_ib_mcg_init();
+	if (err)
+		goto clean_wq;
+
 	err = mlx4_register_interface(&mlx4_ib_interface);
-	if (err) {
-		destroy_workqueue(wq);
-		return err;
-	}
+	if (err)
+		goto clean_mcg;
 
 	return 0;
+
+clean_mcg:
+	mlx4_ib_mcg_destroy();
+
+clean_wq:
+	destroy_workqueue(wq);
+	return err;
 }
 
 static void __exit mlx4_ib_cleanup(void)
 {
 	mlx4_unregister_interface(&mlx4_ib_interface);
+	mlx4_ib_mcg_destroy();
 	destroy_workqueue(wq);
 }
 
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
new file mode 100644
index 000000000000..1ee2e3a3347a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -0,0 +1,1187 @@
1/*
2 * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <rdma/ib_mad.h>
34#include <rdma/ib_smi.h>
35#include <rdma/ib_cache.h>
36#include <rdma/ib_sa.h>
37
38#include <linux/mlx4/cmd.h>
39#include <linux/rbtree.h>
40#include <linux/delay.h>
41
42#include "mlx4_ib.h"
43
44#define MAX_VFS 80
45#define MAX_PEND_REQS_PER_FUNC 4
46#define MAD_TIMEOUT_MS 2000
47
48#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg)
49#define mcg_error(fmt, arg...) pr_err(fmt, ##arg)
50#define mcg_warn_group(group, format, arg...) \
51 pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
52 (group)->name, group->demux->port, ## arg)
53
54#define mcg_error_group(group, format, arg...) \
55 pr_err(" %16s: " format, (group)->name, ## arg)
56
57
58static union ib_gid mgid0;
59
60static struct workqueue_struct *clean_wq;
61
62enum mcast_state {
63 MCAST_NOT_MEMBER = 0,
64 MCAST_MEMBER,
65};
66
67enum mcast_group_state {
68 MCAST_IDLE,
69 MCAST_JOIN_SENT,
70 MCAST_LEAVE_SENT,
71 MCAST_RESP_READY
72};
73
74struct mcast_member {
75 enum mcast_state state;
76 uint8_t join_state;
77 int num_pend_reqs;
78 struct list_head pending;
79};
80
81struct ib_sa_mcmember_data {
82 union ib_gid mgid;
83 union ib_gid port_gid;
84 __be32 qkey;
85 __be16 mlid;
86 u8 mtusel_mtu;
87 u8 tclass;
88 __be16 pkey;
89 u8 ratesel_rate;
90 u8 lifetmsel_lifetm;
91 __be32 sl_flowlabel_hoplimit;
92 u8 scope_join_state;
93 u8 proxy_join;
94 u8 reserved[2];
95};
96
97struct mcast_group {
98 struct ib_sa_mcmember_data rec;
99 struct rb_node node;
100 struct list_head mgid0_list;
101 struct mlx4_ib_demux_ctx *demux;
102 struct mcast_member func[MAX_VFS];
103 struct mutex lock;
104 struct work_struct work;
105 struct list_head pending_list;
106 int members[3];
107 enum mcast_group_state state;
108 enum mcast_group_state prev_state;
109 struct ib_sa_mad response_sa_mad;
110 __be64 last_req_tid;
111
112 char name[33]; /* MGID string */
113
114 /* refcount is the reference count for the following:
115 1. Each queued request
116 2. Each invocation of the worker thread
117 3. Membership of the port at the SA
118 */
119 atomic_t refcount;
120
121 /* delayed work to clean pending SM request */
122 struct delayed_work timeout_work;
123 struct list_head cleanup_list;
124};
125
126struct mcast_req {
127 int func;
128 struct ib_sa_mad sa_mad;
129 struct list_head group_list;
130 struct list_head func_list;
131 struct mcast_group *group;
132 int clean;
133};
134
135
136#define safe_atomic_dec(ref) \
137 do {\
138 if (atomic_dec_and_test(ref)) \
139 mcg_warn_group(group, "did not expect to reach zero\n"); \
140 } while (0)
141
142static const char *get_state_string(enum mcast_group_state state)
143{
144 switch (state) {
145 case MCAST_IDLE:
146 return "MCAST_IDLE";
147 case MCAST_JOIN_SENT:
148 return "MCAST_JOIN_SENT";
149 case MCAST_LEAVE_SENT:
150 return "MCAST_LEAVE_SENT";
151 case MCAST_RESP_READY:
152 return "MCAST_RESP_READY";
153 }
154 return "Invalid State";
155}
156
157static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx,
158 union ib_gid *mgid)
159{
160 struct rb_node *node = ctx->mcg_table.rb_node;
161 struct mcast_group *group;
162 int ret;
163
164 while (node) {
165 group = rb_entry(node, struct mcast_group, node);
166 ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
167 if (!ret)
168 return group;
169
170 if (ret < 0)
171 node = node->rb_left;
172 else
173 node = node->rb_right;
174 }
175 return NULL;
176}
177
178static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx,
179 struct mcast_group *group)
180{
181 struct rb_node **link = &ctx->mcg_table.rb_node;
182 struct rb_node *parent = NULL;
183 struct mcast_group *cur_group;
184 int ret;
185
186 while (*link) {
187 parent = *link;
188 cur_group = rb_entry(parent, struct mcast_group, node);
189
190 ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
191 sizeof group->rec.mgid);
192 if (ret < 0)
193 link = &(*link)->rb_left;
194 else if (ret > 0)
195 link = &(*link)->rb_right;
196 else
197 return cur_group;
198 }
199 rb_link_node(&group->node, parent, link);
200 rb_insert_color(&group->node, &ctx->mcg_table);
201 return NULL;
202}
203
204static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
205{
206 struct mlx4_ib_dev *dev = ctx->dev;
207 struct ib_ah_attr ah_attr;
208
209 spin_lock(&dev->sm_lock);
210 if (!dev->sm_ah[ctx->port - 1]) {
211 /* port is not yet Active, sm_ah not ready */
212 spin_unlock(&dev->sm_lock);
213 return -EAGAIN;
214 }
215 mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
216 spin_unlock(&dev->sm_lock);
217 return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port,
218 IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad);
219}
220
221static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
222 struct ib_mad *mad)
223{
224 struct mlx4_ib_dev *dev = ctx->dev;
225 struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1];
226 struct ib_wc wc;
227 struct ib_ah_attr ah_attr;
228
229 /* Our agent might not yet be registered when mads start to arrive */
230 if (!agent)
231 return -EAGAIN;
232
233 ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
234
235 wc.pkey_index = 0;
236 wc.sl = 0;
237 wc.dlid_path_bits = 0;
238 wc.port_num = ctx->port;
239 wc.slid = ah_attr.dlid; /* opensm lid */
240 wc.src_qp = 1;
241 return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad);
242}
243
244static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad)
245{
246 struct ib_sa_mad mad;
247 struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data;
248 int ret;
249
250 /* we rely on a mad request as arrived from a VF */
251 memcpy(&mad, sa_mad, sizeof mad);
252
253 /* fix port GID to be the real one (slave 0) */
254 sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0];
255
256 /* assign our own TID */
257 mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
258 group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
259
260 ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
261 /* set timeout handler */
262 if (!ret) {
263 /* calls mlx4_ib_mcg_timeout_handler */
264 queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
265 msecs_to_jiffies(MAD_TIMEOUT_MS));
266 }
267
268 return ret;
269}
270
271static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
272{
273 struct ib_sa_mad mad;
274 struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
275 int ret;
276
277 memset(&mad, 0, sizeof mad);
278 mad.mad_hdr.base_version = 1;
279 mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
280 mad.mad_hdr.class_version = 2;
281 mad.mad_hdr.method = IB_SA_METHOD_DELETE;
282 mad.mad_hdr.status = cpu_to_be16(0);
283 mad.mad_hdr.class_specific = cpu_to_be16(0);
284 mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
285 group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
286 mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
287 mad.mad_hdr.attr_mod = cpu_to_be32(0);
288 mad.sa_hdr.sm_key = 0x0;
289 mad.sa_hdr.attr_offset = cpu_to_be16(7);
290 mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID |
291 IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE;
292
293 *sa_data = group->rec;
294 sa_data->scope_join_state = join_state;
295
296 ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
297 if (ret)
298 group->state = MCAST_IDLE;
299
300 /* set timeout handler */
301 if (!ret) {
302 /* calls mlx4_ib_mcg_timeout_handler */
303 queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
304 msecs_to_jiffies(MAD_TIMEOUT_MS));
305 }
306
307 return ret;
308}
309
310static int send_reply_to_slave(int slave, struct mcast_group *group,
311 struct ib_sa_mad *req_sa_mad, u16 status)
312{
313 struct ib_sa_mad mad;
314 struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
315 struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data;
316 int ret;
317
318 memset(&mad, 0, sizeof mad);
319 mad.mad_hdr.base_version = 1;
320 mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
321 mad.mad_hdr.class_version = 2;
322 mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
323 mad.mad_hdr.status = cpu_to_be16(status);
324 mad.mad_hdr.class_specific = cpu_to_be16(0);
325 mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid;
326 *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */
327 mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
328 mad.mad_hdr.attr_mod = cpu_to_be32(0);
329 mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key;
330 mad.sa_hdr.attr_offset = cpu_to_be16(7);
331 mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */
332
333 *sa_data = group->rec;
334
335 /* reconstruct VF's requested join_state and port_gid */
336 sa_data->scope_join_state &= 0xf0;
337 sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f);
338 memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid);
339
340 ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad);
341 return ret;
342}
343
344static int check_selector(ib_sa_comp_mask comp_mask,
345 ib_sa_comp_mask selector_mask,
346 ib_sa_comp_mask value_mask,
347 u8 src_value, u8 dst_value)
348{
349 int err;
350 u8 selector = dst_value >> 6;
351 dst_value &= 0x3f;
352 src_value &= 0x3f;
353
354 if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
355 return 0;
356
357 switch (selector) {
358 case IB_SA_GT:
359 err = (src_value <= dst_value);
360 break;
361 case IB_SA_LT:
362 err = (src_value >= dst_value);
363 break;
364 case IB_SA_EQ:
365 err = (src_value != dst_value);
366 break;
367 default:
368 err = 0;
369 break;
370 }
371
372 return err;
373}
374
375static u16 cmp_rec(struct ib_sa_mcmember_data *src,
376 struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask)
377{
378 /* src is group record, dst is request record */
379 /* MGID must already match */
380 /* Port_GID we always replace to our Port_GID, so it is a match */
381
382#define MAD_STATUS_REQ_INVALID 0x0200
383 if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
384 return MAD_STATUS_REQ_INVALID;
385 if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
386 return MAD_STATUS_REQ_INVALID;
387 if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
388 IB_SA_MCMEMBER_REC_MTU,
389 src->mtusel_mtu, dst->mtusel_mtu))
390 return MAD_STATUS_REQ_INVALID;
391 if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
392 src->tclass != dst->tclass)
393 return MAD_STATUS_REQ_INVALID;
394 if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
395 return MAD_STATUS_REQ_INVALID;
396 if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
397 IB_SA_MCMEMBER_REC_RATE,
398 src->ratesel_rate, dst->ratesel_rate))
399 return MAD_STATUS_REQ_INVALID;
400 if (check_selector(comp_mask,
401 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
402 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
403 src->lifetmsel_lifetm, dst->lifetmsel_lifetm))
404 return MAD_STATUS_REQ_INVALID;
405 if (comp_mask & IB_SA_MCMEMBER_REC_SL &&
406 (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) !=
407 (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000))
408 return MAD_STATUS_REQ_INVALID;
409 if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
410 (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) !=
411 (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00))
412 return MAD_STATUS_REQ_INVALID;
413 if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
414 (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) !=
415 (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff))
416 return MAD_STATUS_REQ_INVALID;
417 if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE &&
418 (src->scope_join_state & 0xf0) !=
419 (dst->scope_join_state & 0xf0))
420 return MAD_STATUS_REQ_INVALID;
421
422 /* join_state checked separately, proxy_join ignored */
423
424 return 0;
425}
426
427/* release group, return 1 if this was last release and group is destroyed
428 * timeout work is canceled sync */
429static int release_group(struct mcast_group *group, int from_timeout_handler)
430{
431 struct mlx4_ib_demux_ctx *ctx = group->demux;
432 int nzgroup;
433
434 mutex_lock(&ctx->mcg_table_lock);
435 mutex_lock(&group->lock);
436 if (atomic_dec_and_test(&group->refcount)) {
437 if (!from_timeout_handler) {
438 if (group->state != MCAST_IDLE &&
439 !cancel_delayed_work(&group->timeout_work)) {
440 atomic_inc(&group->refcount);
441 mutex_unlock(&group->lock);
442 mutex_unlock(&ctx->mcg_table_lock);
443 return 0;
444 }
445 }
446
447 nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0);
448 if (!list_empty(&group->pending_list))
449 mcg_warn_group(group, "releasing a group with non empty pending list\n");
450 if (nzgroup)
451 rb_erase(&group->node, &ctx->mcg_table);
452 list_del_init(&group->mgid0_list);
453 mutex_unlock(&group->lock);
454 mutex_unlock(&ctx->mcg_table_lock);
455 kfree(group);
456 return 1;
457 } else {
458 mutex_unlock(&group->lock);
459 mutex_unlock(&ctx->mcg_table_lock);
460 }
461 return 0;
462}
463
464static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
465{
466 int i;
467
468 for (i = 0; i < 3; i++, join_state >>= 1)
469 if (join_state & 0x1)
470 group->members[i] += inc;
471}
472
473static u8 get_leave_state(struct mcast_group *group)
474{
475 u8 leave_state = 0;
476 int i;
477
478 for (i = 0; i < 3; i++)
479 if (!group->members[i])
480 leave_state |= (1 << i);
481
482 return leave_state & (group->rec.scope_join_state & 7);
483}
484
485static int join_group(struct mcast_group *group, int slave, u8 join_mask)
486{
487 int ret = 0;
488 u8 join_state;
489
490 /* remove bits that slave is already member of, and adjust */
491 join_state = join_mask & (~group->func[slave].join_state);
492 adjust_membership(group, join_state, 1);
493 group->func[slave].join_state |= join_state;
494 if (group->func[slave].state != MCAST_MEMBER && join_state) {
495 group->func[slave].state = MCAST_MEMBER;
496 ret = 1;
497 }
498 return ret;
499}
500
501static int leave_group(struct mcast_group *group, int slave, u8 leave_state)
502{
503 int ret = 0;
504
505 adjust_membership(group, leave_state, -1);
506 group->func[slave].join_state &= ~leave_state;
507 if (!group->func[slave].join_state) {
508 group->func[slave].state = MCAST_NOT_MEMBER;
509 ret = 1;
510 }
511 return ret;
512}
513
514static int check_leave(struct mcast_group *group, int slave, u8 leave_mask)
515{
516 if (group->func[slave].state != MCAST_MEMBER)
517 return MAD_STATUS_REQ_INVALID;
518
519 /* make sure we're not deleting unset bits */
520 if (~group->func[slave].join_state & leave_mask)
521 return MAD_STATUS_REQ_INVALID;
522
523 if (!leave_mask)
524 return MAD_STATUS_REQ_INVALID;
525
526 return 0;
527}
528
529static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
530{
531 struct delayed_work *delay = to_delayed_work(work);
532 struct mcast_group *group;
533 struct mcast_req *req = NULL;
534
535 group = container_of(delay, typeof(*group), timeout_work);
536
537 mutex_lock(&group->lock);
538 if (group->state == MCAST_JOIN_SENT) {
539 if (!list_empty(&group->pending_list)) {
540 req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
541 list_del(&req->group_list);
542 list_del(&req->func_list);
543 --group->func[req->func].num_pend_reqs;
544 mutex_unlock(&group->lock);
545 kfree(req);
546 if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) {
547 if (release_group(group, 1))
548 return;
549 } else {
550 kfree(group);
551 return;
552 }
553 mutex_lock(&group->lock);
554 } else
555 mcg_warn_group(group, "DRIVER BUG\n");
556 } else if (group->state == MCAST_LEAVE_SENT) {
557 if (group->rec.scope_join_state & 7)
558 group->rec.scope_join_state &= 0xf8;
559 group->state = MCAST_IDLE;
560 mutex_unlock(&group->lock);
561 if (release_group(group, 1))
562 return;
563 mutex_lock(&group->lock);
564 } else
565 mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
566 group->state = MCAST_IDLE;
567 atomic_inc(&group->refcount);
568 if (!queue_work(group->demux->mcg_wq, &group->work))
569 safe_atomic_dec(&group->refcount);
570
571 mutex_unlock(&group->lock);
572}
573
574static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
575 struct mcast_req *req)
576{
577 u16 status;
578
579 if (req->clean)
580 leave_mask = group->func[req->func].join_state;
581
582 status = check_leave(group, req->func, leave_mask);
583 if (!status)
584 leave_group(group, req->func, leave_mask);
585
586 if (!req->clean)
587 send_reply_to_slave(req->func, group, &req->sa_mad, status);
588 --group->func[req->func].num_pend_reqs;
589 list_del(&req->group_list);
590 list_del(&req->func_list);
591 kfree(req);
592 return 1;
593}
594
595static int handle_join_req(struct mcast_group *group, u8 join_mask,
596 struct mcast_req *req)
597{
598 u8 group_join_state = group->rec.scope_join_state & 7;
599 int ref = 0;
600 u16 status;
601 struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
602
603 if (join_mask == (group_join_state & join_mask)) {
604 /* port's membership need not change */
605 status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask);
606 if (!status)
607 join_group(group, req->func, join_mask);
608
609 --group->func[req->func].num_pend_reqs;
610 send_reply_to_slave(req->func, group, &req->sa_mad, status);
611 list_del(&req->group_list);
612 list_del(&req->func_list);
613 kfree(req);
614 ++ref;
615 } else {
616 /* port's membership needs to be updated */
617 group->prev_state = group->state;
618 if (send_join_to_wire(group, &req->sa_mad)) {
619 --group->func[req->func].num_pend_reqs;
620 list_del(&req->group_list);
621 list_del(&req->func_list);
622 kfree(req);
623 ref = 1;
624 group->state = group->prev_state;
625 } else
626 group->state = MCAST_JOIN_SENT;
627 }
628
629 return ref;
630}
631
632static void mlx4_ib_mcg_work_handler(struct work_struct *work)
633{
634 struct mcast_group *group;
635 struct mcast_req *req = NULL;
636 struct ib_sa_mcmember_data *sa_data;
637 u8 req_join_state;
638 int rc = 1; /* release_count - this is for the scheduled work */
639 u16 status;
640 u8 method;
641
642 group = container_of(work, typeof(*group), work);
643
644 mutex_lock(&group->lock);
645
646 /* First, let's see if a response from SM is waiting regarding this group.
647 * If so, we need to update the group's REC. If this is a bad response, we
648 * may need to send a bad response to a VF waiting for it. If VF is waiting
649 * and this is a good response, the VF will be answered later in this func. */
650 if (group->state == MCAST_RESP_READY) {
651 /* cancels mlx4_ib_mcg_timeout_handler */
652 cancel_delayed_work(&group->timeout_work);
653 status = be16_to_cpu(group->response_sa_mad.mad_hdr.status);
654 method = group->response_sa_mad.mad_hdr.method;
655 if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
656 mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
657 be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
658 be64_to_cpu(group->last_req_tid));
659 group->state = group->prev_state;
660 goto process_requests;
661 }
662 if (status) {
663 if (!list_empty(&group->pending_list))
664 req = list_first_entry(&group->pending_list,
665 struct mcast_req, group_list);
666 if ((method == IB_MGMT_METHOD_GET_RESP)) {
667 if (req) {
668 send_reply_to_slave(req->func, group, &req->sa_mad, status);
669 --group->func[req->func].num_pend_reqs;
670 list_del(&req->group_list);
671 list_del(&req->func_list);
672 kfree(req);
673 ++rc;
674 } else
675 mcg_warn_group(group, "no request for failed join\n");
676 } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing)
677 ++rc;
678 } else {
679 u8 resp_join_state;
680 u8 cur_join_state;
681
682 resp_join_state = ((struct ib_sa_mcmember_data *)
683 group->response_sa_mad.data)->scope_join_state & 7;
684 cur_join_state = group->rec.scope_join_state & 7;
685
686 if (method == IB_MGMT_METHOD_GET_RESP) {
687 /* successful join */
688 if (!cur_join_state && resp_join_state)
689 --rc;
690 } else if (!resp_join_state)
691 ++rc;
692 memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec);
693 }
694 group->state = MCAST_IDLE;
695 }
696
697process_requests:
698 /* We should now go over pending join/leave requests, as long as we are idle. */
699 while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) {
700 req = list_first_entry(&group->pending_list, struct mcast_req,
701 group_list);
702 sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
703 req_join_state = sa_data->scope_join_state & 0x7;
704
705 /* For a leave request, we will immediately answer the VF, and
706 * update our internal counters. The actual leave will be sent
707 * to SM later, if at all needed. We dequeue the request now. */
708 if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE)
709 rc += handle_leave_req(group, req_join_state, req);
710 else
711 rc += handle_join_req(group, req_join_state, req);
712 }
713
714 /* Handle leaves */
715 if (group->state == MCAST_IDLE) {
716 req_join_state = get_leave_state(group);
717 if (req_join_state) {
718 group->rec.scope_join_state &= ~req_join_state;
719 group->prev_state = group->state;
720 if (send_leave_to_wire(group, req_join_state)) {
721 group->state = group->prev_state;
722 ++rc;
723 } else
724 group->state = MCAST_LEAVE_SENT;
725 }
726 }
727
728 if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE)
729 goto process_requests;
730 mutex_unlock(&group->lock);
731
732 while (rc--)
733 release_group(group, 0);
734}
735
736static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx,
737 __be64 tid,
738 union ib_gid *new_mgid)
739{
740 struct mcast_group *group = NULL, *cur_group;
741 struct mcast_req *req;
742 struct list_head *pos;
743 struct list_head *n;
744
745 mutex_lock(&ctx->mcg_table_lock);
746 list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) {
747 group = list_entry(pos, struct mcast_group, mgid0_list);
748 mutex_lock(&group->lock);
749 if (group->last_req_tid == tid) {
750 if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
751 group->rec.mgid = *new_mgid;
752 sprintf(group->name, "%016llx%016llx",
753 be64_to_cpu(group->rec.mgid.global.subnet_prefix),
754 be64_to_cpu(group->rec.mgid.global.interface_id));
755 list_del_init(&group->mgid0_list);
756 cur_group = mcast_insert(ctx, group);
757 if (cur_group) {
758 /* A race between our code and SM. Silently cleaning the new one */
759 req = list_first_entry(&group->pending_list,
760 struct mcast_req, group_list);
761 --group->func[req->func].num_pend_reqs;
762 list_del(&req->group_list);
763 list_del(&req->func_list);
764 kfree(req);
765 mutex_unlock(&group->lock);
766 mutex_unlock(&ctx->mcg_table_lock);
767 release_group(group, 0);
768 return NULL;
769 }
770
771 atomic_inc(&group->refcount);
772 mutex_unlock(&group->lock);
773 mutex_unlock(&ctx->mcg_table_lock);
774 return group;
775 } else {
776 struct mcast_req *tmp1, *tmp2;
777
778 list_del(&group->mgid0_list);
779 if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE)
780 cancel_delayed_work_sync(&group->timeout_work);
781
782 list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) {
783 list_del(&tmp1->group_list);
784 kfree(tmp1);
785 }
786 mutex_unlock(&group->lock);
787 mutex_unlock(&ctx->mcg_table_lock);
788 kfree(group);
789 return NULL;
790 }
791 }
792 mutex_unlock(&group->lock);
793 }
794 mutex_unlock(&ctx->mcg_table_lock);
795
796 return NULL;
797}
798
799static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
800 union ib_gid *mgid, int create,
801 gfp_t gfp_mask)
802{
803 struct mcast_group *group, *cur_group;
804 int is_mgid0;
805 int i;
806
807 is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
808 if (!is_mgid0) {
809 group = mcast_find(ctx, mgid);
810 if (group)
811 goto found;
812 }
813
814 if (!create)
815 return ERR_PTR(-ENOENT);
816
817 group = kzalloc(sizeof *group, gfp_mask);
818 if (!group)
819 return ERR_PTR(-ENOMEM);
820
821 group->demux = ctx;
822 group->rec.mgid = *mgid;
823 INIT_LIST_HEAD(&group->pending_list);
824 INIT_LIST_HEAD(&group->mgid0_list);
825 for (i = 0; i < MAX_VFS; ++i)
826 INIT_LIST_HEAD(&group->func[i].pending);
827 INIT_WORK(&group->work, mlx4_ib_mcg_work_handler);
828 INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
829 mutex_init(&group->lock);
830 sprintf(group->name, "%016llx%016llx",
831 be64_to_cpu(group->rec.mgid.global.subnet_prefix),
832 be64_to_cpu(group->rec.mgid.global.interface_id));
833 group->state = MCAST_IDLE;
834
835 if (is_mgid0) {
836 list_add(&group->mgid0_list, &ctx->mcg_mgid0_list);
837 goto found;
838 }
839
840 cur_group = mcast_insert(ctx, group);
841 if (cur_group) {
842 mcg_warn("group just showed up %s - confused\n", cur_group->name);
843 kfree(group);
844 return ERR_PTR(-EINVAL);
845 }
846
847found:
848 atomic_inc(&group->refcount);
849 return group;
850}
851
852static void queue_req(struct mcast_req *req)
853{
854 struct mcast_group *group = req->group;
855
856 atomic_inc(&group->refcount); /* for the request */
857 atomic_inc(&group->refcount); /* for scheduling the work */
858 list_add_tail(&req->group_list, &group->pending_list);
859 list_add_tail(&req->func_list, &group->func[req->func].pending);
860 /* calls mlx4_ib_mcg_work_handler */
861 if (!queue_work(group->demux->mcg_wq, &group->work))
862 safe_atomic_dec(&group->refcount);
863}
864
865int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
866 struct ib_sa_mad *mad)
867{
868 struct mlx4_ib_dev *dev = to_mdev(ibdev);
869 struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data;
870 struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
871 struct mcast_group *group;
872
873 switch (mad->mad_hdr.method) {
874 case IB_MGMT_METHOD_GET_RESP:
875 case IB_SA_METHOD_DELETE_RESP:
876 mutex_lock(&ctx->mcg_table_lock);
877 group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL);
878 mutex_unlock(&ctx->mcg_table_lock);
879 if (IS_ERR(group)) {
880 if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
881 __be64 tid = mad->mad_hdr.tid;
882 *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */
883 group = search_relocate_mgid0_group(ctx, tid, &rec->mgid);
884 } else
885 group = NULL;
886 }
887
888 if (!group)
889 return 1;
890
891 mutex_lock(&group->lock);
892 group->response_sa_mad = *mad;
893 group->prev_state = group->state;
894 group->state = MCAST_RESP_READY;
895 /* calls mlx4_ib_mcg_work_handler */
896 atomic_inc(&group->refcount);
897 if (!queue_work(ctx->mcg_wq, &group->work))
898 safe_atomic_dec(&group->refcount);
899 mutex_unlock(&group->lock);
900 release_group(group, 0);
901 return 1; /* consumed */
902 case IB_MGMT_METHOD_SET:
903 case IB_SA_METHOD_GET_TABLE:
904 case IB_SA_METHOD_GET_TABLE_RESP:
905 case IB_SA_METHOD_DELETE:
906 return 0; /* not consumed, pass-through to guest over tunnel */
907 default:
908 mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n",
909 port, mad->mad_hdr.method);
910 return 1; /* consumed */
911 }
912}
913
914int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
915 int slave, struct ib_sa_mad *sa_mad)
916{
917 struct mlx4_ib_dev *dev = to_mdev(ibdev);
918 struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data;
919 struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
920 struct mcast_group *group;
921 struct mcast_req *req;
922 int may_create = 0;
923
924 if (ctx->flushing)
925 return -EAGAIN;
926
927 switch (sa_mad->mad_hdr.method) {
928 case IB_MGMT_METHOD_SET:
929 may_create = 1;
930 case IB_SA_METHOD_DELETE:
931 req = kzalloc(sizeof *req, GFP_KERNEL);
932 if (!req)
933 return -ENOMEM;
934
935 req->func = slave;
936 req->sa_mad = *sa_mad;
937
938 mutex_lock(&ctx->mcg_table_lock);
939 group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL);
940 mutex_unlock(&ctx->mcg_table_lock);
941 if (IS_ERR(group)) {
942 kfree(req);
943 return PTR_ERR(group);
944 }
945 mutex_lock(&group->lock);
946 if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
947 mutex_unlock(&group->lock);
948 mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
949 port, slave, MAX_PEND_REQS_PER_FUNC);
950 release_group(group, 0);
951 kfree(req);
952 return -ENOMEM;
953 }
954 ++group->func[slave].num_pend_reqs;
955 req->group = group;
956 queue_req(req);
957 mutex_unlock(&group->lock);
958 release_group(group, 0);
959 return 1; /* consumed */
960 case IB_SA_METHOD_GET_TABLE:
961 case IB_MGMT_METHOD_GET_RESP:
962 case IB_SA_METHOD_GET_TABLE_RESP:
963 case IB_SA_METHOD_DELETE_RESP:
964 return 0; /* not consumed, pass-through */
965 default:
966 mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n",
967 port, slave, sa_mad->mad_hdr.method);
968 return 1; /* consumed */
969 }
970}
971
972int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx)
973{
974 char name[20];
975
976 atomic_set(&ctx->tid, 0);
977 sprintf(name, "mlx4_ib_mcg%d", ctx->port);
978 ctx->mcg_wq = create_singlethread_workqueue(name);
979 if (!ctx->mcg_wq)
980 return -ENOMEM;
981
982 mutex_init(&ctx->mcg_table_lock);
983 ctx->mcg_table = RB_ROOT;
984 INIT_LIST_HEAD(&ctx->mcg_mgid0_list);
985 ctx->flushing = 0;
986
987 return 0;
988}
989
990static void force_clean_group(struct mcast_group *group)
991{
992	struct mcast_req *req, *tmp;
993
994 list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) {
995 list_del(&req->group_list);
996 kfree(req);
997 }
998 rb_erase(&group->node, &group->demux->mcg_table);
999 kfree(group);
1000}
1001
1002static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
1003{
1004 int i;
1005 struct rb_node *p;
1006 struct mcast_group *group;
1007 unsigned long end;
1008 int count;
1009
1010 if (ctx->flushing)
1011 return;
1012
1013 ctx->flushing = 1;
1014 for (i = 0; i < MAX_VFS; ++i)
1015 clean_vf_mcast(ctx, i);
1016
1017 end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
1018 do {
1019 count = 0;
1020 mutex_lock(&ctx->mcg_table_lock);
1021 for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p))
1022 ++count;
1023 mutex_unlock(&ctx->mcg_table_lock);
1024 if (!count)
1025 break;
1026
1027 msleep(1);
1028 } while (time_after(end, jiffies));
1029
1030 flush_workqueue(ctx->mcg_wq);
1031 if (destroy_wq)
1032 destroy_workqueue(ctx->mcg_wq);
1033
1034 mutex_lock(&ctx->mcg_table_lock);
1035 while ((p = rb_first(&ctx->mcg_table)) != NULL) {
1036 group = rb_entry(p, struct mcast_group, node);
1037 if (atomic_read(&group->refcount))
1038 mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group);
1039
1040 force_clean_group(group);
1041 }
1042 mutex_unlock(&ctx->mcg_table_lock);
1043
1044 if (!destroy_wq)
1045 ctx->flushing = 0;
1046}
1047
1048struct clean_work {
1049 struct work_struct work;
1050 struct mlx4_ib_demux_ctx *ctx;
1051 int destroy_wq;
1052};
1053
1054static void mcg_clean_task(struct work_struct *work)
1055{
1056 struct clean_work *cw = container_of(work, struct clean_work, work);
1057
1058 _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq);
1059 kfree(cw);
1060}
1061
1062void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
1063{
1064 struct clean_work *work;
1065
1066 if (destroy_wq) {
1067 _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq);
1068 return;
1069 }
1070
1071 work = kmalloc(sizeof *work, GFP_KERNEL);
1072 if (!work) {
1073 mcg_warn("failed allocating work for cleanup\n");
1074 return;
1075 }
1076
1077 work->ctx = ctx;
1078 work->destroy_wq = destroy_wq;
1079 INIT_WORK(&work->work, mcg_clean_task);
1080 queue_work(clean_wq, &work->work);
1081}
1082
1083static void build_leave_mad(struct mcast_req *req)
1084{
1085 struct ib_sa_mad *mad = &req->sa_mad;
1086
1087 mad->mad_hdr.method = IB_SA_METHOD_DELETE;
1088}
1089
1090
1091static void clear_pending_reqs(struct mcast_group *group, int vf)
1092{
1093 struct mcast_req *req, *tmp, *group_first = NULL;
1094 int clear;
1095 int pend = 0;
1096
1097 if (!list_empty(&group->pending_list))
1098 group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list);
1099
1100 list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) {
1101 clear = 1;
1102 if (group_first == req &&
1103 (group->state == MCAST_JOIN_SENT ||
1104 group->state == MCAST_LEAVE_SENT)) {
1105 clear = cancel_delayed_work(&group->timeout_work);
1106 pend = !clear;
1107 group->state = MCAST_IDLE;
1108 }
1109 if (clear) {
1110 --group->func[vf].num_pend_reqs;
1111 list_del(&req->group_list);
1112 list_del(&req->func_list);
1113 kfree(req);
1114 atomic_dec(&group->refcount);
1115 }
1116 }
1117
1118 if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) {
1119 mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n",
1120 list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs);
1121 }
1122}
1123
1124static int push_deleteing_req(struct mcast_group *group, int slave)
1125{
1126 struct mcast_req *req;
1127 struct mcast_req *pend_req;
1128
1129 if (!group->func[slave].join_state)
1130 return 0;
1131
1132 req = kzalloc(sizeof *req, GFP_KERNEL);
1133 if (!req) {
1134 mcg_warn_group(group, "failed allocation - may leave stale groups\n");
1135 return -ENOMEM;
1136 }
1137
1138 if (!list_empty(&group->func[slave].pending)) {
1139 pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list);
1140 if (pend_req->clean) {
1141 kfree(req);
1142 return 0;
1143 }
1144 }
1145
1146 req->clean = 1;
1147 req->func = slave;
1148 req->group = group;
1149 ++group->func[slave].num_pend_reqs;
1150 build_leave_mad(req);
1151 queue_req(req);
1152 return 0;
1153}
1154
1155void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave)
1156{
1157 struct mcast_group *group;
1158 struct rb_node *p;
1159
1160 mutex_lock(&ctx->mcg_table_lock);
1161 for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) {
1162 group = rb_entry(p, struct mcast_group, node);
1163 mutex_lock(&group->lock);
1164 if (atomic_read(&group->refcount)) {
1165 /* clear pending requests of this VF */
1166 clear_pending_reqs(group, slave);
1167 push_deleteing_req(group, slave);
1168 }
1169 mutex_unlock(&group->lock);
1170 }
1171 mutex_unlock(&ctx->mcg_table_lock);
1172}
1173
1174
1175int mlx4_ib_mcg_init(void)
1176{
1177 clean_wq = create_singlethread_workqueue("mlx4_ib_mcg");
1178 if (!clean_wq)
1179 return -ENOMEM;
1180
1181 return 0;
1182}
1183
1184void mlx4_ib_mcg_destroy(void)
1185{
1186 destroy_workqueue(clean_wq);
1187}
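One detail of the new mcg.c that is easy to miss is the selector handling in check_selector()/cmp_rec(): fields such as mtusel_mtu pack a 2-bit selector (greater-than, less-than, exactly, largest available, following the MCMemberRecord encoding) in bits 7:6 and a 6-bit value in bits 5:0, and a VF's join request is rejected when the group's existing value does not satisfy the selector. The fragment below is an illustrative stand-alone model of that rule, not the driver code; the comp_mask short-circuit of the real function is omitted.

/* Stand-alone illustration of the selector comparison used by check_selector().
 * Returns 0 when the group's value satisfies the request. */
#include <stdio.h>
#include <stdint.h>

enum { SA_GT = 0, SA_LT = 1, SA_EQ = 2 }; /* MCMemberRecord selector encoding */

static int selector_mismatch(uint8_t group_field, uint8_t req_field)
{
	uint8_t selector = req_field >> 6;
	uint8_t want = req_field & 0x3f;
	uint8_t have = group_field & 0x3f;

	switch (selector) {
	case SA_GT:
		return have <= want;   /* group value must be greater than requested */
	case SA_LT:
		return have >= want;   /* group value must be less than requested */
	case SA_EQ:
		return have != want;   /* group value must equal requested */
	default:
		return 0;              /* "largest available" etc.: accept */
	}
}

int main(void)
{
	/* e.g. MTU as a 6-bit code: request "MTU > 3" against a group at 4 */
	printf("GT 3 vs group 4 -> mismatch=%d\n",
	       selector_mismatch(4, (SA_GT << 6) | 3));
	printf("EQ 5 vs group 4 -> mismatch=%d\n",
	       selector_mismatch(4, (SA_EQ << 6) | 5));
	return 0;
}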
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index ac71d56ffc7e..01ba9f1692b1 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -37,9 +37,11 @@
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/idr.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/doorbell.h>
@@ -329,7 +331,14 @@ struct mlx4_ib_demux_ctx {
 	__be64 subnet_prefix;
 	__be64 guid_cache[128];
 	struct mlx4_ib_dev *dev;
+	/* the following lock protects both mcg_table and mcg_mgid0_list */
+	struct mutex		mcg_table_lock;
+	struct rb_root		mcg_table;
+	struct list_head	mcg_mgid0_list;
+	struct workqueue_struct	*mcg_wq;
 	struct mlx4_ib_demux_pv_ctx **tun;
+	atomic_t tid;
+	int    flushing; /* flushing the work queue */
 };
 
 struct mlx4_ib_sriov {
@@ -553,6 +562,19 @@ static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
 	return !!(ah->av.ib.g_slid & 0x80);
 }
 
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx);
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq);
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave);
+int mlx4_ib_mcg_init(void);
+void mlx4_ib_mcg_destroy(void);
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid);
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave,
+				  struct ib_sa_mad *sa_mad);
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+			      struct ib_sa_mad *mad);
+
 int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
 		   union ib_gid *gid);
 
@@ -561,4 +583,12 @@ void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
 
 void mlx4_ib_tunnels_update_work(struct work_struct *work);
 
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+			  enum ib_qp_type qpt, struct ib_wc *wc,
+			  struct ib_grh *grh, struct ib_mad *mad);
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+			 enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+			 u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad);
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
+
 #endif /* MLX4_IB_H */