aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMark Bloch <markb@mellanox.com>2016-05-19 10:12:36 -0400
committerDoug Ledford <dledford@redhat.com>2016-05-24 14:44:04 -0400
commitae43f8286730d1f2d241c34601df59f6d2286ac4 (patch)
treebeb629a68629c69511355426f91dde307908f2cb
parent735c631ae99d4b6cffc9e2774258329c526daa65 (diff)
IB/core: Add IP to GID netlink offload
There is an assumption that rdmacm is used only between nodes in the same IB subnet; this is why ARP resolution can be used to turn an IP address into a GID in rdmacm. When dealing with IB communication between subnets, this assumption is no longer valid: ARP resolution will give us the next-hop device address and not the peer node's device address. To solve this issue, we ask user space whether it can provide the GID of the peer node, and fail if it cannot. We add a sequence number to identify each request and fill in the GID upon an answer from userspace. Signed-off-by: Mark Bloch <markb@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
-rw-r--r--drivers/infiniband/core/addr.c217
-rw-r--r--drivers/infiniband/core/core_priv.h2
-rw-r--r--drivers/infiniband/core/device.c3
3 files changed, 198 insertions, 24 deletions
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 3a203ee08ced..1374541a4528 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -46,6 +46,10 @@
46#include <net/ip6_route.h> 46#include <net/ip6_route.h>
47#include <rdma/ib_addr.h> 47#include <rdma/ib_addr.h>
48#include <rdma/ib.h> 48#include <rdma/ib.h>
49#include <rdma/rdma_netlink.h>
50#include <net/netlink.h>
51
52#include "core_priv.h"
49 53
50struct addr_req { 54struct addr_req {
51 struct list_head list; 55 struct list_head list;
@@ -58,8 +62,11 @@ struct addr_req {
58 struct rdma_dev_addr *addr, void *context); 62 struct rdma_dev_addr *addr, void *context);
59 unsigned long timeout; 63 unsigned long timeout;
60 int status; 64 int status;
65 u32 seq;
61}; 66};
62 67
68static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);
69
63static void process_req(struct work_struct *work); 70static void process_req(struct work_struct *work);
64 71
65static DEFINE_MUTEX(lock); 72static DEFINE_MUTEX(lock);
@@ -67,6 +74,126 @@ static LIST_HEAD(req_list);
67static DECLARE_DELAYED_WORK(work, process_req); 74static DECLARE_DELAYED_WORK(work, process_req);
68static struct workqueue_struct *addr_wq; 75static struct workqueue_struct *addr_wq;
69 76
77static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
78 [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
79 .len = sizeof(struct rdma_nla_ls_gid)},
80};
81
82static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
83{
84 struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
85 int ret;
86
87 if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
88 return false;
89
90 ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
91 nlmsg_len(nlh), ib_nl_addr_policy);
92 if (ret)
93 return false;
94
95 return true;
96}
97
98static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
99{
100 const struct nlattr *head, *curr;
101 union ib_gid gid;
102 struct addr_req *req;
103 int len, rem;
104 int found = 0;
105
106 head = (const struct nlattr *)nlmsg_data(nlh);
107 len = nlmsg_len(nlh);
108
109 nla_for_each_attr(curr, head, len, rem) {
110 if (curr->nla_type == LS_NLA_TYPE_DGID)
111 memcpy(&gid, nla_data(curr), nla_len(curr));
112 }
113
114 mutex_lock(&lock);
115 list_for_each_entry(req, &req_list, list) {
116 if (nlh->nlmsg_seq != req->seq)
117 continue;
118 /* We set the DGID part, the rest was set earlier */
119 rdma_addr_set_dgid(req->addr, &gid);
120 req->status = 0;
121 found = 1;
122 break;
123 }
124 mutex_unlock(&lock);
125
126 if (!found)
127 pr_info("Couldn't find request waiting for DGID: %pI6\n",
128 &gid);
129}
130
131int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
132 struct netlink_callback *cb)
133{
134 const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
135
136 if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
137 !(NETLINK_CB(skb).sk) ||
138 !netlink_capable(skb, CAP_NET_ADMIN))
139 return -EPERM;
140
141 if (ib_nl_is_good_ip_resp(nlh))
142 ib_nl_process_good_ip_rsep(nlh);
143
144 return skb->len;
145}
146
147static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
148 const void *daddr,
149 u32 seq, u16 family)
150{
151 struct sk_buff *skb = NULL;
152 struct nlmsghdr *nlh;
153 struct rdma_ls_ip_resolve_header *header;
154 void *data;
155 size_t size;
156 int attrtype;
157 int len;
158
159 if (family == AF_INET) {
160 size = sizeof(struct in_addr);
161 attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
162 } else {
163 size = sizeof(struct in6_addr);
164 attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
165 }
166
167 len = nla_total_size(sizeof(size));
168 len += NLMSG_ALIGN(sizeof(*header));
169
170 skb = nlmsg_new(len, GFP_KERNEL);
171 if (!skb)
172 return -ENOMEM;
173
174 data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
175 RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
176 if (!data) {
177 nlmsg_free(skb);
178 return -ENODATA;
179 }
180
181 /* Construct the family header first */
182 header = (struct rdma_ls_ip_resolve_header *)
183 skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
184 header->ifindex = dev_addr->bound_dev_if;
185 nla_put(skb, attrtype, size, daddr);
186
187 /* Repair the nlmsg header length */
188 nlmsg_end(skb, nlh);
189 ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
190
191 /* Make the request retry, so when we get the response from userspace
192 * we will have something.
193 */
194 return -ENODATA;
195}
196
70int rdma_addr_size(struct sockaddr *addr) 197int rdma_addr_size(struct sockaddr *addr)
71{ 198{
72 switch (addr->sa_family) { 199 switch (addr->sa_family) {
@@ -195,6 +322,17 @@ static void queue_req(struct addr_req *req)
195 mutex_unlock(&lock); 322 mutex_unlock(&lock);
196} 323}
197 324
325static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
326 const void *daddr, u32 seq, u16 family)
327{
328 if (ibnl_chk_listeners(RDMA_NL_GROUP_LS))
329 return -EADDRNOTAVAIL;
330
331 /* We fill in what we can, the response will fill the rest */
332 rdma_copy_addr(dev_addr, dst->dev, NULL);
333 return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
334}
335
198static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, 336static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
199 const void *daddr) 337 const void *daddr)
200{ 338{
@@ -219,6 +357,39 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
219 return ret; 357 return ret;
220} 358}
221 359
360static bool has_gateway(struct dst_entry *dst, sa_family_t family)
361{
362 struct rtable *rt;
363 struct rt6_info *rt6;
364
365 if (family == AF_INET) {
366 rt = container_of(dst, struct rtable, dst);
367 return rt->rt_uses_gateway;
368 }
369
370 rt6 = container_of(dst, struct rt6_info, dst);
371 return rt6->rt6i_flags & RTF_GATEWAY;
372}
373
374static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
375 const struct sockaddr *dst_in, u32 seq)
376{
377 const struct sockaddr_in *dst_in4 =
378 (const struct sockaddr_in *)dst_in;
379 const struct sockaddr_in6 *dst_in6 =
380 (const struct sockaddr_in6 *)dst_in;
381 const void *daddr = (dst_in->sa_family == AF_INET) ?
382 (const void *)&dst_in4->sin_addr.s_addr :
383 (const void *)&dst_in6->sin6_addr;
384 sa_family_t family = dst_in->sa_family;
385
386 /* Gateway + ARPHRD_INFINIBAND -> IB router */
387 if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
388 return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
389 else
390 return dst_fetch_ha(dst, dev_addr, daddr);
391}
392
222static int addr4_resolve(struct sockaddr_in *src_in, 393static int addr4_resolve(struct sockaddr_in *src_in,
223 const struct sockaddr_in *dst_in, 394 const struct sockaddr_in *dst_in,
224 struct rdma_dev_addr *addr, 395 struct rdma_dev_addr *addr,
@@ -242,10 +413,11 @@ static int addr4_resolve(struct sockaddr_in *src_in,
242 src_in->sin_family = AF_INET; 413 src_in->sin_family = AF_INET;
243 src_in->sin_addr.s_addr = fl4.saddr; 414 src_in->sin_addr.s_addr = fl4.saddr;
244 415
245 /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't 416 /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
246 * routable) and we could set the network type accordingly. 417 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
418 * type accordingly.
247 */ 419 */
248 if (rt->rt_uses_gateway) 420 if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
249 addr->network = RDMA_NETWORK_IPV4; 421 addr->network = RDMA_NETWORK_IPV4;
250 422
251 addr->hoplimit = ip4_dst_hoplimit(&rt->dst); 423 addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
@@ -287,10 +459,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
287 src_in->sin6_addr = fl6.saddr; 459 src_in->sin6_addr = fl6.saddr;
288 } 460 }
289 461
290 /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't 462 /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
291 * routable) and we could set the network type accordingly. 463 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
464 * type accordingly.
292 */ 465 */
293 if (rt->rt6i_flags & RTF_GATEWAY) 466 if (rt->rt6i_flags & RTF_GATEWAY &&
467 ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
294 addr->network = RDMA_NETWORK_IPV6; 468 addr->network = RDMA_NETWORK_IPV6;
295 469
296 addr->hoplimit = ip6_dst_hoplimit(dst); 470 addr->hoplimit = ip6_dst_hoplimit(dst);
@@ -313,7 +487,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
313 487
314static int addr_resolve_neigh(struct dst_entry *dst, 488static int addr_resolve_neigh(struct dst_entry *dst,
315 const struct sockaddr *dst_in, 489 const struct sockaddr *dst_in,
316 struct rdma_dev_addr *addr) 490 struct rdma_dev_addr *addr,
491 u32 seq)
317{ 492{
318 if (dst->dev->flags & IFF_LOOPBACK) { 493 if (dst->dev->flags & IFF_LOOPBACK) {
319 int ret; 494 int ret;
@@ -327,17 +502,8 @@ static int addr_resolve_neigh(struct dst_entry *dst,
327 } 502 }
328 503
329 /* If the device doesn't do ARP internally */ 504 /* If the device doesn't do ARP internally */
330 if (!(dst->dev->flags & IFF_NOARP)) { 505 if (!(dst->dev->flags & IFF_NOARP))
331 const struct sockaddr_in *dst_in4 = 506 return fetch_ha(dst, addr, dst_in, seq);
332 (const struct sockaddr_in *)dst_in;
333 const struct sockaddr_in6 *dst_in6 =
334 (const struct sockaddr_in6 *)dst_in;
335
336 return dst_fetch_ha(dst, addr,
337 dst_in->sa_family == AF_INET ?
338 (const void *)&dst_in4->sin_addr.s_addr :
339 (const void *)&dst_in6->sin6_addr);
340 }
341 507
342 return rdma_copy_addr(addr, dst->dev, NULL); 508 return rdma_copy_addr(addr, dst->dev, NULL);
343} 509}
@@ -345,7 +511,8 @@ static int addr_resolve_neigh(struct dst_entry *dst,
345static int addr_resolve(struct sockaddr *src_in, 511static int addr_resolve(struct sockaddr *src_in,
346 const struct sockaddr *dst_in, 512 const struct sockaddr *dst_in,
347 struct rdma_dev_addr *addr, 513 struct rdma_dev_addr *addr,
348 bool resolve_neigh) 514 bool resolve_neigh,
515 u32 seq)
349{ 516{
350 struct net_device *ndev; 517 struct net_device *ndev;
351 struct dst_entry *dst; 518 struct dst_entry *dst;
@@ -362,7 +529,7 @@ static int addr_resolve(struct sockaddr *src_in,
362 return ret; 529 return ret;
363 530
364 if (resolve_neigh) 531 if (resolve_neigh)
365 ret = addr_resolve_neigh(&rt->dst, dst_in, addr); 532 ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
366 533
367 ndev = rt->dst.dev; 534 ndev = rt->dst.dev;
368 dev_hold(ndev); 535 dev_hold(ndev);
@@ -379,7 +546,7 @@ static int addr_resolve(struct sockaddr *src_in,
379 return ret; 546 return ret;
380 547
381 if (resolve_neigh) 548 if (resolve_neigh)
382 ret = addr_resolve_neigh(dst, dst_in, addr); 549 ret = addr_resolve_neigh(dst, dst_in, addr, seq);
383 550
384 ndev = dst->dev; 551 ndev = dst->dev;
385 dev_hold(ndev); 552 dev_hold(ndev);
@@ -408,7 +575,7 @@ static void process_req(struct work_struct *work)
408 src_in = (struct sockaddr *) &req->src_addr; 575 src_in = (struct sockaddr *) &req->src_addr;
409 dst_in = (struct sockaddr *) &req->dst_addr; 576 dst_in = (struct sockaddr *) &req->dst_addr;
410 req->status = addr_resolve(src_in, dst_in, req->addr, 577 req->status = addr_resolve(src_in, dst_in, req->addr,
411 true); 578 true, req->seq);
412 if (req->status && time_after_eq(jiffies, req->timeout)) 579 if (req->status && time_after_eq(jiffies, req->timeout))
413 req->status = -ETIMEDOUT; 580 req->status = -ETIMEDOUT;
414 else if (req->status == -ENODATA) 581 else if (req->status == -ENODATA)
@@ -467,8 +634,9 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
467 req->context = context; 634 req->context = context;
468 req->client = client; 635 req->client = client;
469 atomic_inc(&client->refcount); 636 atomic_inc(&client->refcount);
637 req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
470 638
471 req->status = addr_resolve(src_in, dst_in, addr, true); 639 req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
472 switch (req->status) { 640 switch (req->status) {
473 case 0: 641 case 0:
474 req->timeout = jiffies; 642 req->timeout = jiffies;
@@ -506,7 +674,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr,
506 src_in->sa_family = dst_addr->sa_family; 674 src_in->sa_family = dst_addr->sa_family;
507 } 675 }
508 676
509 return addr_resolve(src_in, dst_addr, addr, false); 677 return addr_resolve(src_in, dst_addr, addr, false, 0);
510} 678}
511EXPORT_SYMBOL(rdma_resolve_ip_route); 679EXPORT_SYMBOL(rdma_resolve_ip_route);
512 680
@@ -638,6 +806,7 @@ int addr_init(void)
638 806
639 register_netevent_notifier(&nb); 807 register_netevent_notifier(&nb);
640 rdma_addr_register_client(&self); 808 rdma_addr_register_client(&self);
809
641 return 0; 810 return 0;
642} 811}
643 812
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 1ff334587e7b..19d499dcab76 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -150,5 +150,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb,
150 struct netlink_callback *cb); 150 struct netlink_callback *cb);
151int ib_nl_handle_set_timeout(struct sk_buff *skb, 151int ib_nl_handle_set_timeout(struct sk_buff *skb,
152 struct netlink_callback *cb); 152 struct netlink_callback *cb);
153int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
154 struct netlink_callback *cb);
153 155
154#endif /* _CORE_PRIV_H */ 156#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8b4366b2e358..5516fb070344 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -962,6 +962,9 @@ static struct ibnl_client_cbs ibnl_ls_cb_table[] = {
962 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 962 [RDMA_NL_LS_OP_SET_TIMEOUT] = {
963 .dump = ib_nl_handle_set_timeout, 963 .dump = ib_nl_handle_set_timeout,
964 .module = THIS_MODULE }, 964 .module = THIS_MODULE },
965 [RDMA_NL_LS_OP_IP_RESOLVE] = {
966 .dump = ib_nl_handle_ip_res_resp,
967 .module = THIS_MODULE },
965}; 968};
966 969
967static int ib_add_ibnl_clients(void) 970static int ib_add_ibnl_clients(void)