aboutsummaryrefslogtreecommitdiffstats
path: root/net/rds
diff options
context:
space:
mode:
authorJason Gunthorpe <jgg@mellanox.com>2018-08-16 16:13:03 -0400
committerJason Gunthorpe <jgg@mellanox.com>2018-08-16 16:21:29 -0400
commit0a3173a5f09bc58a3638ecfd0a80bdbae55e123c (patch)
treed6c0bc84863cca54dfbde3b7463e5d49c82af9f1 /net/rds
parent92f4e77c85918eab5e5803d7e28ab89a7e6bd3a2 (diff)
parent5c60a7389d795e001c8748b458eb76e3a5b6008c (diff)
Merge branch 'linus/master' into rdma.git for-next
rdma.git merge resolution for the 4.19 merge window
Conflicts:
drivers/infiniband/core/rdma_core.c - Use the rdma code and revise with the new spelling for atomic_fetch_add_unless
drivers/nvme/host/rdma.c - Replace max_sge with max_send_sge in new blk code
drivers/nvme/target/rdma.c - Use the blk code and revise to use NULL for ib_post_recv when appropriate - Replace max_sge with max_recv_sge in new blk code
net/rds/ib_send.c - Use the net code and revise to use NULL for ib_post_recv when appropriate
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/Kconfig2
-rw-r--r--net/rds/Makefile1
-rw-r--r--net/rds/af_rds.c205
-rw-r--r--net/rds/bind.c138
-rw-r--r--net/rds/cong.c23
-rw-r--r--net/rds/connection.c283
-rw-r--r--net/rds/ib.c136
-rw-r--r--net/rds/ib.h53
-rw-r--r--net/rds/ib_cm.c320
-rw-r--r--net/rds/ib_frmr.c1
-rw-r--r--net/rds/ib_mr.h2
-rw-r--r--net/rds/ib_rdma.c26
-rw-r--r--net/rds/ib_recv.c33
-rw-r--r--net/rds/ib_send.c13
-rw-r--r--net/rds/loop.c7
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/rdma.c6
-rw-r--r--net/rds/rdma_transport.c95
-rw-r--r--net/rds/rdma_transport.h5
-rw-r--r--net/rds/rds.h88
-rw-r--r--net/rds/recv.c78
-rw-r--r--net/rds/send.c116
-rw-r--r--net/rds/tcp.c154
-rw-r--r--net/rds/tcp.h2
-rw-r--r--net/rds/tcp_connect.c68
-rw-r--r--net/rds/tcp_listen.c87
-rw-r--r--net/rds/tcp_recv.c9
-rw-r--r--net/rds/tcp_send.c4
-rw-r--r--net/rds/threads.c69
-rw-r--r--net/rds/transport.c16
30 files changed, 1600 insertions, 441 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index bffde4b46c5d..01b3bd6a3708 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -16,6 +16,7 @@ config RDS_RDMA
16config RDS_TCP 16config RDS_TCP
17 tristate "RDS over TCP" 17 tristate "RDS over TCP"
18 depends on RDS 18 depends on RDS
19 depends on IPV6 || !IPV6
19 ---help--- 20 ---help---
20 Allow RDS to use TCP as a transport. 21 Allow RDS to use TCP as a transport.
21 This transport does not support RDMA operations. 22 This transport does not support RDMA operations.
@@ -24,4 +25,3 @@ config RDS_DEBUG
24 bool "RDS debugging messages" 25 bool "RDS debugging messages"
25 depends on RDS 26 depends on RDS
26 default n 27 default n
27
diff --git a/net/rds/Makefile b/net/rds/Makefile
index b5d568bd479c..e647f9de104a 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -15,4 +15,3 @@ rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
15 tcp_send.o tcp_stats.o 15 tcp_send.o tcp_stats.o
16 16
17ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG 17ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG
18
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index ab751a150f70..65387e1e6964 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/gfp.h> 36#include <linux/gfp.h>
37#include <linux/in.h> 37#include <linux/in.h>
38#include <linux/ipv6.h>
38#include <linux/poll.h> 39#include <linux/poll.h>
39#include <net/sock.h> 40#include <net/sock.h>
40 41
@@ -113,26 +114,82 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
113static int rds_getname(struct socket *sock, struct sockaddr *uaddr, 114static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
114 int peer) 115 int peer)
115{ 116{
116 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
117 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 117 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
118 118 struct sockaddr_in6 *sin6;
119 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 119 struct sockaddr_in *sin;
120 int uaddr_len;
120 121
121 /* racey, don't care */ 122 /* racey, don't care */
122 if (peer) { 123 if (peer) {
123 if (!rs->rs_conn_addr) 124 if (ipv6_addr_any(&rs->rs_conn_addr))
124 return -ENOTCONN; 125 return -ENOTCONN;
125 126
126 sin->sin_port = rs->rs_conn_port; 127 if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
127 sin->sin_addr.s_addr = rs->rs_conn_addr; 128 sin = (struct sockaddr_in *)uaddr;
129 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
130 sin->sin_family = AF_INET;
131 sin->sin_port = rs->rs_conn_port;
132 sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
133 uaddr_len = sizeof(*sin);
134 } else {
135 sin6 = (struct sockaddr_in6 *)uaddr;
136 sin6->sin6_family = AF_INET6;
137 sin6->sin6_port = rs->rs_conn_port;
138 sin6->sin6_addr = rs->rs_conn_addr;
139 sin6->sin6_flowinfo = 0;
140 /* scope_id is the same as in the bound address. */
141 sin6->sin6_scope_id = rs->rs_bound_scope_id;
142 uaddr_len = sizeof(*sin6);
143 }
128 } else { 144 } else {
129 sin->sin_port = rs->rs_bound_port; 145 /* If socket is not yet bound and the socket is connected,
130 sin->sin_addr.s_addr = rs->rs_bound_addr; 146 * set the return address family to be the same as the
147 * connected address, but with 0 address value. If it is not
148 * connected, set the family to be AF_UNSPEC (value 0) and
149 * the address size to be that of an IPv4 address.
150 */
151 if (ipv6_addr_any(&rs->rs_bound_addr)) {
152 if (ipv6_addr_any(&rs->rs_conn_addr)) {
153 sin = (struct sockaddr_in *)uaddr;
154 memset(sin, 0, sizeof(*sin));
155 sin->sin_family = AF_UNSPEC;
156 return sizeof(*sin);
157 }
158
159#if IS_ENABLED(CONFIG_IPV6)
160 if (!(ipv6_addr_type(&rs->rs_conn_addr) &
161 IPV6_ADDR_MAPPED)) {
162 sin6 = (struct sockaddr_in6 *)uaddr;
163 memset(sin6, 0, sizeof(*sin6));
164 sin6->sin6_family = AF_INET6;
165 return sizeof(*sin6);
166 }
167#endif
168
169 sin = (struct sockaddr_in *)uaddr;
170 memset(sin, 0, sizeof(*sin));
171 sin->sin_family = AF_INET;
172 return sizeof(*sin);
173 }
174 if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
175 sin = (struct sockaddr_in *)uaddr;
176 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
177 sin->sin_family = AF_INET;
178 sin->sin_port = rs->rs_bound_port;
179 sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
180 uaddr_len = sizeof(*sin);
181 } else {
182 sin6 = (struct sockaddr_in6 *)uaddr;
183 sin6->sin6_family = AF_INET6;
184 sin6->sin6_port = rs->rs_bound_port;
185 sin6->sin6_addr = rs->rs_bound_addr;
186 sin6->sin6_flowinfo = 0;
187 sin6->sin6_scope_id = rs->rs_bound_scope_id;
188 uaddr_len = sizeof(*sin6);
189 }
131 } 190 }
132 191
133 sin->sin_family = AF_INET; 192 return uaddr_len;
134
135 return sizeof(*sin);
136} 193}
137 194
138/* 195/*
@@ -203,11 +260,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
203static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, 260static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
204 int len) 261 int len)
205{ 262{
263 struct sockaddr_in6 sin6;
206 struct sockaddr_in sin; 264 struct sockaddr_in sin;
207 int ret = 0; 265 int ret = 0;
208 266
209 /* racing with another thread binding seems ok here */ 267 /* racing with another thread binding seems ok here */
210 if (rs->rs_bound_addr == 0) { 268 if (ipv6_addr_any(&rs->rs_bound_addr)) {
211 ret = -ENOTCONN; /* XXX not a great errno */ 269 ret = -ENOTCONN; /* XXX not a great errno */
212 goto out; 270 goto out;
213 } 271 }
@@ -215,14 +273,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
215 if (len < sizeof(struct sockaddr_in)) { 273 if (len < sizeof(struct sockaddr_in)) {
216 ret = -EINVAL; 274 ret = -EINVAL;
217 goto out; 275 goto out;
276 } else if (len < sizeof(struct sockaddr_in6)) {
277 /* Assume IPv4 */
278 if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
279 ret = -EFAULT;
280 goto out;
281 }
282 ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
283 sin6.sin6_port = sin.sin_port;
284 } else {
285 if (copy_from_user(&sin6, optval,
286 sizeof(struct sockaddr_in6))) {
287 ret = -EFAULT;
288 goto out;
289 }
218 } 290 }
219 291
220 if (copy_from_user(&sin, optval, sizeof(sin))) { 292 rds_send_drop_to(rs, &sin6);
221 ret = -EFAULT;
222 goto out;
223 }
224
225 rds_send_drop_to(rs, &sin);
226out: 293out:
227 return ret; 294 return ret;
228} 295}
@@ -435,31 +502,91 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
435 int addr_len, int flags) 502 int addr_len, int flags)
436{ 503{
437 struct sock *sk = sock->sk; 504 struct sock *sk = sock->sk;
438 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 505 struct sockaddr_in *sin;
439 struct rds_sock *rs = rds_sk_to_rs(sk); 506 struct rds_sock *rs = rds_sk_to_rs(sk);
440 int ret = 0; 507 int ret = 0;
441 508
442 lock_sock(sk); 509 lock_sock(sk);
443 510
444 if (addr_len != sizeof(struct sockaddr_in)) { 511 switch (uaddr->sa_family) {
445 ret = -EINVAL; 512 case AF_INET:
446 goto out; 513 sin = (struct sockaddr_in *)uaddr;
447 } 514 if (addr_len < sizeof(struct sockaddr_in)) {
515 ret = -EINVAL;
516 break;
517 }
518 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
519 ret = -EDESTADDRREQ;
520 break;
521 }
522 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
523 sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
524 ret = -EINVAL;
525 break;
526 }
527 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
528 rs->rs_conn_port = sin->sin_port;
529 break;
448 530
449 if (sin->sin_family != AF_INET) { 531#if IS_ENABLED(CONFIG_IPV6)
450 ret = -EAFNOSUPPORT; 532 case AF_INET6: {
451 goto out; 533 struct sockaddr_in6 *sin6;
452 } 534 int addr_type;
453 535
454 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 536 sin6 = (struct sockaddr_in6 *)uaddr;
455 ret = -EDESTADDRREQ; 537 if (addr_len < sizeof(struct sockaddr_in6)) {
456 goto out; 538 ret = -EINVAL;
539 break;
540 }
541 addr_type = ipv6_addr_type(&sin6->sin6_addr);
542 if (!(addr_type & IPV6_ADDR_UNICAST)) {
543 __be32 addr4;
544
545 if (!(addr_type & IPV6_ADDR_MAPPED)) {
546 ret = -EPROTOTYPE;
547 break;
548 }
549
550 /* It is a mapped address. Need to do some sanity
551 * checks.
552 */
553 addr4 = sin6->sin6_addr.s6_addr32[3];
554 if (addr4 == htonl(INADDR_ANY) ||
555 addr4 == htonl(INADDR_BROADCAST) ||
556 IN_MULTICAST(ntohl(addr4))) {
557 ret = -EPROTOTYPE;
558 break;
559 }
560 }
561
562 if (addr_type & IPV6_ADDR_LINKLOCAL) {
563 /* If socket is already bound to a link local address,
564 * the peer address must be on the same link.
565 */
566 if (sin6->sin6_scope_id == 0 ||
567 (!ipv6_addr_any(&rs->rs_bound_addr) &&
568 rs->rs_bound_scope_id &&
569 sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
570 ret = -EINVAL;
571 break;
572 }
573 /* Remember the connected address scope ID. It will
574 * be checked against the binding local address when
575 * the socket is bound.
576 */
577 rs->rs_bound_scope_id = sin6->sin6_scope_id;
578 }
579 rs->rs_conn_addr = sin6->sin6_addr;
580 rs->rs_conn_port = sin6->sin6_port;
581 break;
457 } 582 }
583#endif
458 584
459 rs->rs_conn_addr = sin->sin_addr.s_addr; 585 default:
460 rs->rs_conn_port = sin->sin_port; 586 ret = -EAFNOSUPPORT;
587 break;
588 }
461 589
462out:
463 release_sock(sk); 590 release_sock(sk);
464 return ret; 591 return ret;
465} 592}
@@ -578,8 +705,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
578 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 705 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
579 total++; 706 total++;
580 if (total <= len) 707 if (total <= len)
581 rds_inc_info_copy(inc, iter, inc->i_saddr, 708 rds_inc_info_copy(inc, iter,
582 rs->rs_bound_addr, 1); 709 inc->i_saddr.s6_addr32[3],
710 rs->rs_bound_addr_v4,
711 1);
583 } 712 }
584 713
585 read_unlock(&rs->rs_recv_lock); 714 read_unlock(&rs->rs_recv_lock);
@@ -608,8 +737,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
608 list_for_each_entry(rs, &rds_sock_list, rs_item) { 737 list_for_each_entry(rs, &rds_sock_list, rs_item) {
609 sinfo.sndbuf = rds_sk_sndbuf(rs); 738 sinfo.sndbuf = rds_sk_sndbuf(rs);
610 sinfo.rcvbuf = rds_sk_rcvbuf(rs); 739 sinfo.rcvbuf = rds_sk_rcvbuf(rs);
611 sinfo.bound_addr = rs->rs_bound_addr; 740 sinfo.bound_addr = rs->rs_bound_addr_v4;
612 sinfo.connected_addr = rs->rs_conn_addr; 741 sinfo.connected_addr = rs->rs_conn_addr_v4;
613 sinfo.bound_port = rs->rs_bound_port; 742 sinfo.bound_port = rs->rs_bound_port;
614 sinfo.connected_port = rs->rs_conn_port; 743 sinfo.connected_port = rs->rs_conn_port;
615 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); 744 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5aa3a64aa4f0..3ab55784b637 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <net/sock.h> 34#include <net/sock.h>
35#include <linux/in.h> 35#include <linux/in.h>
36#include <linux/ipv6.h>
36#include <linux/if_arp.h> 37#include <linux/if_arp.h>
37#include <linux/jhash.h> 38#include <linux/jhash.h>
38#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
@@ -42,42 +43,58 @@ static struct rhashtable bind_hash_table;
42 43
43static const struct rhashtable_params ht_parms = { 44static const struct rhashtable_params ht_parms = {
44 .nelem_hint = 768, 45 .nelem_hint = 768,
45 .key_len = sizeof(u64), 46 .key_len = RDS_BOUND_KEY_LEN,
46 .key_offset = offsetof(struct rds_sock, rs_bound_key), 47 .key_offset = offsetof(struct rds_sock, rs_bound_key),
47 .head_offset = offsetof(struct rds_sock, rs_bound_node), 48 .head_offset = offsetof(struct rds_sock, rs_bound_node),
48 .max_size = 16384, 49 .max_size = 16384,
49 .min_size = 1024, 50 .min_size = 1024,
50}; 51};
51 52
53/* Create a key for the bind hash table manipulation. Port is in network byte
54 * order.
55 */
56static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
57 __be16 port, __u32 scope_id)
58{
59 memcpy(key, addr, sizeof(*addr));
60 key += sizeof(*addr);
61 memcpy(key, &port, sizeof(port));
62 key += sizeof(port);
63 memcpy(key, &scope_id, sizeof(scope_id));
64}
65
52/* 66/*
53 * Return the rds_sock bound at the given local address. 67 * Return the rds_sock bound at the given local address.
54 * 68 *
55 * The rx path can race with rds_release. We notice if rds_release() has 69 * The rx path can race with rds_release. We notice if rds_release() has
56 * marked this socket and don't return a rs ref to the rx path. 70 * marked this socket and don't return a rs ref to the rx path.
57 */ 71 */
58struct rds_sock *rds_find_bound(__be32 addr, __be16 port) 72struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
73 __u32 scope_id)
59{ 74{
60 u64 key = ((u64)addr << 32) | port; 75 u8 key[RDS_BOUND_KEY_LEN];
61 struct rds_sock *rs; 76 struct rds_sock *rs;
62 77
63 rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms); 78 __rds_create_bind_key(key, addr, port, scope_id);
79 rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
64 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) 80 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
65 rds_sock_addref(rs); 81 rds_sock_addref(rs);
66 else 82 else
67 rs = NULL; 83 rs = NULL;
68 84
69 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, 85 rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
70 ntohs(port)); 86 ntohs(port));
71 87
72 return rs; 88 return rs;
73} 89}
74 90
75/* returns -ve errno or +ve port */ 91/* returns -ve errno or +ve port */
76static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) 92static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
93 __be16 *port, __u32 scope_id)
77{ 94{
78 int ret = -EADDRINUSE; 95 int ret = -EADDRINUSE;
79 u16 rover, last; 96 u16 rover, last;
80 u64 key; 97 u8 key[RDS_BOUND_KEY_LEN];
81 98
82 if (*port != 0) { 99 if (*port != 0) {
83 rover = be16_to_cpu(*port); 100 rover = be16_to_cpu(*port);
@@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
95 112
96 if (rover == RDS_FLAG_PROBE_PORT) 113 if (rover == RDS_FLAG_PROBE_PORT)
97 continue; 114 continue;
98 key = ((u64)addr << 32) | cpu_to_be16(rover); 115 __rds_create_bind_key(key, addr, cpu_to_be16(rover),
99 if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) 116 scope_id);
117 if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
100 continue; 118 continue;
101 119
102 rs->rs_bound_key = key; 120 memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
103 rs->rs_bound_addr = addr; 121 rs->rs_bound_addr = *addr;
104 net_get_random_once(&rs->rs_hash_initval, 122 net_get_random_once(&rs->rs_hash_initval,
105 sizeof(rs->rs_hash_initval)); 123 sizeof(rs->rs_hash_initval));
106 rs->rs_bound_port = cpu_to_be16(rover); 124 rs->rs_bound_port = cpu_to_be16(rover);
@@ -109,12 +127,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
109 if (!rhashtable_insert_fast(&bind_hash_table, 127 if (!rhashtable_insert_fast(&bind_hash_table,
110 &rs->rs_bound_node, ht_parms)) { 128 &rs->rs_bound_node, ht_parms)) {
111 *port = rs->rs_bound_port; 129 *port = rs->rs_bound_port;
130 rs->rs_bound_scope_id = scope_id;
112 ret = 0; 131 ret = 0;
113 rdsdebug("rs %p binding to %pI4:%d\n", 132 rdsdebug("rs %p binding to %pI6c:%d\n",
114 rs, &addr, (int)ntohs(*port)); 133 rs, addr, (int)ntohs(*port));
115 break; 134 break;
116 } else { 135 } else {
117 rs->rs_bound_addr = 0; 136 rs->rs_bound_addr = in6addr_any;
118 rds_sock_put(rs); 137 rds_sock_put(rs);
119 ret = -ENOMEM; 138 ret = -ENOMEM;
120 break; 139 break;
@@ -127,44 +146,103 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
127void rds_remove_bound(struct rds_sock *rs) 146void rds_remove_bound(struct rds_sock *rs)
128{ 147{
129 148
130 if (!rs->rs_bound_addr) 149 if (ipv6_addr_any(&rs->rs_bound_addr))
131 return; 150 return;
132 151
133 rdsdebug("rs %p unbinding from %pI4:%d\n", 152 rdsdebug("rs %p unbinding from %pI6c:%d\n",
134 rs, &rs->rs_bound_addr, 153 rs, &rs->rs_bound_addr,
135 ntohs(rs->rs_bound_port)); 154 ntohs(rs->rs_bound_port));
136 155
137 rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); 156 rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
138 rds_sock_put(rs); 157 rds_sock_put(rs);
139 rs->rs_bound_addr = 0; 158 rs->rs_bound_addr = in6addr_any;
140} 159}
141 160
142int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 161int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
143{ 162{
144 struct sock *sk = sock->sk; 163 struct sock *sk = sock->sk;
145 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
146 struct rds_sock *rs = rds_sk_to_rs(sk); 164 struct rds_sock *rs = rds_sk_to_rs(sk);
165 struct in6_addr v6addr, *binding_addr;
147 struct rds_transport *trans; 166 struct rds_transport *trans;
167 __u32 scope_id = 0;
148 int ret = 0; 168 int ret = 0;
169 __be16 port;
170
171 /* We allow an RDS socket to be bound to either IPv4 or IPv6
172 * address.
173 */
174 if (uaddr->sa_family == AF_INET) {
175 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
176
177 if (addr_len < sizeof(struct sockaddr_in) ||
178 sin->sin_addr.s_addr == htonl(INADDR_ANY) ||
179 sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
180 IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
181 return -EINVAL;
182 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
183 binding_addr = &v6addr;
184 port = sin->sin_port;
185#if IS_ENABLED(CONFIG_IPV6)
186 } else if (uaddr->sa_family == AF_INET6) {
187 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
188 int addr_type;
189
190 if (addr_len < sizeof(struct sockaddr_in6))
191 return -EINVAL;
192 addr_type = ipv6_addr_type(&sin6->sin6_addr);
193 if (!(addr_type & IPV6_ADDR_UNICAST)) {
194 __be32 addr4;
149 195
196 if (!(addr_type & IPV6_ADDR_MAPPED))
197 return -EINVAL;
198
199 /* It is a mapped address. Need to do some sanity
200 * checks.
201 */
202 addr4 = sin6->sin6_addr.s6_addr32[3];
203 if (addr4 == htonl(INADDR_ANY) ||
204 addr4 == htonl(INADDR_BROADCAST) ||
205 IN_MULTICAST(ntohl(addr4)))
206 return -EINVAL;
207 }
208 /* The scope ID must be specified for link local address. */
209 if (addr_type & IPV6_ADDR_LINKLOCAL) {
210 if (sin6->sin6_scope_id == 0)
211 return -EINVAL;
212 scope_id = sin6->sin6_scope_id;
213 }
214 binding_addr = &sin6->sin6_addr;
215 port = sin6->sin6_port;
216#endif
217 } else {
218 return -EINVAL;
219 }
150 lock_sock(sk); 220 lock_sock(sk);
151 221
152 if (addr_len != sizeof(struct sockaddr_in) || 222 /* RDS socket does not allow re-binding. */
153 sin->sin_family != AF_INET || 223 if (!ipv6_addr_any(&rs->rs_bound_addr)) {
154 rs->rs_bound_addr || 224 ret = -EINVAL;
155 sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 225 goto out;
226 }
227 /* Socket is connected. The binding address should have the same
228 * scope ID as the connected address, except the case when one is
229 * non-link local address (scope_id is 0).
230 */
231 if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id &&
232 rs->rs_bound_scope_id &&
233 scope_id != rs->rs_bound_scope_id) {
156 ret = -EINVAL; 234 ret = -EINVAL;
157 goto out; 235 goto out;
158 } 236 }
159 237
160 ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); 238 ret = rds_add_bound(rs, binding_addr, &port, scope_id);
161 if (ret) 239 if (ret)
162 goto out; 240 goto out;
163 241
164 if (rs->rs_transport) { /* previously bound */ 242 if (rs->rs_transport) { /* previously bound */
165 trans = rs->rs_transport; 243 trans = rs->rs_transport;
166 if (trans->laddr_check(sock_net(sock->sk), 244 if (trans->laddr_check(sock_net(sock->sk),
167 sin->sin_addr.s_addr) != 0) { 245 binding_addr, scope_id) != 0) {
168 ret = -ENOPROTOOPT; 246 ret = -ENOPROTOOPT;
169 rds_remove_bound(rs); 247 rds_remove_bound(rs);
170 } else { 248 } else {
@@ -172,13 +250,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
172 } 250 }
173 goto out; 251 goto out;
174 } 252 }
175 trans = rds_trans_get_preferred(sock_net(sock->sk), 253 trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
176 sin->sin_addr.s_addr); 254 scope_id);
177 if (!trans) { 255 if (!trans) {
178 ret = -EADDRNOTAVAIL; 256 ret = -EADDRNOTAVAIL;
179 rds_remove_bound(rs); 257 rds_remove_bound(rs);
180 pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", 258 pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
181 __func__, &sin->sin_addr.s_addr); 259 __func__, binding_addr);
182 goto out; 260 goto out;
183 } 261 }
184 262
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 63da9d2f142d..ccdff09a79c8 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2007 Oracle. All rights reserved. 2 * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock);
101static DEFINE_SPINLOCK(rds_cong_lock); 101static DEFINE_SPINLOCK(rds_cong_lock);
102static struct rb_root rds_cong_tree = RB_ROOT; 102static struct rb_root rds_cong_tree = RB_ROOT;
103 103
104static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, 104static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
105 struct rds_cong_map *insert) 105 struct rds_cong_map *insert)
106{ 106{
107 struct rb_node **p = &rds_cong_tree.rb_node; 107 struct rb_node **p = &rds_cong_tree.rb_node;
@@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
109 struct rds_cong_map *map; 109 struct rds_cong_map *map;
110 110
111 while (*p) { 111 while (*p) {
112 int diff;
113
112 parent = *p; 114 parent = *p;
113 map = rb_entry(parent, struct rds_cong_map, m_rb_node); 115 map = rb_entry(parent, struct rds_cong_map, m_rb_node);
114 116
115 if (addr < map->m_addr) 117 diff = rds_addr_cmp(addr, &map->m_addr);
118 if (diff < 0)
116 p = &(*p)->rb_left; 119 p = &(*p)->rb_left;
117 else if (addr > map->m_addr) 120 else if (diff > 0)
118 p = &(*p)->rb_right; 121 p = &(*p)->rb_right;
119 else 122 else
120 return map; 123 return map;
@@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
132 * these bitmaps in the process getting pointers to them. The bitmaps are only 135 * these bitmaps in the process getting pointers to them. The bitmaps are only
133 * ever freed as the module is removed after all connections have been freed. 136 * ever freed as the module is removed after all connections have been freed.
134 */ 137 */
135static struct rds_cong_map *rds_cong_from_addr(__be32 addr) 138static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
136{ 139{
137 struct rds_cong_map *map; 140 struct rds_cong_map *map;
138 struct rds_cong_map *ret = NULL; 141 struct rds_cong_map *ret = NULL;
@@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
144 if (!map) 147 if (!map)
145 return NULL; 148 return NULL;
146 149
147 map->m_addr = addr; 150 map->m_addr = *addr;
148 init_waitqueue_head(&map->m_waitq); 151 init_waitqueue_head(&map->m_waitq);
149 INIT_LIST_HEAD(&map->m_conn_list); 152 INIT_LIST_HEAD(&map->m_conn_list);
150 153
@@ -171,7 +174,7 @@ out:
171 kfree(map); 174 kfree(map);
172 } 175 }
173 176
174 rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); 177 rdsdebug("map %p for addr %pI6c\n", ret, addr);
175 178
176 return ret; 179 return ret;
177} 180}
@@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn)
202 205
203int rds_cong_get_maps(struct rds_connection *conn) 206int rds_cong_get_maps(struct rds_connection *conn)
204{ 207{
205 conn->c_lcong = rds_cong_from_addr(conn->c_laddr); 208 conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
206 conn->c_fcong = rds_cong_from_addr(conn->c_faddr); 209 conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
207 210
208 if (!(conn->c_lcong && conn->c_fcong)) 211 if (!(conn->c_lcong && conn->c_fcong))
209 return -ENOMEM; 212 return -ENOMEM;
@@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs)
353 356
354 /* update congestion map for now-closed port */ 357 /* update congestion map for now-closed port */
355 spin_lock_irqsave(&rds_cong_lock, flags); 358 spin_lock_irqsave(&rds_cong_lock, flags);
356 map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); 359 map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
357 spin_unlock_irqrestore(&rds_cong_lock, flags); 360 spin_unlock_irqrestore(&rds_cong_lock, flags);
358 361
359 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { 362 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index cfb05953b0e5..3bd2f4a5a30d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,9 @@
34#include <linux/list.h> 34#include <linux/list.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/export.h> 36#include <linux/export.h>
37#include <net/inet_hashtables.h> 37#include <net/ipv6.h>
38#include <net/inet6_hashtables.h>
39#include <net/addrconf.h>
38 40
39#include "rds.h" 41#include "rds.h"
40#include "loop.h" 42#include "loop.h"
@@ -49,18 +51,25 @@ static unsigned long rds_conn_count;
49static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; 51static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
50static struct kmem_cache *rds_conn_slab; 52static struct kmem_cache *rds_conn_slab;
51 53
52static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) 54static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
55 const struct in6_addr *faddr)
53{ 56{
57 static u32 rds6_hash_secret __read_mostly;
54 static u32 rds_hash_secret __read_mostly; 58 static u32 rds_hash_secret __read_mostly;
55 59
56 unsigned long hash; 60 u32 lhash, fhash, hash;
57 61
58 net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); 62 net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
63 net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
64
65 lhash = (__force u32)laddr->s6_addr32[3];
66#if IS_ENABLED(CONFIG_IPV6)
67 fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
68#else
69 fhash = (__force u32)faddr->s6_addr32[3];
70#endif
71 hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
59 72
60 /* Pass NULL, don't need struct net for hash */
61 hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
62 be32_to_cpu(faddr), 0,
63 rds_hash_secret);
64 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; 73 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
65} 74}
66 75
@@ -72,20 +81,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
72/* rcu read lock must be held or the connection spinlock */ 81/* rcu read lock must be held or the connection spinlock */
73static struct rds_connection *rds_conn_lookup(struct net *net, 82static struct rds_connection *rds_conn_lookup(struct net *net,
74 struct hlist_head *head, 83 struct hlist_head *head,
75 __be32 laddr, __be32 faddr, 84 const struct in6_addr *laddr,
76 struct rds_transport *trans) 85 const struct in6_addr *faddr,
86 struct rds_transport *trans,
87 int dev_if)
77{ 88{
78 struct rds_connection *conn, *ret = NULL; 89 struct rds_connection *conn, *ret = NULL;
79 90
80 hlist_for_each_entry_rcu(conn, head, c_hash_node) { 91 hlist_for_each_entry_rcu(conn, head, c_hash_node) {
81 if (conn->c_faddr == faddr && conn->c_laddr == laddr && 92 if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
82 conn->c_trans == trans && net == rds_conn_net(conn)) { 93 ipv6_addr_equal(&conn->c_laddr, laddr) &&
94 conn->c_trans == trans &&
95 net == rds_conn_net(conn) &&
96 conn->c_dev_if == dev_if) {
83 ret = conn; 97 ret = conn;
84 break; 98 break;
85 } 99 }
86 } 100 }
87 rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, 101 rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
88 &laddr, &faddr); 102 laddr, faddr);
89 return ret; 103 return ret;
90} 104}
91 105
@@ -99,8 +113,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp)
99{ 113{
100 struct rds_connection *conn = cp->cp_conn; 114 struct rds_connection *conn = cp->cp_conn;
101 115
102 rdsdebug("connection %pI4 to %pI4 reset\n", 116 rdsdebug("connection %pI6c to %pI6c reset\n",
103 &conn->c_laddr, &conn->c_faddr); 117 &conn->c_laddr, &conn->c_faddr);
104 118
105 rds_stats_inc(s_conn_reset); 119 rds_stats_inc(s_conn_reset);
106 rds_send_path_reset(cp); 120 rds_send_path_reset(cp);
@@ -142,9 +156,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
142 * are torn down as the module is removed, if ever. 156 * are torn down as the module is removed, if ever.
143 */ 157 */
144static struct rds_connection *__rds_conn_create(struct net *net, 158static struct rds_connection *__rds_conn_create(struct net *net,
145 __be32 laddr, __be32 faddr, 159 const struct in6_addr *laddr,
146 struct rds_transport *trans, gfp_t gfp, 160 const struct in6_addr *faddr,
147 int is_outgoing) 161 struct rds_transport *trans,
162 gfp_t gfp,
163 int is_outgoing,
164 int dev_if)
148{ 165{
149 struct rds_connection *conn, *parent = NULL; 166 struct rds_connection *conn, *parent = NULL;
150 struct hlist_head *head = rds_conn_bucket(laddr, faddr); 167 struct hlist_head *head = rds_conn_bucket(laddr, faddr);
@@ -154,9 +171,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
154 int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); 171 int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
155 172
156 rcu_read_lock(); 173 rcu_read_lock();
157 conn = rds_conn_lookup(net, head, laddr, faddr, trans); 174 conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
158 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && 175 if (conn &&
159 laddr == faddr && !is_outgoing) { 176 conn->c_loopback &&
177 conn->c_trans != &rds_loop_transport &&
178 ipv6_addr_equal(laddr, faddr) &&
179 !is_outgoing) {
160 /* This is a looped back IB connection, and we're 180 /* This is a looped back IB connection, and we're
161 * called by the code handling the incoming connect. 181 * called by the code handling the incoming connect.
162 * We need a second connection object into which we 182 * We need a second connection object into which we
@@ -181,8 +201,22 @@ static struct rds_connection *__rds_conn_create(struct net *net,
181 } 201 }
182 202
183 INIT_HLIST_NODE(&conn->c_hash_node); 203 INIT_HLIST_NODE(&conn->c_hash_node);
184 conn->c_laddr = laddr; 204 conn->c_laddr = *laddr;
185 conn->c_faddr = faddr; 205 conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
206 conn->c_faddr = *faddr;
207 conn->c_dev_if = dev_if;
208
209#if IS_ENABLED(CONFIG_IPV6)
210 /* If the local address is link local, set c_bound_if to be the
211 * index used for this connection. Otherwise, set it to 0 as
212 * the socket is not bound to an interface. c_bound_if is used
213 * to look up a socket when a packet is received
214 */
215 if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
216 conn->c_bound_if = dev_if;
217 else
218#endif
219 conn->c_bound_if = 0;
186 220
187 rds_conn_net_set(conn, net); 221 rds_conn_net_set(conn, net);
188 222
@@ -199,7 +233,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
199 * can bind to the destination address then we'd rather the messages 233 * can bind to the destination address then we'd rather the messages
200 * flow through loopback rather than either transport. 234 * flow through loopback rather than either transport.
201 */ 235 */
202 loop_trans = rds_trans_get_preferred(net, faddr); 236 loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
203 if (loop_trans) { 237 if (loop_trans) {
204 rds_trans_put(loop_trans); 238 rds_trans_put(loop_trans);
205 conn->c_loopback = 1; 239 conn->c_loopback = 1;
@@ -233,10 +267,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
233 goto out; 267 goto out;
234 } 268 }
235 269
236 rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", 270 rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
237 conn, &laddr, &faddr, 271 conn, laddr, faddr,
238 strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : 272 strnlen(trans->t_name, sizeof(trans->t_name)) ?
239 "[unknown]", is_outgoing ? "(outgoing)" : ""); 273 trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
240 274
241 /* 275 /*
242 * Since we ran without holding the conn lock, someone could 276 * Since we ran without holding the conn lock, someone could
@@ -262,7 +296,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
262 /* Creating normal conn */ 296 /* Creating normal conn */
263 struct rds_connection *found; 297 struct rds_connection *found;
264 298
265 found = rds_conn_lookup(net, head, laddr, faddr, trans); 299 found = rds_conn_lookup(net, head, laddr, faddr, trans,
300 dev_if);
266 if (found) { 301 if (found) {
267 struct rds_conn_path *cp; 302 struct rds_conn_path *cp;
268 int i; 303 int i;
@@ -295,18 +330,22 @@ out:
295} 330}
296 331
297struct rds_connection *rds_conn_create(struct net *net, 332struct rds_connection *rds_conn_create(struct net *net,
298 __be32 laddr, __be32 faddr, 333 const struct in6_addr *laddr,
299 struct rds_transport *trans, gfp_t gfp) 334 const struct in6_addr *faddr,
335 struct rds_transport *trans, gfp_t gfp,
336 int dev_if)
300{ 337{
301 return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); 338 return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
302} 339}
303EXPORT_SYMBOL_GPL(rds_conn_create); 340EXPORT_SYMBOL_GPL(rds_conn_create);
304 341
305struct rds_connection *rds_conn_create_outgoing(struct net *net, 342struct rds_connection *rds_conn_create_outgoing(struct net *net,
306 __be32 laddr, __be32 faddr, 343 const struct in6_addr *laddr,
307 struct rds_transport *trans, gfp_t gfp) 344 const struct in6_addr *faddr,
345 struct rds_transport *trans,
346 gfp_t gfp, int dev_if)
308{ 347{
309 return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); 348 return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
310} 349}
311EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); 350EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
312 351
@@ -464,10 +503,23 @@ void rds_conn_destroy(struct rds_connection *conn)
464} 503}
465EXPORT_SYMBOL_GPL(rds_conn_destroy); 504EXPORT_SYMBOL_GPL(rds_conn_destroy);
466 505
467static void rds_conn_message_info(struct socket *sock, unsigned int len, 506static void __rds_inc_msg_cp(struct rds_incoming *inc,
468 struct rds_info_iterator *iter, 507 struct rds_info_iterator *iter,
469 struct rds_info_lengths *lens, 508 void *saddr, void *daddr, int flip, bool isv6)
470 int want_send) 509{
510#if IS_ENABLED(CONFIG_IPV6)
511 if (isv6)
512 rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
513 else
514#endif
515 rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
516 *(__be32 *)daddr, flip);
517}
518
519static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
520 struct rds_info_iterator *iter,
521 struct rds_info_lengths *lens,
522 int want_send, bool isv6)
471{ 523{
472 struct hlist_head *head; 524 struct hlist_head *head;
473 struct list_head *list; 525 struct list_head *list;
@@ -478,7 +530,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
478 size_t i; 530 size_t i;
479 int j; 531 int j;
480 532
481 len /= sizeof(struct rds_info_message); 533 if (isv6)
534 len /= sizeof(struct rds6_info_message);
535 else
536 len /= sizeof(struct rds_info_message);
482 537
483 rcu_read_lock(); 538 rcu_read_lock();
484 539
@@ -488,6 +543,9 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
488 struct rds_conn_path *cp; 543 struct rds_conn_path *cp;
489 int npaths; 544 int npaths;
490 545
546 if (!isv6 && conn->c_isv6)
547 continue;
548
491 npaths = (conn->c_trans->t_mp_capable ? 549 npaths = (conn->c_trans->t_mp_capable ?
492 RDS_MPATH_WORKERS : 1); 550 RDS_MPATH_WORKERS : 1);
493 551
@@ -504,11 +562,11 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
504 list_for_each_entry(rm, list, m_conn_item) { 562 list_for_each_entry(rm, list, m_conn_item) {
505 total++; 563 total++;
506 if (total <= len) 564 if (total <= len)
507 rds_inc_info_copy(&rm->m_inc, 565 __rds_inc_msg_cp(&rm->m_inc,
508 iter, 566 iter,
509 conn->c_laddr, 567 &conn->c_laddr,
510 conn->c_faddr, 568 &conn->c_faddr,
511 0); 569 0, isv6);
512 } 570 }
513 571
514 spin_unlock_irqrestore(&cp->cp_lock, flags); 572 spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -518,9 +576,30 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
518 rcu_read_unlock(); 576 rcu_read_unlock();
519 577
520 lens->nr = total; 578 lens->nr = total;
521 lens->each = sizeof(struct rds_info_message); 579 if (isv6)
580 lens->each = sizeof(struct rds6_info_message);
581 else
582 lens->each = sizeof(struct rds_info_message);
522} 583}
523 584
585static void rds_conn_message_info(struct socket *sock, unsigned int len,
586 struct rds_info_iterator *iter,
587 struct rds_info_lengths *lens,
588 int want_send)
589{
590 rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
591}
592
593#if IS_ENABLED(CONFIG_IPV6)
594static void rds6_conn_message_info(struct socket *sock, unsigned int len,
595 struct rds_info_iterator *iter,
596 struct rds_info_lengths *lens,
597 int want_send)
598{
599 rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
600}
601#endif
602
524static void rds_conn_message_info_send(struct socket *sock, unsigned int len, 603static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
525 struct rds_info_iterator *iter, 604 struct rds_info_iterator *iter,
526 struct rds_info_lengths *lens) 605 struct rds_info_lengths *lens)
@@ -528,6 +607,15 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
528 rds_conn_message_info(sock, len, iter, lens, 1); 607 rds_conn_message_info(sock, len, iter, lens, 1);
529} 608}
530 609
610#if IS_ENABLED(CONFIG_IPV6)
611static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
612 struct rds_info_iterator *iter,
613 struct rds_info_lengths *lens)
614{
615 rds6_conn_message_info(sock, len, iter, lens, 1);
616}
617#endif
618
531static void rds_conn_message_info_retrans(struct socket *sock, 619static void rds_conn_message_info_retrans(struct socket *sock,
532 unsigned int len, 620 unsigned int len,
533 struct rds_info_iterator *iter, 621 struct rds_info_iterator *iter,
@@ -536,6 +624,16 @@ static void rds_conn_message_info_retrans(struct socket *sock,
536 rds_conn_message_info(sock, len, iter, lens, 0); 624 rds_conn_message_info(sock, len, iter, lens, 0);
537} 625}
538 626
627#if IS_ENABLED(CONFIG_IPV6)
628static void rds6_conn_message_info_retrans(struct socket *sock,
629 unsigned int len,
630 struct rds_info_iterator *iter,
631 struct rds_info_lengths *lens)
632{
633 rds6_conn_message_info(sock, len, iter, lens, 0);
634}
635#endif
636
539void rds_for_each_conn_info(struct socket *sock, unsigned int len, 637void rds_for_each_conn_info(struct socket *sock, unsigned int len,
540 struct rds_info_iterator *iter, 638 struct rds_info_iterator *iter,
541 struct rds_info_lengths *lens, 639 struct rds_info_lengths *lens,
@@ -584,7 +682,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
584 struct hlist_head *head; 682 struct hlist_head *head;
585 struct rds_connection *conn; 683 struct rds_connection *conn;
586 size_t i; 684 size_t i;
587 int j;
588 685
589 rcu_read_lock(); 686 rcu_read_lock();
590 687
@@ -595,17 +692,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
595 i++, head++) { 692 i++, head++) {
596 hlist_for_each_entry_rcu(conn, head, c_hash_node) { 693 hlist_for_each_entry_rcu(conn, head, c_hash_node) {
597 struct rds_conn_path *cp; 694 struct rds_conn_path *cp;
598 int npaths;
599 695
600 npaths = (conn->c_trans->t_mp_capable ? 696 /* XXX We only copy the information from the first
601 RDS_MPATH_WORKERS : 1); 697 * path for now. The problem is that if there are
602 for (j = 0; j < npaths; j++) { 698 * more than one underlying paths, we cannot report
603 cp = &conn->c_path[j]; 699 * information of all of them using the existing
700 * API. For example, there is only one next_tx_seq,
701 * which path's next_tx_seq should we report? It is
702 * a bug in the design of MPRDS.
703 */
704 cp = conn->c_path;
604 705
605 /* XXX no cp_lock usage.. */ 706 /* XXX no cp_lock usage.. */
606 if (!visitor(cp, buffer)) 707 if (!visitor(cp, buffer))
607 continue; 708 continue;
608 }
609 709
610 /* We copy as much as we can fit in the buffer, 710 /* We copy as much as we can fit in the buffer,
611 * but we count all items so that the caller 711 * but we count all items so that the caller
@@ -624,12 +724,16 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
624static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) 724static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
625{ 725{
626 struct rds_info_connection *cinfo = buffer; 726 struct rds_info_connection *cinfo = buffer;
727 struct rds_connection *conn = cp->cp_conn;
728
729 if (conn->c_isv6)
730 return 0;
627 731
628 cinfo->next_tx_seq = cp->cp_next_tx_seq; 732 cinfo->next_tx_seq = cp->cp_next_tx_seq;
629 cinfo->next_rx_seq = cp->cp_next_rx_seq; 733 cinfo->next_rx_seq = cp->cp_next_rx_seq;
630 cinfo->laddr = cp->cp_conn->c_laddr; 734 cinfo->laddr = conn->c_laddr.s6_addr32[3];
631 cinfo->faddr = cp->cp_conn->c_faddr; 735 cinfo->faddr = conn->c_faddr.s6_addr32[3];
632 strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, 736 strncpy(cinfo->transport, conn->c_trans->t_name,
633 sizeof(cinfo->transport)); 737 sizeof(cinfo->transport));
634 cinfo->flags = 0; 738 cinfo->flags = 0;
635 739
@@ -645,6 +749,36 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
645 return 1; 749 return 1;
646} 750}
647 751
752#if IS_ENABLED(CONFIG_IPV6)
753static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
754{
755 struct rds6_info_connection *cinfo6 = buffer;
756 struct rds_connection *conn = cp->cp_conn;
757
758 cinfo6->next_tx_seq = cp->cp_next_tx_seq;
759 cinfo6->next_rx_seq = cp->cp_next_rx_seq;
760 cinfo6->laddr = conn->c_laddr;
761 cinfo6->faddr = conn->c_faddr;
762 strncpy(cinfo6->transport, conn->c_trans->t_name,
763 sizeof(cinfo6->transport));
764 cinfo6->flags = 0;
765
766 rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
767 SENDING);
768 /* XXX Future: return the state rather than these funky bits */
769 rds_conn_info_set(cinfo6->flags,
770 atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
771 CONNECTING);
772 rds_conn_info_set(cinfo6->flags,
773 atomic_read(&cp->cp_state) == RDS_CONN_UP,
774 CONNECTED);
775 /* Just return 1 as there is no error case. This is a helper function
776 * for rds_walk_conn_path_info() and it wants a return value.
777 */
778 return 1;
779}
780#endif
781
648static void rds_conn_info(struct socket *sock, unsigned int len, 782static void rds_conn_info(struct socket *sock, unsigned int len,
649 struct rds_info_iterator *iter, 783 struct rds_info_iterator *iter,
650 struct rds_info_lengths *lens) 784 struct rds_info_lengths *lens)
@@ -657,6 +791,20 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
657 sizeof(struct rds_info_connection)); 791 sizeof(struct rds_info_connection));
658} 792}
659 793
794#if IS_ENABLED(CONFIG_IPV6)
795static void rds6_conn_info(struct socket *sock, unsigned int len,
796 struct rds_info_iterator *iter,
797 struct rds_info_lengths *lens)
798{
799 u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
800
801 rds_walk_conn_path_info(sock, len, iter, lens,
802 rds6_conn_info_visitor,
803 buffer,
804 sizeof(struct rds6_info_connection));
805}
806#endif
807
660int rds_conn_init(void) 808int rds_conn_init(void)
661{ 809{
662 int ret; 810 int ret;
@@ -678,7 +826,13 @@ int rds_conn_init(void)
678 rds_conn_message_info_send); 826 rds_conn_message_info_send);
679 rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, 827 rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
680 rds_conn_message_info_retrans); 828 rds_conn_message_info_retrans);
681 829#if IS_ENABLED(CONFIG_IPV6)
830 rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
831 rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
832 rds6_conn_message_info_send);
833 rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
834 rds6_conn_message_info_retrans);
835#endif
682 return 0; 836 return 0;
683} 837}
684 838
@@ -696,6 +850,13 @@ void rds_conn_exit(void)
696 rds_conn_message_info_send); 850 rds_conn_message_info_send);
697 rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, 851 rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
698 rds_conn_message_info_retrans); 852 rds_conn_message_info_retrans);
853#if IS_ENABLED(CONFIG_IPV6)
854 rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
855 rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
856 rds6_conn_message_info_send);
857 rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
858 rds6_conn_message_info_retrans);
859#endif
699} 860}
700 861
701/* 862/*
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 683b55d4e2b0..c1d97640c0be 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <net/addrconf.h>
42 43
43#include "rds_single_path.h" 44#include "rds_single_path.h"
44#include "rds.h" 45#include "rds.h"
@@ -295,9 +296,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
295 /* We will only ever look at IB transports */ 296 /* We will only ever look at IB transports */
296 if (conn->c_trans != &rds_ib_transport) 297 if (conn->c_trans != &rds_ib_transport)
297 return 0; 298 return 0;
299 if (conn->c_isv6)
300 return 0;
298 301
299 iinfo->src_addr = conn->c_laddr; 302 iinfo->src_addr = conn->c_laddr.s6_addr32[3];
300 iinfo->dst_addr = conn->c_faddr; 303 iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
301 304
302 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); 305 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
303 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); 306 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -318,6 +321,45 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
318 return 1; 321 return 1;
319} 322}
320 323
324#if IS_ENABLED(CONFIG_IPV6)
325/* IPv6 version of rds_ib_conn_info_visitor(). */
326static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
327 void *buffer)
328{
329 struct rds6_info_rdma_connection *iinfo6 = buffer;
330 struct rds_ib_connection *ic;
331
332 /* We will only ever look at IB transports */
333 if (conn->c_trans != &rds_ib_transport)
334 return 0;
335
336 iinfo6->src_addr = conn->c_laddr;
337 iinfo6->dst_addr = conn->c_faddr;
338
339 memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
340 memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
341
342 if (rds_conn_state(conn) == RDS_CONN_UP) {
343 struct rds_ib_device *rds_ibdev;
344 struct rdma_dev_addr *dev_addr;
345
346 ic = conn->c_transport_data;
347 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
348 rdma_addr_get_sgid(dev_addr,
349 (union ib_gid *)&iinfo6->src_gid);
350 rdma_addr_get_dgid(dev_addr,
351 (union ib_gid *)&iinfo6->dst_gid);
352
353 rds_ibdev = ic->rds_ibdev;
354 iinfo6->max_send_wr = ic->i_send_ring.w_nr;
355 iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
356 iinfo6->max_send_sge = rds_ibdev->max_sge;
357 rds6_ib_get_mr_info(rds_ibdev, iinfo6);
358 }
359 return 1;
360}
361#endif
362
321static void rds_ib_ic_info(struct socket *sock, unsigned int len, 363static void rds_ib_ic_info(struct socket *sock, unsigned int len,
322 struct rds_info_iterator *iter, 364 struct rds_info_iterator *iter,
323 struct rds_info_lengths *lens) 365 struct rds_info_lengths *lens)
@@ -330,6 +372,20 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
330 sizeof(struct rds_info_rdma_connection)); 372 sizeof(struct rds_info_rdma_connection));
331} 373}
332 374
375#if IS_ENABLED(CONFIG_IPV6)
376/* IPv6 version of rds_ib_ic_info(). */
377static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
378 struct rds_info_iterator *iter,
379 struct rds_info_lengths *lens)
380{
381 u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
382
383 rds_for_each_conn_info(sock, len, iter, lens,
384 rds6_ib_conn_info_visitor,
385 buffer,
386 sizeof(struct rds6_info_rdma_connection));
387}
388#endif
333 389
334/* 390/*
335 * Early RDS/IB was built to only bind to an address if there is an IPoIB 391 * Early RDS/IB was built to only bind to an address if there is an IPoIB
@@ -341,12 +397,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
341 * allowed to influence which paths have priority. We could call userspace 397 * allowed to influence which paths have priority. We could call userspace
342 * asserting this policy "routing". 398 * asserting this policy "routing".
343 */ 399 */
344static int rds_ib_laddr_check(struct net *net, __be32 addr) 400static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
401 __u32 scope_id)
345{ 402{
346 int ret; 403 int ret;
347 struct rdma_cm_id *cm_id; 404 struct rdma_cm_id *cm_id;
405#if IS_ENABLED(CONFIG_IPV6)
406 struct sockaddr_in6 sin6;
407#endif
348 struct sockaddr_in sin; 408 struct sockaddr_in sin;
409 struct sockaddr *sa;
410 bool isv4;
349 411
412 isv4 = ipv6_addr_v4mapped(addr);
350 /* Create a CMA ID and try to bind it. This catches both 413 /* Create a CMA ID and try to bind it. This catches both
351 * IB and iWARP capable NICs. 414 * IB and iWARP capable NICs.
352 */ 415 */
@@ -355,22 +418,66 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
355 if (IS_ERR(cm_id)) 418 if (IS_ERR(cm_id))
356 return PTR_ERR(cm_id); 419 return PTR_ERR(cm_id);
357 420
358 memset(&sin, 0, sizeof(sin)); 421 if (isv4) {
359 sin.sin_family = AF_INET; 422 memset(&sin, 0, sizeof(sin));
360 sin.sin_addr.s_addr = addr; 423 sin.sin_family = AF_INET;
424 sin.sin_addr.s_addr = addr->s6_addr32[3];
425 sa = (struct sockaddr *)&sin;
426 } else {
427#if IS_ENABLED(CONFIG_IPV6)
428 memset(&sin6, 0, sizeof(sin6));
429 sin6.sin6_family = AF_INET6;
430 sin6.sin6_addr = *addr;
431 sin6.sin6_scope_id = scope_id;
432 sa = (struct sockaddr *)&sin6;
433
434 /* XXX Do a special IPv6 link local address check here. The
435 * reason is that rdma_bind_addr() always succeeds with IPv6
436 * link local address regardless it is indeed configured in a
437 * system.
438 */
439 if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
440 struct net_device *dev;
441
442 if (scope_id == 0) {
443 ret = -EADDRNOTAVAIL;
444 goto out;
445 }
446
447 /* Use init_net for now as RDS is not network
448 * name space aware.
449 */
450 dev = dev_get_by_index(&init_net, scope_id);
451 if (!dev) {
452 ret = -EADDRNOTAVAIL;
453 goto out;
454 }
455 if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
456 dev_put(dev);
457 ret = -EADDRNOTAVAIL;
458 goto out;
459 }
460 dev_put(dev);
461 }
462#else
463 ret = -EADDRNOTAVAIL;
464 goto out;
465#endif
466 }
361 467
362 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 468 /* rdma_bind_addr will only succeed for IB & iWARP devices */
363 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 469 ret = rdma_bind_addr(cm_id, sa);
364 /* due to this, we will claim to support iWARP devices unless we 470 /* due to this, we will claim to support iWARP devices unless we
365 check node_type. */ 471 check node_type. */
366 if (ret || !cm_id->device || 472 if (ret || !cm_id->device ||
367 cm_id->device->node_type != RDMA_NODE_IB_CA) 473 cm_id->device->node_type != RDMA_NODE_IB_CA)
368 ret = -EADDRNOTAVAIL; 474 ret = -EADDRNOTAVAIL;
369 475
370 rdsdebug("addr %pI4 ret %d node type %d\n", 476 rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
371 &addr, ret, 477 addr, scope_id, ret,
372 cm_id->device ? cm_id->device->node_type : -1); 478 cm_id->device ? cm_id->device->node_type : -1);
373 479
480out:
374 rdma_destroy_id(cm_id); 481 rdma_destroy_id(cm_id);
375 482
376 return ret; 483 return ret;
@@ -401,6 +508,9 @@ void rds_ib_exit(void)
401 rds_ib_set_unloading(); 508 rds_ib_set_unloading();
402 synchronize_rcu(); 509 synchronize_rcu();
403 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 510 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
511#if IS_ENABLED(CONFIG_IPV6)
512 rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
513#endif
404 rds_ib_unregister_client(); 514 rds_ib_unregister_client();
405 rds_ib_destroy_nodev_conns(); 515 rds_ib_destroy_nodev_conns();
406 rds_ib_sysctl_exit(); 516 rds_ib_sysctl_exit();
@@ -462,6 +572,9 @@ int rds_ib_init(void)
462 rds_trans_register(&rds_ib_transport); 572 rds_trans_register(&rds_ib_transport);
463 573
464 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 574 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
575#if IS_ENABLED(CONFIG_IPV6)
576 rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
577#endif
465 578
466 goto out; 579 goto out;
467 580
@@ -476,4 +589,3 @@ out:
476} 589}
477 590
478MODULE_LICENSE("GPL"); 591MODULE_LICENSE("GPL");
479
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a6f4d7d68e95..73427ff439f9 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -57,16 +57,44 @@ struct rds_ib_refill_cache {
57 struct list_head *ready; 57 struct list_head *ready;
58}; 58};
59 59
60/* This is the common structure for the IB private data exchange in setting up
61 * an RDS connection. The exchange is different for IPv4 and IPv6 connections.
62 * The reason is that the address size is different and the addresses
63 * exchanged are in the beginning of the structure. Hence it is not possible
64 * for interoperability if same structure is used.
65 */
66struct rds_ib_conn_priv_cmn {
67 u8 ricpc_protocol_major;
68 u8 ricpc_protocol_minor;
69 __be16 ricpc_protocol_minor_mask; /* bitmask */
70 __be32 ricpc_reserved1;
71 __be64 ricpc_ack_seq;
72 __be32 ricpc_credit; /* non-zero enables flow ctl */
73};
74
60struct rds_ib_connect_private { 75struct rds_ib_connect_private {
61 /* Add new fields at the end, and don't permute existing fields. */ 76 /* Add new fields at the end, and don't permute existing fields. */
62 __be32 dp_saddr; 77 __be32 dp_saddr;
63 __be32 dp_daddr; 78 __be32 dp_daddr;
64 u8 dp_protocol_major; 79 struct rds_ib_conn_priv_cmn dp_cmn;
65 u8 dp_protocol_minor; 80};
66 __be16 dp_protocol_minor_mask; /* bitmask */ 81
67 __be32 dp_reserved1; 82struct rds6_ib_connect_private {
68 __be64 dp_ack_seq; 83 /* Add new fields at the end, and don't permute existing fields. */
69 __be32 dp_credit; /* non-zero enables flow ctl */ 84 struct in6_addr dp_saddr;
85 struct in6_addr dp_daddr;
86 struct rds_ib_conn_priv_cmn dp_cmn;
87};
88
89#define dp_protocol_major dp_cmn.ricpc_protocol_major
90#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
91#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
92#define dp_ack_seq dp_cmn.ricpc_ack_seq
93#define dp_credit dp_cmn.ricpc_credit
94
95union rds_ib_conn_priv {
96 struct rds_ib_connect_private ricp_v4;
97 struct rds6_ib_connect_private ricp_v6;
70}; 98};
71 99
72struct rds_ib_send_work { 100struct rds_ib_send_work {
@@ -351,8 +379,8 @@ void rds_ib_listen_stop(void);
351__printf(2, 3) 379__printf(2, 3)
352void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); 380void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
353int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 381int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
354 struct rdma_cm_event *event); 382 struct rdma_cm_event *event, bool isv6);
355int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); 383int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
356void rds_ib_cm_connect_complete(struct rds_connection *conn, 384void rds_ib_cm_connect_complete(struct rds_connection *conn,
357 struct rdma_cm_event *event); 385 struct rdma_cm_event *event);
358 386
@@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
361 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) 389 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
362 390
363/* ib_rdma.c */ 391/* ib_rdma.c */
364int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); 392int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
393 struct in6_addr *ipaddr);
365void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 394void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
366void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 395void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
367void rds_ib_destroy_nodev_conns(void); 396void rds_ib_destroy_nodev_conns(void);
@@ -371,7 +400,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
371int rds_ib_recv_init(void); 400int rds_ib_recv_init(void);
372void rds_ib_recv_exit(void); 401void rds_ib_recv_exit(void);
373int rds_ib_recv_path(struct rds_conn_path *conn); 402int rds_ib_recv_path(struct rds_conn_path *conn);
374int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); 403int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp);
375void rds_ib_recv_free_caches(struct rds_ib_connection *ic); 404void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
376void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); 405void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
377void rds_ib_inc_free(struct rds_incoming *inc); 406void rds_ib_inc_free(struct rds_incoming *inc);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f1684ae6abfd..bfbb31f0c7fd 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
37#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
38#include <net/addrconf.h>
38 39
39#include "rds_single_path.h" 40#include "rds_single_path.h"
40#include "rds.h" 41#include "rds.h"
@@ -95,25 +96,45 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
95 */ 96 */
96void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) 97void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
97{ 98{
98 const struct rds_ib_connect_private *dp = NULL;
99 struct rds_ib_connection *ic = conn->c_transport_data; 99 struct rds_ib_connection *ic = conn->c_transport_data;
100 const union rds_ib_conn_priv *dp = NULL;
100 struct ib_qp_attr qp_attr; 101 struct ib_qp_attr qp_attr;
102 __be64 ack_seq = 0;
103 __be32 credit = 0;
104 u8 major = 0;
105 u8 minor = 0;
101 int err; 106 int err;
102 107
103 if (event->param.conn.private_data_len >= sizeof(*dp)) { 108 dp = event->param.conn.private_data;
104 dp = event->param.conn.private_data; 109 if (conn->c_isv6) {
105 110 if (event->param.conn.private_data_len >=
106 /* make sure it isn't empty data */ 111 sizeof(struct rds6_ib_connect_private)) {
107 if (dp->dp_protocol_major) { 112 major = dp->ricp_v6.dp_protocol_major;
108 rds_ib_set_protocol(conn, 113 minor = dp->ricp_v6.dp_protocol_minor;
109 RDS_PROTOCOL(dp->dp_protocol_major, 114 credit = dp->ricp_v6.dp_credit;
110 dp->dp_protocol_minor)); 115 /* dp structure start is not guaranteed to be 8 bytes
111 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 116 * aligned. Since dp_ack_seq is 64-bit extended load
117 * operations can be used so go through get_unaligned
118 * to avoid unaligned errors.
119 */
120 ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
112 } 121 }
122 } else if (event->param.conn.private_data_len >=
123 sizeof(struct rds_ib_connect_private)) {
124 major = dp->ricp_v4.dp_protocol_major;
125 minor = dp->ricp_v4.dp_protocol_minor;
126 credit = dp->ricp_v4.dp_credit;
127 ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
128 }
129
130 /* make sure it isn't empty data */
131 if (major) {
132 rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
133 rds_ib_set_flow_control(conn, be32_to_cpu(credit));
113 } 134 }
114 135
115 if (conn->c_version < RDS_PROTOCOL(3, 1)) { 136 if (conn->c_version < RDS_PROTOCOL(3, 1)) {
116 pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", 137 pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
117 &conn->c_laddr, &conn->c_faddr, 138 &conn->c_laddr, &conn->c_faddr,
118 RDS_PROTOCOL_MAJOR(conn->c_version), 139 RDS_PROTOCOL_MAJOR(conn->c_version),
119 RDS_PROTOCOL_MINOR(conn->c_version)); 140 RDS_PROTOCOL_MINOR(conn->c_version));
@@ -121,7 +142,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
121 rds_conn_destroy(conn); 142 rds_conn_destroy(conn);
122 return; 143 return;
123 } else { 144 } else {
124 pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", 145 pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
125 ic->i_active_side ? "Active" : "Passive", 146 ic->i_active_side ? "Active" : "Passive",
126 &conn->c_laddr, &conn->c_faddr, 147 &conn->c_laddr, &conn->c_faddr,
127 RDS_PROTOCOL_MAJOR(conn->c_version), 148 RDS_PROTOCOL_MAJOR(conn->c_version),
@@ -150,7 +171,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
150 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); 171 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
151 172
152 /* update ib_device with this local ipaddr */ 173 /* update ib_device with this local ipaddr */
153 err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); 174 err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
154 if (err) 175 if (err)
155 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", 176 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
156 err); 177 err);
@@ -158,14 +179,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
158 /* If the peer gave us the last packet it saw, process this as if 179 /* If the peer gave us the last packet it saw, process this as if
159 * we had received a regular ACK. */ 180 * we had received a regular ACK. */
160 if (dp) { 181 if (dp) {
161 /* dp structure start is not guaranteed to be 8 bytes aligned. 182 if (ack_seq)
162 * Since dp_ack_seq is 64-bit extended load operations can be 183 rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
163 * used so go through get_unaligned to avoid unaligned errors.
164 */
165 __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
166
167 if (dp_ack_seq)
168 rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
169 NULL); 184 NULL);
170 } 185 }
171 186
@@ -173,11 +188,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
173} 188}
174 189
175static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, 190static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
176 struct rdma_conn_param *conn_param, 191 struct rdma_conn_param *conn_param,
177 struct rds_ib_connect_private *dp, 192 union rds_ib_conn_priv *dp,
178 u32 protocol_version, 193 u32 protocol_version,
179 u32 max_responder_resources, 194 u32 max_responder_resources,
180 u32 max_initiator_depth) 195 u32 max_initiator_depth,
196 bool isv6)
181{ 197{
182 struct rds_ib_connection *ic = conn->c_transport_data; 198 struct rds_ib_connection *ic = conn->c_transport_data;
183 struct rds_ib_device *rds_ibdev = ic->rds_ibdev; 199 struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -193,24 +209,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
193 209
194 if (dp) { 210 if (dp) {
195 memset(dp, 0, sizeof(*dp)); 211 memset(dp, 0, sizeof(*dp));
196 dp->dp_saddr = conn->c_laddr; 212 if (isv6) {
197 dp->dp_daddr = conn->c_faddr; 213 dp->ricp_v6.dp_saddr = conn->c_laddr;
198 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 214 dp->ricp_v6.dp_daddr = conn->c_faddr;
199 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 215 dp->ricp_v6.dp_protocol_major =
200 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 216 RDS_PROTOCOL_MAJOR(protocol_version);
201 dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); 217 dp->ricp_v6.dp_protocol_minor =
218 RDS_PROTOCOL_MINOR(protocol_version);
219 dp->ricp_v6.dp_protocol_minor_mask =
220 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
221 dp->ricp_v6.dp_ack_seq =
222 cpu_to_be64(rds_ib_piggyb_ack(ic));
223
224 conn_param->private_data = &dp->ricp_v6;
225 conn_param->private_data_len = sizeof(dp->ricp_v6);
226 } else {
227 dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
228 dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
229 dp->ricp_v4.dp_protocol_major =
230 RDS_PROTOCOL_MAJOR(protocol_version);
231 dp->ricp_v4.dp_protocol_minor =
232 RDS_PROTOCOL_MINOR(protocol_version);
233 dp->ricp_v4.dp_protocol_minor_mask =
234 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
235 dp->ricp_v4.dp_ack_seq =
236 cpu_to_be64(rds_ib_piggyb_ack(ic));
237
238 conn_param->private_data = &dp->ricp_v4;
239 conn_param->private_data_len = sizeof(dp->ricp_v4);
240 }
202 241
203 /* Advertise flow control */ 242 /* Advertise flow control */
204 if (ic->i_flowctl) { 243 if (ic->i_flowctl) {
205 unsigned int credits; 244 unsigned int credits;
206 245
207 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); 246 credits = IB_GET_POST_CREDITS
208 dp->dp_credit = cpu_to_be32(credits); 247 (atomic_read(&ic->i_credits));
209 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); 248 if (isv6)
249 dp->ricp_v6.dp_credit = cpu_to_be32(credits);
250 else
251 dp->ricp_v4.dp_credit = cpu_to_be32(credits);
252 atomic_sub(IB_SET_POST_CREDITS(credits),
253 &ic->i_credits);
210 } 254 }
211
212 conn_param->private_data = dp;
213 conn_param->private_data_len = sizeof(*dp);
214 } 255 }
215} 256}
216 257
@@ -349,7 +390,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
349 break; 390 break;
350 default: 391 default:
351 rdsdebug("Fatal QP Event %u (%s) " 392 rdsdebug("Fatal QP Event %u (%s) "
352 "- connection %pI4->%pI4, reconnecting\n", 393 "- connection %pI6c->%pI6c, reconnecting\n",
353 event->event, ib_event_msg(event->event), 394 event->event, ib_event_msg(event->event),
354 &conn->c_laddr, &conn->c_faddr); 395 &conn->c_laddr, &conn->c_faddr);
355 rds_conn_drop(conn); 396 rds_conn_drop(conn);
@@ -580,11 +621,13 @@ out:
580 return ret; 621 return ret;
581} 622}
582 623
583static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) 624static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
584{ 625{
585 const struct rds_ib_connect_private *dp = event->param.conn.private_data; 626 const union rds_ib_conn_priv *dp = event->param.conn.private_data;
586 u16 common; 627 u8 data_len, major, minor;
587 u32 version = 0; 628 u32 version = 0;
629 __be16 mask;
630 u16 common;
588 631
589 /* 632 /*
590 * rdma_cm private data is odd - when there is any private data in the 633 * rdma_cm private data is odd - when there is any private data in the
@@ -603,51 +646,140 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
603 return 0; 646 return 0;
604 } 647 }
605 648
649 if (isv6) {
650 data_len = sizeof(struct rds6_ib_connect_private);
651 major = dp->ricp_v6.dp_protocol_major;
652 minor = dp->ricp_v6.dp_protocol_minor;
653 mask = dp->ricp_v6.dp_protocol_minor_mask;
654 } else {
655 data_len = sizeof(struct rds_ib_connect_private);
656 major = dp->ricp_v4.dp_protocol_major;
657 minor = dp->ricp_v4.dp_protocol_minor;
658 mask = dp->ricp_v4.dp_protocol_minor_mask;
659 }
660
606 /* Even if len is crap *now* I still want to check it. -ASG */ 661 /* Even if len is crap *now* I still want to check it. -ASG */
607 if (event->param.conn.private_data_len < sizeof (*dp) || 662 if (event->param.conn.private_data_len < data_len || major == 0)
608 dp->dp_protocol_major == 0)
609 return RDS_PROTOCOL_3_0; 663 return RDS_PROTOCOL_3_0;
610 664
611 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; 665 common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
612 if (dp->dp_protocol_major == 3 && common) { 666 if (major == 3 && common) {
613 version = RDS_PROTOCOL_3_0; 667 version = RDS_PROTOCOL_3_0;
614 while ((common >>= 1) != 0) 668 while ((common >>= 1) != 0)
615 version++; 669 version++;
616 } else 670 } else {
617 printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", 671 if (isv6)
618 &dp->dp_saddr, 672 printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
619 dp->dp_protocol_major, 673 &dp->ricp_v6.dp_saddr, major, minor);
620 dp->dp_protocol_minor); 674 else
675 printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
676 &dp->ricp_v4.dp_saddr, major, minor);
677 }
621 return version; 678 return version;
622} 679}
623 680
681#if IS_ENABLED(CONFIG_IPV6)
682/* Given an IPv6 address, find the net_device which hosts that address and
683 * return its index. This is used by the rds_ib_cm_handle_connect() code to
684 * find the interface index of where an incoming request comes from when
685 * the request is using a link local address.
686 *
687 * Note one problem in this search. It is possible that two interfaces have
688 * the same link local address. Unfortunately, this cannot be solved unless
689 * the underlying layer gives us the interface which an incoming RDMA connect
690 * request comes from.
691 */
692static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
693{
694 struct net_device *dev;
695 int idx = 0;
696
697 rcu_read_lock();
698 for_each_netdev_rcu(net, dev) {
699 if (ipv6_chk_addr(net, addr, dev, 1)) {
700 idx = dev->ifindex;
701 break;
702 }
703 }
704 rcu_read_unlock();
705
706 return idx;
707}
708#endif
709
624int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 710int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
625 struct rdma_cm_event *event) 711 struct rdma_cm_event *event, bool isv6)
626{ 712{
627 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; 713 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
628 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; 714 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
629 const struct rds_ib_connect_private *dp = event->param.conn.private_data; 715 const struct rds_ib_conn_priv_cmn *dp_cmn;
630 struct rds_ib_connect_private dp_rep;
631 struct rds_connection *conn = NULL; 716 struct rds_connection *conn = NULL;
632 struct rds_ib_connection *ic = NULL; 717 struct rds_ib_connection *ic = NULL;
633 struct rdma_conn_param conn_param; 718 struct rdma_conn_param conn_param;
719 const union rds_ib_conn_priv *dp;
720 union rds_ib_conn_priv dp_rep;
721 struct in6_addr s_mapped_addr;
722 struct in6_addr d_mapped_addr;
723 const struct in6_addr *saddr6;
724 const struct in6_addr *daddr6;
725 int destroy = 1;
726 u32 ifindex = 0;
634 u32 version; 727 u32 version;
635 int err = 1, destroy = 1; 728 int err = 1;
636 729
637 /* Check whether the remote protocol version matches ours. */ 730 /* Check whether the remote protocol version matches ours. */
638 version = rds_ib_protocol_compatible(event); 731 version = rds_ib_protocol_compatible(event, isv6);
639 if (!version) 732 if (!version)
640 goto out; 733 goto out;
641 734
642 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " 735 dp = event->param.conn.private_data;
643 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, 736 if (isv6) {
737#if IS_ENABLED(CONFIG_IPV6)
738 dp_cmn = &dp->ricp_v6.dp_cmn;
739 saddr6 = &dp->ricp_v6.dp_saddr;
740 daddr6 = &dp->ricp_v6.dp_daddr;
741 /* If either address is link local, need to find the
742 * interface index in order to create a proper RDS
743 * connection.
744 */
745 if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
746 /* Using init_net for now .. */
747 ifindex = __rds_find_ifindex(&init_net, daddr6);
748 /* No index found... Need to bail out. */
749 if (ifindex == 0) {
750 err = -EOPNOTSUPP;
751 goto out;
752 }
753 } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
754 /* Use our address to find the correct index. */
755 ifindex = __rds_find_ifindex(&init_net, daddr6);
756 /* No index found... Need to bail out. */
757 if (ifindex == 0) {
758 err = -EOPNOTSUPP;
759 goto out;
760 }
761 }
762#else
763 err = -EOPNOTSUPP;
764 goto out;
765#endif
766 } else {
767 dp_cmn = &dp->ricp_v4.dp_cmn;
768 ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
769 ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
770 saddr6 = &s_mapped_addr;
771 daddr6 = &d_mapped_addr;
772 }
773
774 rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
775 "0x%llx\n", saddr6, daddr6,
644 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 776 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
645 (unsigned long long)be64_to_cpu(lguid), 777 (unsigned long long)be64_to_cpu(lguid),
646 (unsigned long long)be64_to_cpu(fguid)); 778 (unsigned long long)be64_to_cpu(fguid));
647 779
648 /* RDS/IB is not currently netns aware, thus init_net */ 780 /* RDS/IB is not currently netns aware, thus init_net */
649 conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, 781 conn = rds_conn_create(&init_net, daddr6, saddr6,
650 &rds_ib_transport, GFP_KERNEL); 782 &rds_ib_transport, GFP_KERNEL, ifindex);
651 if (IS_ERR(conn)) { 783 if (IS_ERR(conn)) {
652 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); 784 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
653 conn = NULL; 785 conn = NULL;
@@ -678,12 +810,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
678 ic = conn->c_transport_data; 810 ic = conn->c_transport_data;
679 811
680 rds_ib_set_protocol(conn, version); 812 rds_ib_set_protocol(conn, version);
681 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 813 rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
682 814
683 /* If the peer gave us the last packet it saw, process this as if 815 /* If the peer gave us the last packet it saw, process this as if
684 * we had received a regular ACK. */ 816 * we had received a regular ACK. */
685 if (dp->dp_ack_seq) 817 if (dp_cmn->ricpc_ack_seq)
686 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); 818 rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
819 NULL);
687 820
688 BUG_ON(cm_id->context); 821 BUG_ON(cm_id->context);
689 BUG_ON(ic->i_cm_id); 822 BUG_ON(ic->i_cm_id);
@@ -702,8 +835,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
702 } 835 }
703 836
704 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, 837 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
705 event->param.conn.responder_resources, 838 event->param.conn.responder_resources,
706 event->param.conn.initiator_depth); 839 event->param.conn.initiator_depth, isv6);
707 840
708 /* rdma_accept() calls rdma_reject() internally if it fails */ 841 /* rdma_accept() calls rdma_reject() internally if it fails */
709 if (rdma_accept(cm_id, &conn_param)) 842 if (rdma_accept(cm_id, &conn_param))
@@ -718,12 +851,12 @@ out:
718} 851}
719 852
720 853
721int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) 854int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
722{ 855{
723 struct rds_connection *conn = cm_id->context; 856 struct rds_connection *conn = cm_id->context;
724 struct rds_ib_connection *ic = conn->c_transport_data; 857 struct rds_ib_connection *ic = conn->c_transport_data;
725 struct rdma_conn_param conn_param; 858 struct rdma_conn_param conn_param;
726 struct rds_ib_connect_private dp; 859 union rds_ib_conn_priv dp;
727 int ret; 860 int ret;
728 861
729 /* If the peer doesn't do protocol negotiation, we must 862 /* If the peer doesn't do protocol negotiation, we must
@@ -738,7 +871,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
738 } 871 }
739 872
740 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, 873 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
741 UINT_MAX, UINT_MAX); 874 UINT_MAX, UINT_MAX, isv6);
742 ret = rdma_connect(cm_id, &conn_param); 875 ret = rdma_connect(cm_id, &conn_param);
743 if (ret) 876 if (ret)
744 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); 877 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -758,13 +891,22 @@ out:
758int rds_ib_conn_path_connect(struct rds_conn_path *cp) 891int rds_ib_conn_path_connect(struct rds_conn_path *cp)
759{ 892{
760 struct rds_connection *conn = cp->cp_conn; 893 struct rds_connection *conn = cp->cp_conn;
761 struct rds_ib_connection *ic = conn->c_transport_data; 894 struct sockaddr_storage src, dest;
762 struct sockaddr_in src, dest; 895 rdma_cm_event_handler handler;
896 struct rds_ib_connection *ic;
763 int ret; 897 int ret;
764 898
899 ic = conn->c_transport_data;
900
765 /* XXX I wonder what affect the port space has */ 901 /* XXX I wonder what affect the port space has */
766 /* delegate cm event handler to rdma_transport */ 902 /* delegate cm event handler to rdma_transport */
767 ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, 903#if IS_ENABLED(CONFIG_IPV6)
904 if (conn->c_isv6)
905 handler = rds6_rdma_cm_event_handler;
906 else
907#endif
908 handler = rds_rdma_cm_event_handler;
909 ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
768 RDMA_PS_TCP, IB_QPT_RC); 910 RDMA_PS_TCP, IB_QPT_RC);
769 if (IS_ERR(ic->i_cm_id)) { 911 if (IS_ERR(ic->i_cm_id)) {
770 ret = PTR_ERR(ic->i_cm_id); 912 ret = PTR_ERR(ic->i_cm_id);
@@ -775,13 +917,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
775 917
776 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); 918 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
777 919
778 src.sin_family = AF_INET; 920 if (ipv6_addr_v4mapped(&conn->c_faddr)) {
779 src.sin_addr.s_addr = (__force u32)conn->c_laddr; 921 struct sockaddr_in *sin;
780 src.sin_port = (__force u16)htons(0); 922
923 sin = (struct sockaddr_in *)&src;
924 sin->sin_family = AF_INET;
925 sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
926 sin->sin_port = 0;
781 927
782 dest.sin_family = AF_INET; 928 sin = (struct sockaddr_in *)&dest;
783 dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 929 sin->sin_family = AF_INET;
784 dest.sin_port = (__force u16)htons(RDS_PORT); 930 sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
931 sin->sin_port = htons(RDS_PORT);
932 } else {
933 struct sockaddr_in6 *sin6;
934
935 sin6 = (struct sockaddr_in6 *)&src;
936 sin6->sin6_family = AF_INET6;
937 sin6->sin6_addr = conn->c_laddr;
938 sin6->sin6_port = 0;
939 sin6->sin6_scope_id = conn->c_dev_if;
940
941 sin6 = (struct sockaddr_in6 *)&dest;
942 sin6->sin6_family = AF_INET6;
943 sin6->sin6_addr = conn->c_faddr;
944 sin6->sin6_port = htons(RDS_CM_PORT);
945 sin6->sin6_scope_id = conn->c_dev_if;
946 }
785 947
786 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 948 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
787 (struct sockaddr *)&dest, 949 (struct sockaddr *)&dest,
@@ -949,7 +1111,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
949 if (!ic) 1111 if (!ic)
950 return -ENOMEM; 1112 return -ENOMEM;
951 1113
952 ret = rds_ib_recv_alloc_caches(ic); 1114 ret = rds_ib_recv_alloc_caches(ic, gfp);
953 if (ret) { 1115 if (ret) {
954 kfree(ic); 1116 kfree(ic);
955 return ret; 1117 return ret;
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index b371cf08b1fc..6431a023ac89 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -61,6 +61,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
61 pool->fmr_attr.max_pages); 61 pool->fmr_attr.max_pages);
62 if (IS_ERR(frmr->mr)) { 62 if (IS_ERR(frmr->mr)) {
63 pr_warn("RDS/IB: %s failed to allocate MR", __func__); 63 pr_warn("RDS/IB: %s failed to allocate MR", __func__);
64 err = PTR_ERR(frmr->mr);
64 goto out_no_cigar; 65 goto out_no_cigar;
65 } 66 }
66 67
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 655f01d427fe..5da12c248431 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -113,6 +113,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
113 int npages); 113 int npages);
114void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, 114void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
115 struct rds_info_rdma_connection *iinfo); 115 struct rds_info_rdma_connection *iinfo);
116void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
117 struct rds6_info_rdma_connection *iinfo6);
116void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); 118void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
117void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, 119void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
118 struct rds_sock *rs, u32 *key_ret, 120 struct rds_sock *rs, u32 *key_ret,
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 2e49a40a5e11..63c8d107adcf 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
100 kfree_rcu(to_free, rcu); 100 kfree_rcu(to_free, rcu);
101} 101}
102 102
103int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) 103int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
104 struct in6_addr *ipaddr)
104{ 105{
105 struct rds_ib_device *rds_ibdev_old; 106 struct rds_ib_device *rds_ibdev_old;
106 107
107 rds_ibdev_old = rds_ib_get_device(ipaddr); 108 rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
108 if (!rds_ibdev_old) 109 if (!rds_ibdev_old)
109 return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 110 return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
110 111
111 if (rds_ibdev_old != rds_ibdev) { 112 if (rds_ibdev_old != rds_ibdev) {
112 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); 113 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
113 rds_ib_dev_put(rds_ibdev_old); 114 rds_ib_dev_put(rds_ibdev_old);
114 return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 115 return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
115 } 116 }
116 rds_ib_dev_put(rds_ibdev_old); 117 rds_ib_dev_put(rds_ibdev_old);
117 118
@@ -179,6 +180,17 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
179 iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; 180 iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
180} 181}
181 182
183#if IS_ENABLED(CONFIG_IPV6)
184void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
185 struct rds6_info_rdma_connection *iinfo6)
186{
187 struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
188
189 iinfo6->rdma_mr_max = pool_1m->max_items;
190 iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
191}
192#endif
193
182struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) 194struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
183{ 195{
184 struct rds_ib_mr *ibmr = NULL; 196 struct rds_ib_mr *ibmr = NULL;
@@ -545,7 +557,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
545 struct rds_ib_connection *ic = NULL; 557 struct rds_ib_connection *ic = NULL;
546 int ret; 558 int ret;
547 559
548 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); 560 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
549 if (!rds_ibdev) { 561 if (!rds_ibdev) {
550 ret = -ENODEV; 562 ret = -ENODEV;
551 goto out; 563 goto out;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 4c5a937304b2..2f16146e4ec9 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -98,12 +98,12 @@ static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
98 } 98 }
99} 99}
100 100
101static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) 101static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
102{ 102{
103 struct rds_ib_cache_head *head; 103 struct rds_ib_cache_head *head;
104 int cpu; 104 int cpu;
105 105
106 cache->percpu = alloc_percpu(struct rds_ib_cache_head); 106 cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
107 if (!cache->percpu) 107 if (!cache->percpu)
108 return -ENOMEM; 108 return -ENOMEM;
109 109
@@ -118,13 +118,13 @@ static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
118 return 0; 118 return 0;
119} 119}
120 120
121int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) 121int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
122{ 122{
123 int ret; 123 int ret;
124 124
125 ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); 125 ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
126 if (!ret) { 126 if (!ret) {
127 ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); 127 ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
128 if (ret) 128 if (ret)
129 free_percpu(ic->i_cache_incs.percpu); 129 free_percpu(ic->i_cache_incs.percpu);
130 } 130 }
@@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
266 rds_ib_stats_inc(s_ib_rx_total_incs); 266 rds_ib_stats_inc(s_ib_rx_total_incs);
267 } 267 }
268 INIT_LIST_HEAD(&ibinc->ii_frags); 268 INIT_LIST_HEAD(&ibinc->ii_frags);
269 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); 269 rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
270 270
271 return ibinc; 271 return ibinc;
272} 272}
@@ -376,8 +376,6 @@ static void release_refill(struct rds_connection *conn)
376 * This tries to allocate and post unused work requests after making sure that 376 * This tries to allocate and post unused work requests after making sure that
377 * they have all the allocations they need to queue received fragments into 377 * they have all the allocations they need to queue received fragments into
378 * sockets. 378 * sockets.
379 *
380 * -1 is returned if posting fails due to temporary resource exhaustion.
381 */ 379 */
382void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) 380void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
383{ 381{
@@ -419,7 +417,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
419 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL); 417 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
420 if (ret) { 418 if (ret) {
421 rds_ib_conn_error(conn, "recv post on " 419 rds_ib_conn_error(conn, "recv post on "
422 "%pI4 returned %d, disconnecting and " 420 "%pI6c returned %d, disconnecting and "
423 "reconnecting\n", &conn->c_faddr, 421 "reconnecting\n", &conn->c_faddr,
424 ret); 422 ret);
425 break; 423 break;
@@ -848,7 +846,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
848 846
849 if (data_len < sizeof(struct rds_header)) { 847 if (data_len < sizeof(struct rds_header)) {
850 rds_ib_conn_error(conn, "incoming message " 848 rds_ib_conn_error(conn, "incoming message "
851 "from %pI4 didn't include a " 849 "from %pI6c didn't include a "
852 "header, disconnecting and " 850 "header, disconnecting and "
853 "reconnecting\n", 851 "reconnecting\n",
854 &conn->c_faddr); 852 &conn->c_faddr);
@@ -861,7 +859,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
861 /* Validate the checksum. */ 859 /* Validate the checksum. */
862 if (!rds_message_verify_checksum(ihdr)) { 860 if (!rds_message_verify_checksum(ihdr)) {
863 rds_ib_conn_error(conn, "incoming message " 861 rds_ib_conn_error(conn, "incoming message "
864 "from %pI4 has corrupted header - " 862 "from %pI6c has corrupted header - "
865 "forcing a reconnect\n", 863 "forcing a reconnect\n",
866 &conn->c_faddr); 864 &conn->c_faddr);
867 rds_stats_inc(s_recv_drop_bad_checksum); 865 rds_stats_inc(s_recv_drop_bad_checksum);
@@ -941,10 +939,10 @@ static void rds_ib_process_recv(struct rds_connection *conn,
941 ic->i_recv_data_rem = 0; 939 ic->i_recv_data_rem = 0;
942 ic->i_ibinc = NULL; 940 ic->i_ibinc = NULL;
943 941
944 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 942 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
945 rds_ib_cong_recv(conn, ibinc); 943 rds_ib_cong_recv(conn, ibinc);
946 else { 944 } else {
947 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, 945 rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
948 &ibinc->ii_inc, GFP_ATOMIC); 946 &ibinc->ii_inc, GFP_ATOMIC);
949 state->ack_next = be64_to_cpu(hdr->h_sequence); 947 state->ack_next = be64_to_cpu(hdr->h_sequence);
950 state->ack_next_valid = 1; 948 state->ack_next_valid = 1;
@@ -988,7 +986,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
988 } else { 986 } else {
989 /* We expect errors as the qp is drained during shutdown */ 987 /* We expect errors as the qp is drained during shutdown */
990 if (rds_conn_up(conn) || rds_conn_connecting(conn)) 988 if (rds_conn_up(conn) || rds_conn_connecting(conn))
991 rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", 989 rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
992 &conn->c_laddr, &conn->c_faddr, 990 &conn->c_laddr, &conn->c_faddr,
993 wc->status, 991 wc->status,
994 ib_wc_status_msg(wc->status)); 992 ib_wc_status_msg(wc->status));
@@ -1023,7 +1021,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
1023{ 1021{
1024 struct rds_connection *conn = cp->cp_conn; 1022 struct rds_connection *conn = cp->cp_conn;
1025 struct rds_ib_connection *ic = conn->c_transport_data; 1023 struct rds_ib_connection *ic = conn->c_transport_data;
1026 int ret = 0;
1027 1024
1028 rdsdebug("conn %p\n", conn); 1025 rdsdebug("conn %p\n", conn);
1029 if (rds_conn_up(conn)) { 1026 if (rds_conn_up(conn)) {
@@ -1032,7 +1029,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
1032 rds_ib_stats_inc(s_ib_rx_refill_from_thread); 1029 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
1033 } 1030 }
1034 1031
1035 return ret; 1032 return 0;
1036} 1033}
1037 1034
1038int rds_ib_recv_init(void) 1035int rds_ib_recv_init(void)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 8ac80c1b051e..2dcb555e6350 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
305 305
306 /* We expect errors as the qp is drained during shutdown */ 306 /* We expect errors as the qp is drained during shutdown */
307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { 307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
308 rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", 308 rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
309 &conn->c_laddr, &conn->c_faddr, wc->status, 309 &conn->c_laddr, &conn->c_faddr, wc->status,
310 ib_wc_status_msg(wc->status)); 310 ib_wc_status_msg(wc->status));
311 } 311 }
@@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
730 first, &first->s_wr, ret, failed_wr); 730 first, &first->s_wr, ret, failed_wr);
731 BUG_ON(failed_wr != &first->s_wr); 731 BUG_ON(failed_wr != &first->s_wr);
732 if (ret) { 732 if (ret) {
733 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " 733 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
734 "returned %d\n", &conn->c_faddr, ret); 734 "returned %d\n", &conn->c_faddr, ret);
735 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 735 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
736 rds_ib_sub_signaled(ic, nr_sig); 736 rds_ib_sub_signaled(ic, nr_sig);
@@ -759,14 +759,11 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
759 struct rds_ib_connection *ic = conn->c_transport_data; 759 struct rds_ib_connection *ic = conn->c_transport_data;
760 struct rds_ib_send_work *send = NULL; 760 struct rds_ib_send_work *send = NULL;
761 const struct ib_send_wr *failed_wr; 761 const struct ib_send_wr *failed_wr;
762 struct rds_ib_device *rds_ibdev;
763 u32 pos; 762 u32 pos;
764 u32 work_alloc; 763 u32 work_alloc;
765 int ret; 764 int ret;
766 int nr_sig = 0; 765 int nr_sig = 0;
767 766
768 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
769
770 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); 767 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
771 if (work_alloc != 1) { 768 if (work_alloc != 1) {
772 rds_ib_stats_inc(s_ib_tx_ring_full); 769 rds_ib_stats_inc(s_ib_tx_ring_full);
@@ -827,7 +824,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
827 send, &send->s_atomic_wr, ret, failed_wr); 824 send, &send->s_atomic_wr, ret, failed_wr);
828 BUG_ON(failed_wr != &send->s_atomic_wr.wr); 825 BUG_ON(failed_wr != &send->s_atomic_wr.wr);
829 if (ret) { 826 if (ret) {
830 printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " 827 printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
831 "returned %d\n", &conn->c_faddr, ret); 828 "returned %d\n", &conn->c_faddr, ret);
832 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 829 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
833 rds_ib_sub_signaled(ic, nr_sig); 830 rds_ib_sub_signaled(ic, nr_sig);
@@ -967,7 +964,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
967 first, &first->s_rdma_wr.wr, ret, failed_wr); 964 first, &first->s_rdma_wr.wr, ret, failed_wr);
968 BUG_ON(failed_wr != &first->s_rdma_wr.wr); 965 BUG_ON(failed_wr != &first->s_rdma_wr.wr);
969 if (ret) { 966 if (ret) {
970 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " 967 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
971 "returned %d\n", &conn->c_faddr, ret); 968 "returned %d\n", &conn->c_faddr, ret);
972 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 969 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
973 rds_ib_sub_signaled(ic, nr_sig); 970 rds_ib_sub_signaled(ic, nr_sig);
diff --git a/net/rds/loop.c b/net/rds/loop.c
index feea1f96ee2a..1d73ad79c847 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
35#include <linux/in.h> 35#include <linux/in.h>
36#include <net/net_namespace.h> 36#include <net/net_namespace.h>
37#include <net/netns/generic.h> 37#include <net/netns/generic.h>
38#include <linux/ipv6.h>
38 39
39#include "rds_single_path.h" 40#include "rds_single_path.h"
40#include "rds.h" 41#include "rds.h"
@@ -88,11 +89,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
88 89
89 BUG_ON(hdr_off || sg || off); 90 BUG_ON(hdr_off || sg || off);
90 91
91 rds_inc_init(&rm->m_inc, conn, conn->c_laddr); 92 rds_inc_init(&rm->m_inc, conn, &conn->c_laddr);
92 /* For the embedded inc. Matching put is in loop_inc_free() */ 93 /* For the embedded inc. Matching put is in loop_inc_free() */
93 rds_message_addref(rm); 94 rds_message_addref(rm);
94 95
95 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, 96 rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc,
96 GFP_KERNEL); 97 GFP_KERNEL);
97 98
98 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), 99 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
diff --git a/net/rds/message.c b/net/rds/message.c
index a35f76971984..4b00b1152a5f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -514,4 +514,3 @@ void rds_message_unmapped(struct rds_message *rm)
514 wake_up_interruptible(&rm->m_flush_wait); 514 wake_up_interruptible(&rm->m_flush_wait);
515} 515}
516EXPORT_SYMBOL_GPL(rds_message_unmapped); 516EXPORT_SYMBOL_GPL(rds_message_unmapped);
517
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 80920e47f2c7..98237feb607a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2007 Oracle. All rights reserved. 2 * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -184,7 +184,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
184 long i; 184 long i;
185 int ret; 185 int ret;
186 186
187 if (rs->rs_bound_addr == 0 || !rs->rs_transport) { 187 if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
188 ret = -ENOTCONN; /* XXX not a great errno */ 188 ret = -ENOTCONN; /* XXX not a great errno */
189 goto out; 189 goto out;
190 } 190 }
@@ -576,7 +576,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
576 576
577 args = CMSG_DATA(cmsg); 577 args = CMSG_DATA(cmsg);
578 578
579 if (rs->rs_bound_addr == 0) { 579 if (ipv6_addr_any(&rs->rs_bound_addr)) {
580 ret = -ENOTCONN; /* XXX not a great errno */ 580 ret = -ENOTCONN; /* XXX not a great errno */
581 goto out_ret; 581 goto out_ret;
582 } 582 }
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index fc59821f0a27..6b0f57c83a2a 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2009 Oracle. All rights reserved. 2 * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -37,10 +37,15 @@
37#include "rdma_transport.h" 37#include "rdma_transport.h"
38#include "ib.h" 38#include "ib.h"
39 39
40/* Global IPv4 and IPv6 RDS RDMA listener cm_id */
40static struct rdma_cm_id *rds_rdma_listen_id; 41static struct rdma_cm_id *rds_rdma_listen_id;
42#if IS_ENABLED(CONFIG_IPV6)
43static struct rdma_cm_id *rds6_rdma_listen_id;
44#endif
41 45
42int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 46static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
43 struct rdma_cm_event *event) 47 struct rdma_cm_event *event,
48 bool isv6)
44{ 49{
45 /* this can be null in the listening path */ 50 /* this can be null in the listening path */
46 struct rds_connection *conn = cm_id->context; 51 struct rds_connection *conn = cm_id->context;
@@ -72,7 +77,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
72 77
73 switch (event->event) { 78 switch (event->event) {
74 case RDMA_CM_EVENT_CONNECT_REQUEST: 79 case RDMA_CM_EVENT_CONNECT_REQUEST:
75 ret = trans->cm_handle_connect(cm_id, event); 80 ret = trans->cm_handle_connect(cm_id, event, isv6);
76 break; 81 break;
77 82
78 case RDMA_CM_EVENT_ADDR_RESOLVED: 83 case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -90,7 +95,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
90 95
91 ibic = conn->c_transport_data; 96 ibic = conn->c_transport_data;
92 if (ibic && ibic->i_cm_id == cm_id) 97 if (ibic && ibic->i_cm_id == cm_id)
93 ret = trans->cm_initiate_connect(cm_id); 98 ret = trans->cm_initiate_connect(cm_id, isv6);
94 else 99 else
95 rds_conn_drop(conn); 100 rds_conn_drop(conn);
96 } 101 }
@@ -116,14 +121,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
116 121
117 case RDMA_CM_EVENT_DISCONNECTED: 122 case RDMA_CM_EVENT_DISCONNECTED:
118 rdsdebug("DISCONNECT event - dropping connection " 123 rdsdebug("DISCONNECT event - dropping connection "
119 "%pI4->%pI4\n", &conn->c_laddr, 124 "%pI6c->%pI6c\n", &conn->c_laddr,
120 &conn->c_faddr); 125 &conn->c_faddr);
121 rds_conn_drop(conn); 126 rds_conn_drop(conn);
122 break; 127 break;
123 128
124 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 129 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
125 if (conn) { 130 if (conn) {
126 pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", 131 pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n",
127 &conn->c_laddr, &conn->c_faddr); 132 &conn->c_laddr, &conn->c_faddr);
128 rds_conn_drop(conn); 133 rds_conn_drop(conn);
129 } 134 }
@@ -146,13 +151,28 @@ out:
146 return ret; 151 return ret;
147} 152}
148 153
149static int rds_rdma_listen_init(void) 154int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
155 struct rdma_cm_event *event)
156{
157 return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
158}
159
160#if IS_ENABLED(CONFIG_IPV6)
161int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
162 struct rdma_cm_event *event)
163{
164 return rds_rdma_cm_event_handler_cmn(cm_id, event, true);
165}
166#endif
167
168static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
169 struct sockaddr *sa,
170 struct rdma_cm_id **ret_cm_id)
150{ 171{
151 struct sockaddr_in sin;
152 struct rdma_cm_id *cm_id; 172 struct rdma_cm_id *cm_id;
153 int ret; 173 int ret;
154 174
155 cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, 175 cm_id = rdma_create_id(&init_net, handler, NULL,
156 RDMA_PS_TCP, IB_QPT_RC); 176 RDMA_PS_TCP, IB_QPT_RC);
157 if (IS_ERR(cm_id)) { 177 if (IS_ERR(cm_id)) {
158 ret = PTR_ERR(cm_id); 178 ret = PTR_ERR(cm_id);
@@ -161,15 +181,11 @@ static int rds_rdma_listen_init(void)
161 return ret; 181 return ret;
162 } 182 }
163 183
164 sin.sin_family = AF_INET;
165 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
166 sin.sin_port = (__force u16)htons(RDS_PORT);
167
168 /* 184 /*
169 * XXX I bet this binds the cm_id to a device. If we want to support 185 * XXX I bet this binds the cm_id to a device. If we want to support
170 * fail-over we'll have to take this into consideration. 186 * fail-over we'll have to take this into consideration.
171 */ 187 */
172 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 188 ret = rdma_bind_addr(cm_id, sa);
173 if (ret) { 189 if (ret) {
174 printk(KERN_ERR "RDS/RDMA: failed to setup listener, " 190 printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
175 "rdma_bind_addr() returned %d\n", ret); 191 "rdma_bind_addr() returned %d\n", ret);
@@ -185,7 +201,7 @@ static int rds_rdma_listen_init(void)
185 201
186 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); 202 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
187 203
188 rds_rdma_listen_id = cm_id; 204 *ret_cm_id = cm_id;
189 cm_id = NULL; 205 cm_id = NULL;
190out: 206out:
191 if (cm_id) 207 if (cm_id)
@@ -193,6 +209,45 @@ out:
193 return ret; 209 return ret;
194} 210}
195 211
212/* Initialize the RDS RDMA listeners. We create two listeners for
213 * compatibility reason. The one on RDS_PORT is used for IPv4
214 * requests only. The one on RDS_CM_PORT is used for IPv6 requests
215 * only. So only IPv6 enabled RDS module will communicate using this
216 * port.
217 */
218static int rds_rdma_listen_init(void)
219{
220 int ret;
221#if IS_ENABLED(CONFIG_IPV6)
222 struct sockaddr_in6 sin6;
223#endif
224 struct sockaddr_in sin;
225
226 sin.sin_family = PF_INET;
227 sin.sin_addr.s_addr = htonl(INADDR_ANY);
228 sin.sin_port = htons(RDS_PORT);
229 ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
230 (struct sockaddr *)&sin,
231 &rds_rdma_listen_id);
232 if (ret != 0)
233 return ret;
234
235#if IS_ENABLED(CONFIG_IPV6)
236 sin6.sin6_family = PF_INET6;
237 sin6.sin6_addr = in6addr_any;
238 sin6.sin6_port = htons(RDS_CM_PORT);
239 sin6.sin6_scope_id = 0;
240 sin6.sin6_flowinfo = 0;
241 ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
242 (struct sockaddr *)&sin6,
243 &rds6_rdma_listen_id);
244 /* Keep going even when IPv6 is not enabled in the system. */
245 if (ret != 0)
246 rdsdebug("Cannot set up IPv6 RDMA listener\n");
247#endif
248 return 0;
249}
250
196static void rds_rdma_listen_stop(void) 251static void rds_rdma_listen_stop(void)
197{ 252{
198 if (rds_rdma_listen_id) { 253 if (rds_rdma_listen_id) {
@@ -200,6 +255,13 @@ static void rds_rdma_listen_stop(void)
200 rdma_destroy_id(rds_rdma_listen_id); 255 rdma_destroy_id(rds_rdma_listen_id);
201 rds_rdma_listen_id = NULL; 256 rds_rdma_listen_id = NULL;
202 } 257 }
258#if IS_ENABLED(CONFIG_IPV6)
259 if (rds6_rdma_listen_id) {
260 rdsdebug("cm %p\n", rds6_rdma_listen_id);
261 rdma_destroy_id(rds6_rdma_listen_id);
262 rds6_rdma_listen_id = NULL;
263 }
264#endif
203} 265}
204 266
205static int rds_rdma_init(void) 267static int rds_rdma_init(void)
@@ -229,4 +291,3 @@ module_exit(rds_rdma_exit);
229MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 291MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
230MODULE_DESCRIPTION("RDS: IB transport"); 292MODULE_DESCRIPTION("RDS: IB transport");
231MODULE_LICENSE("Dual BSD/GPL"); 293MODULE_LICENSE("Dual BSD/GPL");
232
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index d309c4430124..200d3134aaae 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -6,11 +6,16 @@
6#include <rdma/rdma_cm.h> 6#include <rdma/rdma_cm.h>
7#include "rds.h" 7#include "rds.h"
8 8
9/* RDMA_CM also uses 16385 as the listener port. */
10#define RDS_CM_PORT 16385
11
9#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 12#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
10 13
11int rds_rdma_conn_connect(struct rds_connection *conn); 14int rds_rdma_conn_connect(struct rds_connection *conn);
12int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 15int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
13 struct rdma_cm_event *event); 16 struct rdma_cm_event *event);
17int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
18 struct rdma_cm_event *event);
14 19
15/* from ib.c */ 20/* from ib.c */
16extern struct rds_transport rds_ib_transport; 21extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 60b3b787fbdb..c4dcf654d8fe 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -10,6 +10,7 @@
10#include <linux/rds.h> 10#include <linux/rds.h>
11#include <linux/rhashtable.h> 11#include <linux/rhashtable.h>
12#include <linux/refcount.h> 12#include <linux/refcount.h>
13#include <linux/in6.h>
13 14
14#include "info.h" 15#include "info.h"
15 16
@@ -23,11 +24,13 @@
23#define RDS_PROTOCOL_MINOR(v) ((v) & 255) 24#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
24#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) 25#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
25 26
26/* 27/* The following ports, 16385, 18634, 18635, are registered with IANA as
27 * XXX randomly chosen, but at least seems to be unused: 28 * the ports to be used for RDS over TCP and UDP. Currently, only RDS over
28 * # 18464-18768 Unassigned 29 * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value
29 * We should do better. We want a reserved port to discourage unpriv'ed 30 * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After
30 * userspace from listening. 31 * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept
32 * to ensure compatibility with older RDS modules. Those ports are defined
33 * in each transport's header file.
31 */ 34 */
32#define RDS_PORT 18634 35#define RDS_PORT 18634
33 36
@@ -61,7 +64,7 @@ void rdsdebug(char *fmt, ...)
61 64
62struct rds_cong_map { 65struct rds_cong_map {
63 struct rb_node m_rb_node; 66 struct rb_node m_rb_node;
64 __be32 m_addr; 67 struct in6_addr m_addr;
65 wait_queue_head_t m_waitq; 68 wait_queue_head_t m_waitq;
66 struct list_head m_conn_list; 69 struct list_head m_conn_list;
67 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; 70 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
@@ -136,11 +139,14 @@ struct rds_conn_path {
136/* One rds_connection per RDS address pair */ 139/* One rds_connection per RDS address pair */
137struct rds_connection { 140struct rds_connection {
138 struct hlist_node c_hash_node; 141 struct hlist_node c_hash_node;
139 __be32 c_laddr; 142 struct in6_addr c_laddr;
140 __be32 c_faddr; 143 struct in6_addr c_faddr;
144 int c_dev_if; /* ifindex used for this conn */
145 int c_bound_if; /* ifindex of c_laddr */
141 unsigned int c_loopback:1, 146 unsigned int c_loopback:1,
147 c_isv6:1,
142 c_ping_triggered:1, 148 c_ping_triggered:1,
143 c_pad_to_32:30; 149 c_pad_to_32:29;
144 int c_npaths; 150 int c_npaths;
145 struct rds_connection *c_passive; 151 struct rds_connection *c_passive;
146 struct rds_transport *c_trans; 152 struct rds_transport *c_trans;
@@ -269,7 +275,7 @@ struct rds_incoming {
269 struct rds_conn_path *i_conn_path; 275 struct rds_conn_path *i_conn_path;
270 struct rds_header i_hdr; 276 struct rds_header i_hdr;
271 unsigned long i_rx_jiffies; 277 unsigned long i_rx_jiffies;
272 __be32 i_saddr; 278 struct in6_addr i_saddr;
273 279
274 rds_rdma_cookie_t i_rdma_cookie; 280 rds_rdma_cookie_t i_rdma_cookie;
275 struct timeval i_rx_tstamp; 281 struct timeval i_rx_tstamp;
@@ -386,7 +392,7 @@ struct rds_message {
386 struct list_head m_conn_item; 392 struct list_head m_conn_item;
387 struct rds_incoming m_inc; 393 struct rds_incoming m_inc;
388 u64 m_ack_seq; 394 u64 m_ack_seq;
389 __be32 m_daddr; 395 struct in6_addr m_daddr;
390 unsigned long m_flags; 396 unsigned long m_flags;
391 397
392 /* Never access m_rs without holding m_rs_lock. 398 /* Never access m_rs without holding m_rs_lock.
@@ -521,7 +527,8 @@ struct rds_transport {
521 t_mp_capable:1; 527 t_mp_capable:1;
522 unsigned int t_type; 528 unsigned int t_type;
523 529
524 int (*laddr_check)(struct net *net, __be32 addr); 530 int (*laddr_check)(struct net *net, const struct in6_addr *addr,
531 __u32 scope_id);
525 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); 532 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
526 void (*conn_free)(void *data); 533 void (*conn_free)(void *data);
527 int (*conn_path_connect)(struct rds_conn_path *cp); 534 int (*conn_path_connect)(struct rds_conn_path *cp);
@@ -537,8 +544,8 @@ struct rds_transport {
537 void (*inc_free)(struct rds_incoming *inc); 544 void (*inc_free)(struct rds_incoming *inc);
538 545
539 int (*cm_handle_connect)(struct rdma_cm_id *cm_id, 546 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
540 struct rdma_cm_event *event); 547 struct rdma_cm_event *event, bool isv6);
541 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); 548 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
542 void (*cm_connect_complete)(struct rds_connection *conn, 549 void (*cm_connect_complete)(struct rds_connection *conn,
543 struct rdma_cm_event *event); 550 struct rdma_cm_event *event);
544 551
@@ -554,6 +561,12 @@ struct rds_transport {
554 bool (*t_unloading)(struct rds_connection *conn); 561 bool (*t_unloading)(struct rds_connection *conn);
555}; 562};
556 563
564/* Bind hash table key length. It is the sum of the size of a struct
565 * in6_addr, a scope_id and a port.
566 */
567#define RDS_BOUND_KEY_LEN \
568 (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
569
557struct rds_sock { 570struct rds_sock {
558 struct sock rs_sk; 571 struct sock rs_sk;
559 572
@@ -565,10 +578,14 @@ struct rds_sock {
565 * support. 578 * support.
566 */ 579 */
567 struct rhash_head rs_bound_node; 580 struct rhash_head rs_bound_node;
568 u64 rs_bound_key; 581 u8 rs_bound_key[RDS_BOUND_KEY_LEN];
569 __be32 rs_bound_addr; 582 struct sockaddr_in6 rs_bound_sin6;
570 __be32 rs_conn_addr; 583#define rs_bound_addr rs_bound_sin6.sin6_addr
571 __be16 rs_bound_port; 584#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3]
585#define rs_bound_port rs_bound_sin6.sin6_port
586#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id
587 struct in6_addr rs_conn_addr;
588#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3]
572 __be16 rs_conn_port; 589 __be16 rs_conn_port;
573 struct rds_transport *rs_transport; 590 struct rds_transport *rs_transport;
574 591
@@ -704,7 +721,8 @@ extern wait_queue_head_t rds_poll_waitq;
704/* bind.c */ 721/* bind.c */
705int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); 722int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
706void rds_remove_bound(struct rds_sock *rs); 723void rds_remove_bound(struct rds_sock *rs);
707struct rds_sock *rds_find_bound(__be32 addr, __be16 port); 724struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
725 __u32 scope_id);
708int rds_bind_lock_init(void); 726int rds_bind_lock_init(void);
709void rds_bind_lock_destroy(void); 727void rds_bind_lock_destroy(void);
710 728
@@ -723,16 +741,20 @@ void rds_cong_remove_socket(struct rds_sock *);
723void rds_cong_exit(void); 741void rds_cong_exit(void);
724struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); 742struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
725 743
726/* conn.c */ 744/* connection.c */
727extern u32 rds_gen_num; 745extern u32 rds_gen_num;
728int rds_conn_init(void); 746int rds_conn_init(void);
729void rds_conn_exit(void); 747void rds_conn_exit(void);
730struct rds_connection *rds_conn_create(struct net *net, 748struct rds_connection *rds_conn_create(struct net *net,
731 __be32 laddr, __be32 faddr, 749 const struct in6_addr *laddr,
732 struct rds_transport *trans, gfp_t gfp); 750 const struct in6_addr *faddr,
751 struct rds_transport *trans, gfp_t gfp,
752 int dev_if);
733struct rds_connection *rds_conn_create_outgoing(struct net *net, 753struct rds_connection *rds_conn_create_outgoing(struct net *net,
734 __be32 laddr, __be32 faddr, 754 const struct in6_addr *laddr,
735 struct rds_transport *trans, gfp_t gfp); 755 const struct in6_addr *faddr,
756 struct rds_transport *trans,
757 gfp_t gfp, int dev_if);
736void rds_conn_shutdown(struct rds_conn_path *cpath); 758void rds_conn_shutdown(struct rds_conn_path *cpath);
737void rds_conn_destroy(struct rds_connection *conn); 759void rds_conn_destroy(struct rds_connection *conn);
738void rds_conn_drop(struct rds_connection *conn); 760void rds_conn_drop(struct rds_connection *conn);
@@ -843,11 +865,12 @@ void rds_page_exit(void);
843 865
844/* recv.c */ 866/* recv.c */
845void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 867void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
846 __be32 saddr); 868 struct in6_addr *saddr);
847void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, 869void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
848 __be32 saddr); 870 struct in6_addr *saddr);
849void rds_inc_put(struct rds_incoming *inc); 871void rds_inc_put(struct rds_incoming *inc);
850void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 872void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
873 struct in6_addr *daddr,
851 struct rds_incoming *inc, gfp_t gfp); 874 struct rds_incoming *inc, gfp_t gfp);
852int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 875int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
853 int msg_flags); 876 int msg_flags);
@@ -856,13 +879,17 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
856void rds_inc_info_copy(struct rds_incoming *inc, 879void rds_inc_info_copy(struct rds_incoming *inc,
857 struct rds_info_iterator *iter, 880 struct rds_info_iterator *iter,
858 __be32 saddr, __be32 daddr, int flip); 881 __be32 saddr, __be32 daddr, int flip);
882void rds6_inc_info_copy(struct rds_incoming *inc,
883 struct rds_info_iterator *iter,
884 struct in6_addr *saddr, struct in6_addr *daddr,
885 int flip);
859 886
860/* send.c */ 887/* send.c */
861int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); 888int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
862void rds_send_path_reset(struct rds_conn_path *conn); 889void rds_send_path_reset(struct rds_conn_path *conn);
863int rds_send_xmit(struct rds_conn_path *cp); 890int rds_send_xmit(struct rds_conn_path *cp);
864struct sockaddr_in; 891struct sockaddr_in;
865void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); 892void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
866typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); 893typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
867void rds_send_drop_acked(struct rds_connection *conn, u64 ack, 894void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
868 is_acked_func is_acked); 895 is_acked_func is_acked);
@@ -949,11 +976,14 @@ void rds_send_worker(struct work_struct *);
949void rds_recv_worker(struct work_struct *); 976void rds_recv_worker(struct work_struct *);
950void rds_connect_path_complete(struct rds_conn_path *conn, int curr); 977void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
951void rds_connect_complete(struct rds_connection *conn); 978void rds_connect_complete(struct rds_connection *conn);
979int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
952 980
953/* transport.c */ 981/* transport.c */
954void rds_trans_register(struct rds_transport *trans); 982void rds_trans_register(struct rds_transport *trans);
955void rds_trans_unregister(struct rds_transport *trans); 983void rds_trans_unregister(struct rds_transport *trans);
956struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); 984struct rds_transport *rds_trans_get_preferred(struct net *net,
985 const struct in6_addr *addr,
986 __u32 scope_id);
957void rds_trans_put(struct rds_transport *trans); 987void rds_trans_put(struct rds_transport *trans);
958unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, 988unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
959 unsigned int avail); 989 unsigned int avail);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 192ac6f78ded..504cd6bcc54c 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -41,14 +41,14 @@
41#include "rds.h" 41#include "rds.h"
42 42
43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 43void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
44 __be32 saddr) 44 struct in6_addr *saddr)
45{ 45{
46 int i; 46 int i;
47 47
48 refcount_set(&inc->i_refcount, 1); 48 refcount_set(&inc->i_refcount, 1);
49 INIT_LIST_HEAD(&inc->i_item); 49 INIT_LIST_HEAD(&inc->i_item);
50 inc->i_conn = conn; 50 inc->i_conn = conn;
51 inc->i_saddr = saddr; 51 inc->i_saddr = *saddr;
52 inc->i_rdma_cookie = 0; 52 inc->i_rdma_cookie = 0;
53 inc->i_rx_tstamp.tv_sec = 0; 53 inc->i_rx_tstamp.tv_sec = 0;
54 inc->i_rx_tstamp.tv_usec = 0; 54 inc->i_rx_tstamp.tv_usec = 0;
@@ -59,13 +59,13 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
59EXPORT_SYMBOL_GPL(rds_inc_init); 59EXPORT_SYMBOL_GPL(rds_inc_init);
60 60
61void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, 61void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
62 __be32 saddr) 62 struct in6_addr *saddr)
63{ 63{
64 refcount_set(&inc->i_refcount, 1); 64 refcount_set(&inc->i_refcount, 1);
65 INIT_LIST_HEAD(&inc->i_item); 65 INIT_LIST_HEAD(&inc->i_item);
66 inc->i_conn = cp->cp_conn; 66 inc->i_conn = cp->cp_conn;
67 inc->i_conn_path = cp; 67 inc->i_conn_path = cp;
68 inc->i_saddr = saddr; 68 inc->i_saddr = *saddr;
69 inc->i_rdma_cookie = 0; 69 inc->i_rdma_cookie = 0;
70 inc->i_rx_tstamp.tv_sec = 0; 70 inc->i_rx_tstamp.tv_sec = 0;
71 inc->i_rx_tstamp.tv_usec = 0; 71 inc->i_rx_tstamp.tv_usec = 0;
@@ -110,7 +110,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
110 110
111 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); 111 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
112 112
113 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " 113 rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
114 "now_cong %d delta %d\n", 114 "now_cong %d delta %d\n",
115 rs, &rs->rs_bound_addr, 115 rs, &rs->rs_bound_addr,
116 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, 116 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
@@ -260,7 +260,7 @@ static void rds_start_mprds(struct rds_connection *conn)
260 struct rds_conn_path *cp; 260 struct rds_conn_path *cp;
261 261
262 if (conn->c_npaths > 1 && 262 if (conn->c_npaths > 1 &&
263 IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { 263 rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
264 for (i = 0; i < conn->c_npaths; i++) { 264 for (i = 0; i < conn->c_npaths; i++) {
265 cp = &conn->c_path[i]; 265 cp = &conn->c_path[i];
266 rds_conn_path_connect_if_down(cp); 266 rds_conn_path_connect_if_down(cp);
@@ -284,7 +284,8 @@ static void rds_start_mprds(struct rds_connection *conn)
284 * conn. This lets loopback, who only has one conn for both directions, 284 * conn. This lets loopback, who only has one conn for both directions,
285 * tell us which roles the addrs in the conn are playing for this message. 285 * tell us which roles the addrs in the conn are playing for this message.
286 */ 286 */
287void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 287void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
288 struct in6_addr *daddr,
288 struct rds_incoming *inc, gfp_t gfp) 289 struct rds_incoming *inc, gfp_t gfp)
289{ 290{
290 struct rds_sock *rs = NULL; 291 struct rds_sock *rs = NULL;
@@ -339,7 +340,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
339 340
340 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { 341 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
341 if (inc->i_hdr.h_sport == 0) { 342 if (inc->i_hdr.h_sport == 0) {
342 rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); 343 rdsdebug("ignore ping with 0 sport from %pI6c\n",
344 saddr);
343 goto out; 345 goto out;
344 } 346 }
345 rds_stats_inc(s_recv_ping); 347 rds_stats_inc(s_recv_ping);
@@ -362,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
362 goto out; 364 goto out;
363 } 365 }
364 366
365 rs = rds_find_bound(daddr, inc->i_hdr.h_dport); 367 rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
366 if (!rs) { 368 if (!rs) {
367 rds_stats_inc(s_recv_drop_no_sock); 369 rds_stats_inc(s_recv_drop_no_sock);
368 goto out; 370 goto out;
@@ -625,6 +627,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
625 struct rds_sock *rs = rds_sk_to_rs(sk); 627 struct rds_sock *rs = rds_sk_to_rs(sk);
626 long timeo; 628 long timeo;
627 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; 629 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
630 DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
628 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); 631 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
629 struct rds_incoming *inc = NULL; 632 struct rds_incoming *inc = NULL;
630 633
@@ -673,7 +676,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
673 break; 676 break;
674 } 677 }
675 678
676 rdsdebug("copying inc %p from %pI4:%u to user\n", inc, 679 rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
677 &inc->i_conn->c_faddr, 680 &inc->i_conn->c_faddr,
678 ntohs(inc->i_hdr.h_sport)); 681 ntohs(inc->i_hdr.h_sport));
679 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); 682 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
@@ -707,12 +710,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
707 710
708 rds_stats_inc(s_recv_delivered); 711 rds_stats_inc(s_recv_delivered);
709 712
710 if (sin) { 713 if (msg->msg_name) {
711 sin->sin_family = AF_INET; 714 if (ipv6_addr_v4mapped(&inc->i_saddr)) {
712 sin->sin_port = inc->i_hdr.h_sport; 715 sin = (struct sockaddr_in *)msg->msg_name;
713 sin->sin_addr.s_addr = inc->i_saddr; 716
714 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 717 sin->sin_family = AF_INET;
715 msg->msg_namelen = sizeof(*sin); 718 sin->sin_port = inc->i_hdr.h_sport;
719 sin->sin_addr.s_addr =
720 inc->i_saddr.s6_addr32[3];
721 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
722 msg->msg_namelen = sizeof(*sin);
723 } else {
724 sin6 = (struct sockaddr_in6 *)msg->msg_name;
725
726 sin6->sin6_family = AF_INET6;
727 sin6->sin6_port = inc->i_hdr.h_sport;
728 sin6->sin6_addr = inc->i_saddr;
729 sin6->sin6_flowinfo = 0;
730 sin6->sin6_scope_id = rs->rs_bound_scope_id;
731 msg->msg_namelen = sizeof(*sin6);
732 }
716 } 733 }
717 break; 734 break;
718 } 735 }
@@ -775,3 +792,30 @@ void rds_inc_info_copy(struct rds_incoming *inc,
775 792
776 rds_info_copy(iter, &minfo, sizeof(minfo)); 793 rds_info_copy(iter, &minfo, sizeof(minfo));
777} 794}
795
796#if IS_ENABLED(CONFIG_IPV6)
797void rds6_inc_info_copy(struct rds_incoming *inc,
798 struct rds_info_iterator *iter,
799 struct in6_addr *saddr, struct in6_addr *daddr,
800 int flip)
801{
802 struct rds6_info_message minfo6;
803
804 minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
805 minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
806
807 if (flip) {
808 minfo6.laddr = *daddr;
809 minfo6.faddr = *saddr;
810 minfo6.lport = inc->i_hdr.h_dport;
811 minfo6.fport = inc->i_hdr.h_sport;
812 } else {
813 minfo6.laddr = *saddr;
814 minfo6.faddr = *daddr;
815 minfo6.lport = inc->i_hdr.h_sport;
816 minfo6.fport = inc->i_hdr.h_dport;
817 }
818
819 rds_info_copy(iter, &minfo6, sizeof(minfo6));
820}
821#endif
diff --git a/net/rds/send.c b/net/rds/send.c
index 59f17a2335f4..57b3d5a8b2db 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
709} 709}
710EXPORT_SYMBOL_GPL(rds_send_drop_acked); 710EXPORT_SYMBOL_GPL(rds_send_drop_acked);
711 711
712void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) 712void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
713{ 713{
714 struct rds_message *rm, *tmp; 714 struct rds_message *rm, *tmp;
715 struct rds_connection *conn; 715 struct rds_connection *conn;
@@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
721 spin_lock_irqsave(&rs->rs_lock, flags); 721 spin_lock_irqsave(&rs->rs_lock, flags);
722 722
723 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { 723 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
724 if (dest && (dest->sin_addr.s_addr != rm->m_daddr || 724 if (dest &&
725 dest->sin_port != rm->m_inc.i_hdr.h_dport)) 725 (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
726 dest->sin6_port != rm->m_inc.i_hdr.h_dport))
726 continue; 727 continue;
727 728
728 list_move(&rm->m_sock_item, &list); 729 list_move(&rm->m_sock_item, &list);
@@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1059{ 1060{
1060 struct sock *sk = sock->sk; 1061 struct sock *sk = sock->sk;
1061 struct rds_sock *rs = rds_sk_to_rs(sk); 1062 struct rds_sock *rs = rds_sk_to_rs(sk);
1063 DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
1062 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); 1064 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
1063 __be32 daddr;
1064 __be16 dport; 1065 __be16 dport;
1065 struct rds_message *rm = NULL; 1066 struct rds_message *rm = NULL;
1066 struct rds_connection *conn; 1067 struct rds_connection *conn;
@@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1069 int nonblock = msg->msg_flags & MSG_DONTWAIT; 1070 int nonblock = msg->msg_flags & MSG_DONTWAIT;
1070 long timeo = sock_sndtimeo(sk, nonblock); 1071 long timeo = sock_sndtimeo(sk, nonblock);
1071 struct rds_conn_path *cpath; 1072 struct rds_conn_path *cpath;
1073 struct in6_addr daddr;
1074 __u32 scope_id = 0;
1072 size_t total_payload_len = payload_len, rdma_payload_len = 0; 1075 size_t total_payload_len = payload_len, rdma_payload_len = 0;
1073 bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && 1076 bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
1074 sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); 1077 sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
1075 int num_sgs = ceil(payload_len, PAGE_SIZE); 1078 int num_sgs = ceil(payload_len, PAGE_SIZE);
1079 int namelen;
1076 1080
1077 /* Mirror Linux UDP mirror of BSD error message compatibility */ 1081 /* Mirror Linux UDP mirror of BSD error message compatibility */
1078 /* XXX: Perhaps MSG_MORE someday */ 1082 /* XXX: Perhaps MSG_MORE someday */
@@ -1081,27 +1085,108 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1081 goto out; 1085 goto out;
1082 } 1086 }
1083 1087
1084 if (msg->msg_namelen) { 1088 namelen = msg->msg_namelen;
1085 /* XXX fail non-unicast destination IPs? */ 1089 if (namelen != 0) {
1086 if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { 1090 if (namelen < sizeof(*usin)) {
1091 ret = -EINVAL;
1092 goto out;
1093 }
1094 switch (usin->sin_family) {
1095 case AF_INET:
1096 if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
1097 usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
1098 IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
1099 ret = -EINVAL;
1100 goto out;
1101 }
1102 ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
1103 dport = usin->sin_port;
1104 break;
1105
1106#if IS_ENABLED(CONFIG_IPV6)
1107 case AF_INET6: {
1108 int addr_type;
1109
1110 if (namelen < sizeof(*sin6)) {
1111 ret = -EINVAL;
1112 goto out;
1113 }
1114 addr_type = ipv6_addr_type(&sin6->sin6_addr);
1115 if (!(addr_type & IPV6_ADDR_UNICAST)) {
1116 __be32 addr4;
1117
1118 if (!(addr_type & IPV6_ADDR_MAPPED)) {
1119 ret = -EINVAL;
1120 goto out;
1121 }
1122
1123 /* It is a mapped address. Need to do some
1124 * sanity checks.
1125 */
1126 addr4 = sin6->sin6_addr.s6_addr32[3];
1127 if (addr4 == htonl(INADDR_ANY) ||
1128 addr4 == htonl(INADDR_BROADCAST) ||
1129 IN_MULTICAST(ntohl(addr4))) {
1130 ret = -EINVAL;
1131 goto out;
1132 }
1133 }
1134 if (addr_type & IPV6_ADDR_LINKLOCAL) {
1135 if (sin6->sin6_scope_id == 0) {
1136 ret = -EINVAL;
1137 goto out;
1138 }
1139 scope_id = sin6->sin6_scope_id;
1140 }
1141
1142 daddr = sin6->sin6_addr;
1143 dport = sin6->sin6_port;
1144 break;
1145 }
1146#endif
1147
1148 default:
1087 ret = -EINVAL; 1149 ret = -EINVAL;
1088 goto out; 1150 goto out;
1089 } 1151 }
1090 daddr = usin->sin_addr.s_addr;
1091 dport = usin->sin_port;
1092 } else { 1152 } else {
1093 /* We only care about consistency with ->connect() */ 1153 /* We only care about consistency with ->connect() */
1094 lock_sock(sk); 1154 lock_sock(sk);
1095 daddr = rs->rs_conn_addr; 1155 daddr = rs->rs_conn_addr;
1096 dport = rs->rs_conn_port; 1156 dport = rs->rs_conn_port;
1157 scope_id = rs->rs_bound_scope_id;
1097 release_sock(sk); 1158 release_sock(sk);
1098 } 1159 }
1099 1160
1100 lock_sock(sk); 1161 lock_sock(sk);
1101 if (daddr == 0 || rs->rs_bound_addr == 0) { 1162 if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
1102 release_sock(sk); 1163 release_sock(sk);
1103 ret = -ENOTCONN; /* XXX not a great errno */ 1164 ret = -ENOTCONN;
1104 goto out; 1165 goto out;
1166 } else if (namelen != 0) {
1167 /* Cannot send to an IPv4 address using an IPv6 source
1168 * address and cannot send to an IPv6 address using an
1169 * IPv4 source address.
1170 */
1171 if (ipv6_addr_v4mapped(&daddr) ^
1172 ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
1173 release_sock(sk);
1174 ret = -EOPNOTSUPP;
1175 goto out;
1176 }
1177 /* If the socket is already bound to a link local address,
1178 * it can only send to peers on the same link. But allow
1179 * communicating beween link local and non-link local address.
1180 */
1181 if (scope_id != rs->rs_bound_scope_id) {
1182 if (!scope_id) {
1183 scope_id = rs->rs_bound_scope_id;
1184 } else if (rs->rs_bound_scope_id) {
1185 release_sock(sk);
1186 ret = -EINVAL;
1187 goto out;
1188 }
1189 }
1105 } 1190 }
1106 release_sock(sk); 1191 release_sock(sk);
1107 1192
@@ -1155,13 +1240,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1155 1240
1156 /* rds_conn_create has a spinlock that runs with IRQ off. 1241 /* rds_conn_create has a spinlock that runs with IRQ off.
1157 * Caching the conn in the socket helps a lot. */ 1242 * Caching the conn in the socket helps a lot. */
1158 if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) 1243 if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
1159 conn = rs->rs_conn; 1244 conn = rs->rs_conn;
1160 else { 1245 else {
1161 conn = rds_conn_create_outgoing(sock_net(sock->sk), 1246 conn = rds_conn_create_outgoing(sock_net(sock->sk),
1162 rs->rs_bound_addr, daddr, 1247 &rs->rs_bound_addr, &daddr,
1163 rs->rs_transport, 1248 rs->rs_transport,
1164 sock->sk->sk_allocation); 1249 sock->sk->sk_allocation,
1250 scope_id);
1165 if (IS_ERR(conn)) { 1251 if (IS_ERR(conn)) {
1166 ret = PTR_ERR(conn); 1252 ret = PTR_ERR(conn);
1167 goto out; 1253 goto out;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 351a28474667..2c7b7c352d3e 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,8 @@
37#include <net/tcp.h> 37#include <net/tcp.h>
38#include <net/net_namespace.h> 38#include <net/net_namespace.h>
39#include <net/netns/generic.h> 39#include <net/netns/generic.h>
40#include <net/tcp.h>
41#include <net/addrconf.h>
40 42
41#include "rds.h" 43#include "rds.h"
42#include "tcp.h" 44#include "tcp.h"
@@ -44,7 +46,14 @@
44/* only for info exporting */ 46/* only for info exporting */
45static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); 47static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
46static LIST_HEAD(rds_tcp_tc_list); 48static LIST_HEAD(rds_tcp_tc_list);
49
50/* rds_tcp_tc_count counts only IPv4 connections.
51 * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
52 */
47static unsigned int rds_tcp_tc_count; 53static unsigned int rds_tcp_tc_count;
54#if IS_ENABLED(CONFIG_IPV6)
55static unsigned int rds6_tcp_tc_count;
56#endif
48 57
49/* Track rds_tcp_connection structs so they can be cleaned up */ 58/* Track rds_tcp_connection structs so they can be cleaned up */
50static DEFINE_SPINLOCK(rds_tcp_conn_lock); 59static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -111,7 +120,11 @@ void rds_tcp_restore_callbacks(struct socket *sock,
111 /* done under the callback_lock to serialize with write_space */ 120 /* done under the callback_lock to serialize with write_space */
112 spin_lock(&rds_tcp_tc_list_lock); 121 spin_lock(&rds_tcp_tc_list_lock);
113 list_del_init(&tc->t_list_item); 122 list_del_init(&tc->t_list_item);
114 rds_tcp_tc_count--; 123#if IS_ENABLED(CONFIG_IPV6)
124 rds6_tcp_tc_count--;
125#endif
126 if (!tc->t_cpath->cp_conn->c_isv6)
127 rds_tcp_tc_count--;
115 spin_unlock(&rds_tcp_tc_list_lock); 128 spin_unlock(&rds_tcp_tc_list_lock);
116 129
117 tc->t_sock = NULL; 130 tc->t_sock = NULL;
@@ -198,7 +211,11 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
198 /* done under the callback_lock to serialize with write_space */ 211 /* done under the callback_lock to serialize with write_space */
199 spin_lock(&rds_tcp_tc_list_lock); 212 spin_lock(&rds_tcp_tc_list_lock);
200 list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); 213 list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
201 rds_tcp_tc_count++; 214#if IS_ENABLED(CONFIG_IPV6)
215 rds6_tcp_tc_count++;
216#endif
217 if (!tc->t_cpath->cp_conn->c_isv6)
218 rds_tcp_tc_count++;
202 spin_unlock(&rds_tcp_tc_list_lock); 219 spin_unlock(&rds_tcp_tc_list_lock);
203 220
204 /* accepted sockets need our listen data ready undone */ 221 /* accepted sockets need our listen data ready undone */
@@ -219,6 +236,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
219 write_unlock_bh(&sock->sk->sk_callback_lock); 236 write_unlock_bh(&sock->sk->sk_callback_lock);
220} 237}
221 238
239/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4
240 * connections for backward compatibility.
241 */
222static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, 242static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
223 struct rds_info_iterator *iter, 243 struct rds_info_iterator *iter,
224 struct rds_info_lengths *lens) 244 struct rds_info_lengths *lens)
@@ -226,8 +246,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
226 struct rds_info_tcp_socket tsinfo; 246 struct rds_info_tcp_socket tsinfo;
227 struct rds_tcp_connection *tc; 247 struct rds_tcp_connection *tc;
228 unsigned long flags; 248 unsigned long flags;
229 struct sockaddr_in sin;
230 struct socket *sock;
231 249
232 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); 250 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
233 251
@@ -235,16 +253,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
235 goto out; 253 goto out;
236 254
237 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { 255 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
256 struct inet_sock *inet = inet_sk(tc->t_sock->sk);
238 257
239 sock = tc->t_sock; 258 if (tc->t_cpath->cp_conn->c_isv6)
240 if (sock) { 259 continue;
241 sock->ops->getname(sock, (struct sockaddr *)&sin, 0); 260
242 tsinfo.local_addr = sin.sin_addr.s_addr; 261 tsinfo.local_addr = inet->inet_saddr;
243 tsinfo.local_port = sin.sin_port; 262 tsinfo.local_port = inet->inet_sport;
244 sock->ops->getname(sock, (struct sockaddr *)&sin, 1); 263 tsinfo.peer_addr = inet->inet_daddr;
245 tsinfo.peer_addr = sin.sin_addr.s_addr; 264 tsinfo.peer_port = inet->inet_dport;
246 tsinfo.peer_port = sin.sin_port;
247 }
248 265
249 tsinfo.hdr_rem = tc->t_tinc_hdr_rem; 266 tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
250 tsinfo.data_rem = tc->t_tinc_data_rem; 267 tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -262,10 +279,82 @@ out:
262 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); 279 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
263} 280}
264 281
265static int rds_tcp_laddr_check(struct net *net, __be32 addr) 282#if IS_ENABLED(CONFIG_IPV6)
283/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
284 * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped
285 * address.
286 */
287static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
288 struct rds_info_iterator *iter,
289 struct rds_info_lengths *lens)
266{ 290{
267 if (inet_addr_type(net, addr) == RTN_LOCAL) 291 struct rds6_info_tcp_socket tsinfo6;
292 struct rds_tcp_connection *tc;
293 unsigned long flags;
294
295 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
296
297 if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
298 goto out;
299
300 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
301 struct sock *sk = tc->t_sock->sk;
302 struct inet_sock *inet = inet_sk(sk);
303
304 tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
305 tsinfo6.local_port = inet->inet_sport;
306 tsinfo6.peer_addr = sk->sk_v6_daddr;
307 tsinfo6.peer_port = inet->inet_dport;
308
309 tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
310 tsinfo6.data_rem = tc->t_tinc_data_rem;
311 tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
312 tsinfo6.last_expected_una = tc->t_last_expected_una;
313 tsinfo6.last_seen_una = tc->t_last_seen_una;
314
315 rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
316 }
317
318out:
319 lens->nr = rds6_tcp_tc_count;
320 lens->each = sizeof(tsinfo6);
321
322 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
323}
324#endif
325
326static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
327 __u32 scope_id)
328{
329 struct net_device *dev = NULL;
330#if IS_ENABLED(CONFIG_IPV6)
331 int ret;
332#endif
333
334 if (ipv6_addr_v4mapped(addr)) {
335 if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
336 return 0;
337 return -EADDRNOTAVAIL;
338 }
339
340 /* If the scope_id is specified, check only those addresses
341 * hosted on the specified interface.
342 */
343 if (scope_id != 0) {
344 rcu_read_lock();
345 dev = dev_get_by_index_rcu(net, scope_id);
346 /* scope_id is not valid... */
347 if (!dev) {
348 rcu_read_unlock();
349 return -EADDRNOTAVAIL;
350 }
351 rcu_read_unlock();
352 }
353#if IS_ENABLED(CONFIG_IPV6)
354 ret = ipv6_chk_addr(net, addr, dev, 0);
355 if (ret)
268 return 0; 356 return 0;
357#endif
269 return -EADDRNOTAVAIL; 358 return -EADDRNOTAVAIL;
270} 359}
271 360
@@ -468,13 +557,27 @@ static __net_init int rds_tcp_init_net(struct net *net)
468 err = -ENOMEM; 557 err = -ENOMEM;
469 goto fail; 558 goto fail;
470 } 559 }
471 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); 560
561#if IS_ENABLED(CONFIG_IPV6)
562 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
563#else
564 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
565#endif
472 if (!rtn->rds_tcp_listen_sock) { 566 if (!rtn->rds_tcp_listen_sock) {
473 pr_warn("could not set up listen sock\n"); 567 pr_warn("could not set up IPv6 listen sock\n");
474 unregister_net_sysctl_table(rtn->rds_tcp_sysctl); 568
475 rtn->rds_tcp_sysctl = NULL; 569#if IS_ENABLED(CONFIG_IPV6)
476 err = -EAFNOSUPPORT; 570 /* Try IPv4 as some systems disable IPv6 */
477 goto fail; 571 rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
572 if (!rtn->rds_tcp_listen_sock) {
573#endif
574 unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
575 rtn->rds_tcp_sysctl = NULL;
576 err = -EAFNOSUPPORT;
577 goto fail;
578#if IS_ENABLED(CONFIG_IPV6)
579 }
580#endif
478 } 581 }
479 INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); 582 INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
480 return 0; 583 return 0;
@@ -588,6 +691,9 @@ static void rds_tcp_exit(void)
588 rds_tcp_set_unloading(); 691 rds_tcp_set_unloading();
589 synchronize_rcu(); 692 synchronize_rcu();
590 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); 693 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
694#if IS_ENABLED(CONFIG_IPV6)
695 rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
696#endif
591 unregister_pernet_device(&rds_tcp_net_ops); 697 unregister_pernet_device(&rds_tcp_net_ops);
592 rds_tcp_destroy_conns(); 698 rds_tcp_destroy_conns();
593 rds_trans_unregister(&rds_tcp_transport); 699 rds_trans_unregister(&rds_tcp_transport);
@@ -619,6 +725,9 @@ static int rds_tcp_init(void)
619 rds_trans_register(&rds_tcp_transport); 725 rds_trans_register(&rds_tcp_transport);
620 726
621 rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); 727 rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
728#if IS_ENABLED(CONFIG_IPV6)
729 rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
730#endif
622 731
623 goto out; 732 goto out;
624out_recv: 733out_recv:
@@ -633,4 +742,3 @@ module_init(rds_tcp_init);
633MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 742MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
634MODULE_DESCRIPTION("RDS: TCP transport"); 743MODULE_DESCRIPTION("RDS: TCP transport");
635MODULE_LICENSE("Dual BSD/GPL"); 744MODULE_LICENSE("Dual BSD/GPL");
636
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index c6fa080e9b6d..3c69361d21c7 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -67,7 +67,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
67void rds_tcp_state_change(struct sock *sk); 67void rds_tcp_state_change(struct sock *sk);
68 68
69/* tcp_listen.c */ 69/* tcp_listen.c */
70struct socket *rds_tcp_listen_init(struct net *); 70struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
71void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); 71void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
72void rds_tcp_listen_data_ready(struct sock *sk); 72void rds_tcp_listen_data_ready(struct sock *sk);
73int rds_tcp_accept_one(struct socket *sock); 73int rds_tcp_accept_one(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index d999e7075645..008f50fb25dd 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk)
66 * RDS connection as RDS_CONN_UP until the reconnect, 66 * RDS connection as RDS_CONN_UP until the reconnect,
67 * to avoid RDS datagram loss. 67 * to avoid RDS datagram loss.
68 */ 68 */
69 if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && 69 if (rds_addr_cmp(&cp->cp_conn->c_laddr,
70 &cp->cp_conn->c_faddr) >= 0 &&
70 rds_conn_path_transition(cp, RDS_CONN_CONNECTING, 71 rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
71 RDS_CONN_ERROR)) { 72 RDS_CONN_ERROR)) {
72 rds_conn_path_drop(cp, false); 73 rds_conn_path_drop(cp, false);
@@ -88,7 +89,11 @@ out:
88int rds_tcp_conn_path_connect(struct rds_conn_path *cp) 89int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
89{ 90{
90 struct socket *sock = NULL; 91 struct socket *sock = NULL;
91 struct sockaddr_in src, dest; 92 struct sockaddr_in6 sin6;
93 struct sockaddr_in sin;
94 struct sockaddr *addr;
95 int addrlen;
96 bool isv6;
92 int ret; 97 int ret;
93 struct rds_connection *conn = cp->cp_conn; 98 struct rds_connection *conn = cp->cp_conn;
94 struct rds_tcp_connection *tc = cp->cp_transport_data; 99 struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -105,37 +110,68 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
105 mutex_unlock(&tc->t_conn_path_lock); 110 mutex_unlock(&tc->t_conn_path_lock);
106 return 0; 111 return 0;
107 } 112 }
108 ret = sock_create_kern(rds_conn_net(conn), PF_INET, 113 if (ipv6_addr_v4mapped(&conn->c_laddr)) {
109 SOCK_STREAM, IPPROTO_TCP, &sock); 114 ret = sock_create_kern(rds_conn_net(conn), PF_INET,
115 SOCK_STREAM, IPPROTO_TCP, &sock);
116 isv6 = false;
117 } else {
118 ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
119 SOCK_STREAM, IPPROTO_TCP, &sock);
120 isv6 = true;
121 }
122
110 if (ret < 0) 123 if (ret < 0)
111 goto out; 124 goto out;
112 125
113 rds_tcp_tune(sock); 126 rds_tcp_tune(sock);
114 127
115 src.sin_family = AF_INET; 128 if (isv6) {
116 src.sin_addr.s_addr = (__force u32)conn->c_laddr; 129 sin6.sin6_family = AF_INET6;
117 src.sin_port = (__force u16)htons(0); 130 sin6.sin6_addr = conn->c_laddr;
131 sin6.sin6_port = 0;
132 sin6.sin6_flowinfo = 0;
133 sin6.sin6_scope_id = conn->c_dev_if;
134 addr = (struct sockaddr *)&sin6;
135 addrlen = sizeof(sin6);
136 } else {
137 sin.sin_family = AF_INET;
138 sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
139 sin.sin_port = 0;
140 addr = (struct sockaddr *)&sin;
141 addrlen = sizeof(sin);
142 }
118 143
119 ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); 144 ret = sock->ops->bind(sock, addr, addrlen);
120 if (ret) { 145 if (ret) {
121 rdsdebug("bind failed with %d at address %pI4\n", 146 rdsdebug("bind failed with %d at address %pI6c\n",
122 ret, &conn->c_laddr); 147 ret, &conn->c_laddr);
123 goto out; 148 goto out;
124 } 149 }
125 150
126 dest.sin_family = AF_INET; 151 if (isv6) {
127 dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 152 sin6.sin6_family = AF_INET6;
128 dest.sin_port = (__force u16)htons(RDS_TCP_PORT); 153 sin6.sin6_addr = conn->c_faddr;
154 sin6.sin6_port = htons(RDS_TCP_PORT);
155 sin6.sin6_flowinfo = 0;
156 sin6.sin6_scope_id = conn->c_dev_if;
157 addr = (struct sockaddr *)&sin6;
158 addrlen = sizeof(sin6);
159 } else {
160 sin.sin_family = AF_INET;
161 sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
162 sin.sin_port = htons(RDS_TCP_PORT);
163 addr = (struct sockaddr *)&sin;
164 addrlen = sizeof(sin);
165 }
129 166
130 /* 167 /*
131 * once we call connect() we can start getting callbacks and they 168 * once we call connect() we can start getting callbacks and they
132 * own the socket 169 * own the socket
133 */ 170 */
134 rds_tcp_set_callbacks(sock, cp); 171 rds_tcp_set_callbacks(sock, cp);
135 ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), 172 ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK);
136 O_NONBLOCK);
137 173
138 rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); 174 rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
139 if (ret == -EINPROGRESS) 175 if (ret == -EINPROGRESS)
140 ret = 0; 176 ret = 0;
141 if (ret == 0) { 177 if (ret == 0) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 22571189f21e..c12203f646da 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006, 2018 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -83,13 +83,12 @@ static
83struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) 83struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
84{ 84{
85 int i; 85 int i;
86 bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr);
87 int npaths = max_t(int, 1, conn->c_npaths); 86 int npaths = max_t(int, 1, conn->c_npaths);
88 87
89 /* for mprds, all paths MUST be initiated by the peer 88 /* for mprds, all paths MUST be initiated by the peer
90 * with the smaller address. 89 * with the smaller address.
91 */ 90 */
92 if (!peer_is_smaller) { 91 if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
93 /* Make sure we initiate at least one path if this 92 /* Make sure we initiate at least one path if this
94 * has not already been done; rds_start_mprds() will 93 * has not already been done; rds_start_mprds() will
95 * take care of additional paths, if necessary. 94 * take care of additional paths, if necessary.
@@ -132,6 +131,11 @@ int rds_tcp_accept_one(struct socket *sock)
132 struct rds_tcp_connection *rs_tcp = NULL; 131 struct rds_tcp_connection *rs_tcp = NULL;
133 int conn_state; 132 int conn_state;
134 struct rds_conn_path *cp; 133 struct rds_conn_path *cp;
134 struct in6_addr *my_addr, *peer_addr;
135#if !IS_ENABLED(CONFIG_IPV6)
136 struct in6_addr saddr, daddr;
137#endif
138 int dev_if = 0;
135 139
136 if (!sock) /* module unload or netns delete in progress */ 140 if (!sock) /* module unload or netns delete in progress */
137 return -ENETUNREACH; 141 return -ENETUNREACH;
@@ -164,13 +168,40 @@ int rds_tcp_accept_one(struct socket *sock)
164 168
165 inet = inet_sk(new_sock->sk); 169 inet = inet_sk(new_sock->sk);
166 170
167 rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", 171#if IS_ENABLED(CONFIG_IPV6)
168 &inet->inet_saddr, ntohs(inet->inet_sport), 172 my_addr = &new_sock->sk->sk_v6_rcv_saddr;
169 &inet->inet_daddr, ntohs(inet->inet_dport)); 173 peer_addr = &new_sock->sk->sk_v6_daddr;
174#else
175 ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr);
176 ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr);
177 my_addr = &saddr;
178 peer_addr = &daddr;
179#endif
180 rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
181 sock->sk->sk_family,
182 my_addr, ntohs(inet->inet_sport),
183 peer_addr, ntohs(inet->inet_dport));
184
185#if IS_ENABLED(CONFIG_IPV6)
186 /* sk_bound_dev_if is not set if the peer address is not link local
187 * address. In this case, it happens that mcast_oif is set. So
188 * just use it.
189 */
190 if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) &&
191 !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) {
192 struct ipv6_pinfo *inet6;
193
194 inet6 = inet6_sk(new_sock->sk);
195 dev_if = inet6->mcast_oif;
196 } else {
197 dev_if = new_sock->sk->sk_bound_dev_if;
198 }
199#endif
170 200
171 conn = rds_conn_create(sock_net(sock->sk), 201 conn = rds_conn_create(sock_net(sock->sk),
172 inet->inet_saddr, inet->inet_daddr, 202 my_addr, peer_addr,
173 &rds_tcp_transport, GFP_KERNEL); 203 &rds_tcp_transport, GFP_KERNEL, dev_if);
204
174 if (IS_ERR(conn)) { 205 if (IS_ERR(conn)) {
175 ret = PTR_ERR(conn); 206 ret = PTR_ERR(conn);
176 goto out; 207 goto out;
@@ -254,15 +285,22 @@ out:
254 ready(sk); 285 ready(sk);
255} 286}
256 287
257struct socket *rds_tcp_listen_init(struct net *net) 288struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
258{ 289{
259 struct sockaddr_in sin;
260 struct socket *sock = NULL; 290 struct socket *sock = NULL;
291 struct sockaddr_storage ss;
292 struct sockaddr_in6 *sin6;
293 struct sockaddr_in *sin;
294 int addr_len;
261 int ret; 295 int ret;
262 296
263 ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 297 ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
264 if (ret < 0) 298 IPPROTO_TCP, &sock);
299 if (ret < 0) {
300 rdsdebug("could not create %s listener socket: %d\n",
301 isv6 ? "IPv6" : "IPv4", ret);
265 goto out; 302 goto out;
303 }
266 304
267 sock->sk->sk_reuse = SK_CAN_REUSE; 305 sock->sk->sk_reuse = SK_CAN_REUSE;
268 rds_tcp_nonagle(sock); 306 rds_tcp_nonagle(sock);
@@ -272,13 +310,28 @@ struct socket *rds_tcp_listen_init(struct net *net)
272 sock->sk->sk_data_ready = rds_tcp_listen_data_ready; 310 sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
273 write_unlock_bh(&sock->sk->sk_callback_lock); 311 write_unlock_bh(&sock->sk->sk_callback_lock);
274 312
275 sin.sin_family = PF_INET; 313 if (isv6) {
276 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); 314 sin6 = (struct sockaddr_in6 *)&ss;
277 sin.sin_port = (__force u16)htons(RDS_TCP_PORT); 315 sin6->sin6_family = PF_INET6;
316 sin6->sin6_addr = in6addr_any;
317 sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
318 sin6->sin6_scope_id = 0;
319 sin6->sin6_flowinfo = 0;
320 addr_len = sizeof(*sin6);
321 } else {
322 sin = (struct sockaddr_in *)&ss;
323 sin->sin_family = PF_INET;
324 sin->sin_addr.s_addr = INADDR_ANY;
325 sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
326 addr_len = sizeof(*sin);
327 }
278 328
279 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 329 ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len);
280 if (ret < 0) 330 if (ret < 0) {
331 rdsdebug("could not bind %s listener socket: %d\n",
332 isv6 ? "IPv6" : "IPv4", ret);
281 goto out; 333 goto out;
334 }
282 335
283 ret = sock->ops->listen(sock, 64); 336 ret = sock->ops->listen(sock, 64);
284 if (ret < 0) 337 if (ret < 0)
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index b9fbd2ee74ef..42c5ff1eda95 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
179 tc->t_tinc = tinc; 179 tc->t_tinc = tinc;
180 rdsdebug("alloced tinc %p\n", tinc); 180 rdsdebug("alloced tinc %p\n", tinc);
181 rds_inc_path_init(&tinc->ti_inc, cp, 181 rds_inc_path_init(&tinc->ti_inc, cp,
182 cp->cp_conn->c_faddr); 182 &cp->cp_conn->c_faddr);
183 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = 183 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
184 local_clock(); 184 local_clock();
185 185
@@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
239 if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 239 if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
240 rds_tcp_cong_recv(conn, tinc); 240 rds_tcp_cong_recv(conn, tinc);
241 else 241 else
242 rds_recv_incoming(conn, conn->c_faddr, 242 rds_recv_incoming(conn, &conn->c_faddr,
243 conn->c_laddr, &tinc->ti_inc, 243 &conn->c_laddr,
244 &tinc->ti_inc,
244 arg->gfp); 245 arg->gfp);
245 246
246 tc->t_tinc_hdr_rem = sizeof(struct rds_header); 247 tc->t_tinc_hdr_rem = sizeof(struct rds_header);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7df869d37afd..78a2554a4497 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -153,7 +153,7 @@ out:
153 * an incoming RST. 153 * an incoming RST.
154 */ 154 */
155 if (rds_conn_path_up(cp)) { 155 if (rds_conn_path_up(cp)) {
156 pr_warn("RDS/tcp: send to %pI4 on cp [%d]" 156 pr_warn("RDS/tcp: send to %pI6c on cp [%d]"
157 "returned %d, " 157 "returned %d, "
158 "disconnecting and reconnecting\n", 158 "disconnecting and reconnecting\n",
159 &conn->c_faddr, cp->cp_index, ret); 159 &conn->c_faddr, cp->cp_index, ret);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index c52861d77a59..e64f9e4c3cda 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
82 return; 82 return;
83 } 83 }
84 84
85 rdsdebug("conn %p for %pI4 to %pI4 complete\n", 85 rdsdebug("conn %p for %pI6c to %pI6c complete\n",
86 cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); 86 cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
87 87
88 cp->cp_reconnect_jiffies = 0; 88 cp->cp_reconnect_jiffies = 0;
89 set_bit(0, &cp->cp_conn->c_map_queued); 89 set_bit(0, &cp->cp_conn->c_map_queued);
@@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
125 unsigned long rand; 125 unsigned long rand;
126 struct rds_connection *conn = cp->cp_conn; 126 struct rds_connection *conn = cp->cp_conn;
127 127
128 rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", 128 rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
129 conn, &conn->c_laddr, &conn->c_faddr, 129 conn, &conn->c_laddr, &conn->c_faddr,
130 cp->cp_reconnect_jiffies); 130 cp->cp_reconnect_jiffies);
131 131
132 /* let peer with smaller addr initiate reconnect, to avoid duels */ 132 /* let peer with smaller addr initiate reconnect, to avoid duels */
133 if (conn->c_trans->t_type == RDS_TRANS_TCP && 133 if (conn->c_trans->t_type == RDS_TRANS_TCP &&
134 !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) 134 rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
135 return; 135 return;
136 136
137 set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 137 set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
@@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
145 } 145 }
146 146
147 get_random_bytes(&rand, sizeof(rand)); 147 get_random_bytes(&rand, sizeof(rand));
148 rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", 148 rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
149 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, 149 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
150 conn, &conn->c_laddr, &conn->c_faddr); 150 conn, &conn->c_laddr, &conn->c_faddr);
151 rcu_read_lock(); 151 rcu_read_lock();
@@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work)
167 int ret; 167 int ret;
168 168
169 if (cp->cp_index > 0 && 169 if (cp->cp_index > 0 &&
170 !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) 170 rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
171 return; 171 return;
172 clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 172 clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
173 ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); 173 ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
174 if (ret) { 174 if (ret) {
175 ret = conn->c_trans->conn_path_connect(cp); 175 ret = conn->c_trans->conn_path_connect(cp);
176 rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", 176 rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
177 conn, &conn->c_laddr, &conn->c_faddr, ret); 177 conn, &conn->c_laddr, &conn->c_faddr, ret);
178 178
179 if (ret) { 179 if (ret) {
180 if (rds_conn_path_transition(cp, 180 if (rds_conn_path_transition(cp,
@@ -259,3 +259,50 @@ int rds_threads_init(void)
259 259
260 return 0; 260 return 0;
261} 261}
262
263/* Compare two IPv6 addresses. Return 0 if the two addresses are equal.
264 * Return 1 if the first is greater. Return -1 if the second is greater.
265 */
266int rds_addr_cmp(const struct in6_addr *addr1,
267 const struct in6_addr *addr2)
268{
269#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
270 const __be64 *a1, *a2;
271 u64 x, y;
272
273 a1 = (__be64 *)addr1;
274 a2 = (__be64 *)addr2;
275
276 if (*a1 != *a2) {
277 if (be64_to_cpu(*a1) < be64_to_cpu(*a2))
278 return -1;
279 else
280 return 1;
281 } else {
282 x = be64_to_cpu(*++a1);
283 y = be64_to_cpu(*++a2);
284 if (x < y)
285 return -1;
286 else if (x > y)
287 return 1;
288 else
289 return 0;
290 }
291#else
292 u32 a, b;
293 int i;
294
295 for (i = 0; i < 4; i++) {
296 if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) {
297 a = ntohl(addr1->s6_addr32[i]);
298 b = ntohl(addr2->s6_addr32[i]);
299 if (a < b)
300 return -1;
301 else if (a > b)
302 return 1;
303 }
304 }
305 return 0;
306#endif
307}
308EXPORT_SYMBOL_GPL(rds_addr_cmp);
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 0b188dd0a344..46f709a4b577 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2006 Oracle. All rights reserved. 2 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
3 * 3 *
4 * This software is available to you under a choice of one of two 4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU 5 * licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/module.h> 34#include <linux/module.h>
35#include <linux/in.h> 35#include <linux/in.h>
36#include <linux/ipv6.h>
36 37
37#include "rds.h" 38#include "rds.h"
38#include "loop.h" 39#include "loop.h"
@@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans)
75 module_put(trans->t_owner); 76 module_put(trans->t_owner);
76} 77}
77 78
78struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) 79struct rds_transport *rds_trans_get_preferred(struct net *net,
80 const struct in6_addr *addr,
81 __u32 scope_id)
79{ 82{
80 struct rds_transport *ret = NULL; 83 struct rds_transport *ret = NULL;
81 struct rds_transport *trans; 84 struct rds_transport *trans;
82 unsigned int i; 85 unsigned int i;
83 86
84 if (IN_LOOPBACK(ntohl(addr))) 87 if (ipv6_addr_v4mapped(addr)) {
88 if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET)
89 return &rds_loop_transport;
90 } else if (ipv6_addr_loopback(addr)) {
85 return &rds_loop_transport; 91 return &rds_loop_transport;
92 }
86 93
87 down_read(&rds_trans_sem); 94 down_read(&rds_trans_sem);
88 for (i = 0; i < RDS_TRANS_COUNT; i++) { 95 for (i = 0; i < RDS_TRANS_COUNT; i++) {
89 trans = transports[i]; 96 trans = transports[i];
90 97
91 if (trans && (trans->laddr_check(net, addr) == 0) && 98 if (trans && (trans->laddr_check(net, addr, scope_id) == 0) &&
92 (!trans->t_owner || try_module_get(trans->t_owner))) { 99 (!trans->t_owner || try_module_get(trans->t_owner))) {
93 ret = trans; 100 ret = trans;
94 break; 101 break;
@@ -152,4 +159,3 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
152 159
153 return total; 160 return total;
154} 161}
155