author     Ingo Molnar <mingo@elte.hu>  2009-03-30 17:53:32 -0400
committer  Ingo Molnar <mingo@elte.hu>  2009-03-30 17:53:32 -0400
commit     65fb0d23fcddd8697c871047b700c78817bdaa43 (patch)
tree       119e6e5f276622c4c862f6c9b6d795264ba1603a /net/rds
parent     8c083f081d0014057901c68a0a3e0f8ca7ac8d23 (diff)
parent     dfbbe89e197a77f2c8046a51c74e33e35f878080 (diff)
Merge branch 'linus' into cpumask-for-linus
Conflicts: arch/x86/kernel/cpu/common.c
Diffstat (limited to 'net/rds')
-rw-r--r-- | net/rds/Kconfig | 14
-rw-r--r-- | net/rds/Makefile | 14
-rw-r--r-- | net/rds/af_rds.c | 586
-rw-r--r-- | net/rds/bind.c | 199
-rw-r--r-- | net/rds/cong.c | 404
-rw-r--r-- | net/rds/connection.c | 487
-rw-r--r-- | net/rds/ib.c | 323
-rw-r--r-- | net/rds/ib.h | 367
-rw-r--r-- | net/rds/ib_cm.c | 726
-rw-r--r-- | net/rds/ib_rdma.c | 641
-rw-r--r-- | net/rds/ib_recv.c | 869
-rw-r--r-- | net/rds/ib_ring.c | 168
-rw-r--r-- | net/rds/ib_send.c | 874
-rw-r--r-- | net/rds/ib_stats.c | 95
-rw-r--r-- | net/rds/ib_sysctl.c | 137
-rw-r--r-- | net/rds/info.c | 241
-rw-r--r-- | net/rds/info.h | 30
-rw-r--r-- | net/rds/iw.c | 333
-rw-r--r-- | net/rds/iw.h | 395
-rw-r--r-- | net/rds/iw_cm.c | 750
-rw-r--r-- | net/rds/iw_rdma.c | 888
-rw-r--r-- | net/rds/iw_recv.c | 869
-rw-r--r-- | net/rds/iw_ring.c | 169
-rw-r--r-- | net/rds/iw_send.c | 975
-rw-r--r-- | net/rds/iw_stats.c | 95
-rw-r--r-- | net/rds/iw_sysctl.c | 137
-rw-r--r-- | net/rds/loop.c | 188
-rw-r--r-- | net/rds/loop.h | 9
-rw-r--r-- | net/rds/message.c | 402
-rw-r--r-- | net/rds/page.c | 221
-rw-r--r-- | net/rds/rdma.c | 679
-rw-r--r-- | net/rds/rdma.h | 84
-rw-r--r-- | net/rds/rdma_transport.c | 214
-rw-r--r-- | net/rds/rdma_transport.h | 28
-rw-r--r-- | net/rds/rds.h | 686
-rw-r--r-- | net/rds/recv.c | 542
-rw-r--r-- | net/rds/send.c | 1003
-rw-r--r-- | net/rds/stats.c | 148
-rw-r--r-- | net/rds/sysctl.c | 122
-rw-r--r-- | net/rds/threads.c | 265
-rw-r--r-- | net/rds/transport.c | 117
41 files changed, 15494 insertions, 0 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 000000000000..796773b5df9b
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,14 @@
1
2config RDS
3 tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
4 depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
5 depends on INFINIBAND && INFINIBAND_ADDR_TRANS
6 ---help---
7 RDS provides reliable, sequenced delivery of datagrams
8 over Infiniband.
9
10config RDS_DEBUG
11 bool "Debugging messages"
12 depends on RDS
13 default n
14
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 000000000000..51f27585fa08
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,14 @@
1obj-$(CONFIG_RDS) += rds.o
2rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
3 recv.o send.o stats.o sysctl.o threads.o transport.o \
4 loop.o page.o rdma.o \
5 rdma_transport.o \
6 ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
7 ib_sysctl.o ib_rdma.o \
8 iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
9 iw_sysctl.o iw_rdma.o
10
11ifeq ($(CONFIG_RDS_DEBUG), y)
12EXTRA_CFLAGS += -DDEBUG
13endif
14
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 000000000000..20cf16fc572f
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/module.h>
34#include <linux/errno.h>
35#include <linux/kernel.h>
36#include <linux/in.h>
37#include <linux/poll.h>
38#include <linux/version.h>
39#include <net/sock.h>
40
41#include "rds.h"
42#include "rdma.h"
43#include "rdma_transport.h"
44
45/* this is just used for stats gathering :/ */
46static DEFINE_SPINLOCK(rds_sock_lock);
47static unsigned long rds_sock_count;
48static LIST_HEAD(rds_sock_list);
49DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
50
51/*
52 * This is called as the final descriptor referencing this socket is closed.
53 * We have to unbind the socket so that another socket can be bound to the
54 * address it was using.
55 *
56 * We have to be careful about racing with the incoming path. sock_orphan()
57 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
58 * messages shouldn't be queued.
59 */
60static int rds_release(struct socket *sock)
61{
62 struct sock *sk = sock->sk;
63 struct rds_sock *rs;
64 unsigned long flags;
65
66 if (sk == NULL)
67 goto out;
68
69 rs = rds_sk_to_rs(sk);
70
71 sock_orphan(sk);
72 /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
73 * that ensures the recv path has completed messing
74 * with the socket. */
75 rds_clear_recv_queue(rs);
76 rds_cong_remove_socket(rs);
77 rds_remove_bound(rs);
78 rds_send_drop_to(rs, NULL);
79 rds_rdma_drop_keys(rs);
80 rds_notify_queue_get(rs, NULL);
81
82 spin_lock_irqsave(&rds_sock_lock, flags);
83 list_del_init(&rs->rs_item);
84 rds_sock_count--;
85 spin_unlock_irqrestore(&rds_sock_lock, flags);
86
87 sock->sk = NULL;
88 sock_put(sk);
89out:
90 return 0;
91}
92
93/*
94 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
95 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
96 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
97 * this seems more conservative.
98 * NB - normally, one would use sk_callback_lock for this, but we can
99 * get here from interrupts, whereas the network code grabs sk_callback_lock
100 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
101 */
102void rds_wake_sk_sleep(struct rds_sock *rs)
103{
104 unsigned long flags;
105
106 read_lock_irqsave(&rs->rs_recv_lock, flags);
107 __rds_wake_sk_sleep(rds_rs_to_sk(rs));
108 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
109}
110
111static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
112 int *uaddr_len, int peer)
113{
114 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
115 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
116
117 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
118
119 /* racey, don't care */
120 if (peer) {
121 if (!rs->rs_conn_addr)
122 return -ENOTCONN;
123
124 sin->sin_port = rs->rs_conn_port;
125 sin->sin_addr.s_addr = rs->rs_conn_addr;
126 } else {
127 sin->sin_port = rs->rs_bound_port;
128 sin->sin_addr.s_addr = rs->rs_bound_addr;
129 }
130
131 sin->sin_family = AF_INET;
132
133 *uaddr_len = sizeof(*sin);
134 return 0;
135}
136
137/*
138 * RDS' poll is without a doubt the least intuitive part of the interface,
139 * as POLLIN and POLLOUT do not behave entirely as you would expect from
140 * a network protocol.
141 *
142 * POLLIN is asserted if
143 * - there is data on the receive queue.
144 * - to signal that a previously congested destination may have become
145 * uncongested
146 * - A notification has been queued to the socket (this can be a congestion
147 * update, or a RDMA completion).
148 *
149 * POLLOUT is asserted if there is room on the send queue. This does not mean
150 * however, that the next sendmsg() call will succeed. If the application tries
151 * to send to a congested destination, the system call may still fail (and
152 * return ENOBUFS).
153 */
154static unsigned int rds_poll(struct file *file, struct socket *sock,
155 poll_table *wait)
156{
157 struct sock *sk = sock->sk;
158 struct rds_sock *rs = rds_sk_to_rs(sk);
159 unsigned int mask = 0;
160 unsigned long flags;
161
162 poll_wait(file, sk->sk_sleep, wait);
163
164 poll_wait(file, &rds_poll_waitq, wait);
165
166 read_lock_irqsave(&rs->rs_recv_lock, flags);
167 if (!rs->rs_cong_monitor) {
168 /* When a congestion map was updated, we signal POLLIN for
169 * "historical" reasons. Applications can also poll for
170 * WRBAND instead. */
171 if (rds_cong_updated_since(&rs->rs_cong_track))
172 mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
173 } else {
174 spin_lock(&rs->rs_lock);
175 if (rs->rs_cong_notify)
176 mask |= (POLLIN | POLLRDNORM);
177 spin_unlock(&rs->rs_lock);
178 }
179 if (!list_empty(&rs->rs_recv_queue)
180 || !list_empty(&rs->rs_notify_queue))
181 mask |= (POLLIN | POLLRDNORM);
182 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
183 mask |= (POLLOUT | POLLWRNORM);
184 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
185
186 return mask;
187}
188
189static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
190{
191 return -ENOIOCTLCMD;
192}
193
194static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
195 int len)
196{
197 struct sockaddr_in sin;
198 int ret = 0;
199
200 /* racing with another thread binding seems ok here */
201 if (rs->rs_bound_addr == 0) {
202 ret = -ENOTCONN; /* XXX not a great errno */
203 goto out;
204 }
205
206 if (len < sizeof(struct sockaddr_in)) {
207 ret = -EINVAL;
208 goto out;
209 }
210
211 if (copy_from_user(&sin, optval, sizeof(sin))) {
212 ret = -EFAULT;
213 goto out;
214 }
215
216 rds_send_drop_to(rs, &sin);
217out:
218 return ret;
219}
220
221static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
222 int optlen)
223{
224 int value;
225
226 if (optlen < sizeof(int))
227 return -EINVAL;
228 if (get_user(value, (int __user *) optval))
229 return -EFAULT;
230 *optvar = !!value;
231 return 0;
232}
233
234static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
235 int optlen)
236{
237 int ret;
238
239 ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
240 if (ret == 0) {
241 if (rs->rs_cong_monitor) {
242 rds_cong_add_socket(rs);
243 } else {
244 rds_cong_remove_socket(rs);
245 rs->rs_cong_mask = 0;
246 rs->rs_cong_notify = 0;
247 }
248 }
249 return ret;
250}
251
252static int rds_setsockopt(struct socket *sock, int level, int optname,
253 char __user *optval, int optlen)
254{
255 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
256 int ret;
257
258 if (level != SOL_RDS) {
259 ret = -ENOPROTOOPT;
260 goto out;
261 }
262
263 switch (optname) {
264 case RDS_CANCEL_SENT_TO:
265 ret = rds_cancel_sent_to(rs, optval, optlen);
266 break;
267 case RDS_GET_MR:
268 ret = rds_get_mr(rs, optval, optlen);
269 break;
270 case RDS_FREE_MR:
271 ret = rds_free_mr(rs, optval, optlen);
272 break;
273 case RDS_RECVERR:
274 ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
275 break;
276 case RDS_CONG_MONITOR:
277 ret = rds_cong_monitor(rs, optval, optlen);
278 break;
279 default:
280 ret = -ENOPROTOOPT;
281 }
282out:
283 return ret;
284}
285
286static int rds_getsockopt(struct socket *sock, int level, int optname,
287 char __user *optval, int __user *optlen)
288{
289 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
290 int ret = -ENOPROTOOPT, len;
291
292 if (level != SOL_RDS)
293 goto out;
294
295 if (get_user(len, optlen)) {
296 ret = -EFAULT;
297 goto out;
298 }
299
300 switch (optname) {
301 case RDS_INFO_FIRST ... RDS_INFO_LAST:
302 ret = rds_info_getsockopt(sock, optname, optval,
303 optlen);
304 break;
305
306 case RDS_RECVERR:
307 if (len < sizeof(int))
308 ret = -EINVAL;
309 else
310 if (put_user(rs->rs_recverr, (int __user *) optval)
311 || put_user(sizeof(int), optlen))
312 ret = -EFAULT;
313 else
314 ret = 0;
315 break;
316 default:
317 break;
318 }
319
320out:
321 return ret;
322
323}
324
325static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
326 int addr_len, int flags)
327{
328 struct sock *sk = sock->sk;
329 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
330 struct rds_sock *rs = rds_sk_to_rs(sk);
331 int ret = 0;
332
333 lock_sock(sk);
334
335 if (addr_len != sizeof(struct sockaddr_in)) {
336 ret = -EINVAL;
337 goto out;
338 }
339
340 if (sin->sin_family != AF_INET) {
341 ret = -EAFNOSUPPORT;
342 goto out;
343 }
344
345 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
346 ret = -EDESTADDRREQ;
347 goto out;
348 }
349
350 rs->rs_conn_addr = sin->sin_addr.s_addr;
351 rs->rs_conn_port = sin->sin_port;
352
353out:
354 release_sock(sk);
355 return ret;
356}
357
358static struct proto rds_proto = {
359 .name = "RDS",
360 .owner = THIS_MODULE,
361 .obj_size = sizeof(struct rds_sock),
362};
363
364static struct proto_ops rds_proto_ops = {
365 .family = AF_RDS,
366 .owner = THIS_MODULE,
367 .release = rds_release,
368 .bind = rds_bind,
369 .connect = rds_connect,
370 .socketpair = sock_no_socketpair,
371 .accept = sock_no_accept,
372 .getname = rds_getname,
373 .poll = rds_poll,
374 .ioctl = rds_ioctl,
375 .listen = sock_no_listen,
376 .shutdown = sock_no_shutdown,
377 .setsockopt = rds_setsockopt,
378 .getsockopt = rds_getsockopt,
379 .sendmsg = rds_sendmsg,
380 .recvmsg = rds_recvmsg,
381 .mmap = sock_no_mmap,
382 .sendpage = sock_no_sendpage,
383};
384
385static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
386{
387 unsigned long flags;
388 struct rds_sock *rs;
389
390 sock_init_data(sock, sk);
391 sock->ops = &rds_proto_ops;
392 sk->sk_protocol = protocol;
393
394 rs = rds_sk_to_rs(sk);
395 spin_lock_init(&rs->rs_lock);
396 rwlock_init(&rs->rs_recv_lock);
397 INIT_LIST_HEAD(&rs->rs_send_queue);
398 INIT_LIST_HEAD(&rs->rs_recv_queue);
399 INIT_LIST_HEAD(&rs->rs_notify_queue);
400 INIT_LIST_HEAD(&rs->rs_cong_list);
401 spin_lock_init(&rs->rs_rdma_lock);
402 rs->rs_rdma_keys = RB_ROOT;
403
404 spin_lock_irqsave(&rds_sock_lock, flags);
405 list_add_tail(&rs->rs_item, &rds_sock_list);
406 rds_sock_count++;
407 spin_unlock_irqrestore(&rds_sock_lock, flags);
408
409 return 0;
410}
411
412static int rds_create(struct net *net, struct socket *sock, int protocol)
413{
414 struct sock *sk;
415
416 if (sock->type != SOCK_SEQPACKET || protocol)
417 return -ESOCKTNOSUPPORT;
418
419 sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
420 if (!sk)
421 return -ENOMEM;
422
423 return __rds_create(sock, sk, protocol);
424}
425
426void rds_sock_addref(struct rds_sock *rs)
427{
428 sock_hold(rds_rs_to_sk(rs));
429}
430
431void rds_sock_put(struct rds_sock *rs)
432{
433 sock_put(rds_rs_to_sk(rs));
434}
435
436static struct net_proto_family rds_family_ops = {
437 .family = AF_RDS,
438 .create = rds_create,
439 .owner = THIS_MODULE,
440};
441
442static void rds_sock_inc_info(struct socket *sock, unsigned int len,
443 struct rds_info_iterator *iter,
444 struct rds_info_lengths *lens)
445{
446 struct rds_sock *rs;
447 struct sock *sk;
448 struct rds_incoming *inc;
449 unsigned long flags;
450 unsigned int total = 0;
451
452 len /= sizeof(struct rds_info_message);
453
454 spin_lock_irqsave(&rds_sock_lock, flags);
455
456 list_for_each_entry(rs, &rds_sock_list, rs_item) {
457 sk = rds_rs_to_sk(rs);
458 read_lock(&rs->rs_recv_lock);
459
460 /* XXX too lazy to maintain counts.. */
461 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
462 total++;
463 if (total <= len)
464 rds_inc_info_copy(inc, iter, inc->i_saddr,
465 rs->rs_bound_addr, 1);
466 }
467
468 read_unlock(&rs->rs_recv_lock);
469 }
470
471 spin_unlock_irqrestore(&rds_sock_lock, flags);
472
473 lens->nr = total;
474 lens->each = sizeof(struct rds_info_message);
475}
476
477static void rds_sock_info(struct socket *sock, unsigned int len,
478 struct rds_info_iterator *iter,
479 struct rds_info_lengths *lens)
480{
481 struct rds_info_socket sinfo;
482 struct rds_sock *rs;
483 unsigned long flags;
484
485 len /= sizeof(struct rds_info_socket);
486
487 spin_lock_irqsave(&rds_sock_lock, flags);
488
489 if (len < rds_sock_count)
490 goto out;
491
492 list_for_each_entry(rs, &rds_sock_list, rs_item) {
493 sinfo.sndbuf = rds_sk_sndbuf(rs);
494 sinfo.rcvbuf = rds_sk_rcvbuf(rs);
495 sinfo.bound_addr = rs->rs_bound_addr;
496 sinfo.connected_addr = rs->rs_conn_addr;
497 sinfo.bound_port = rs->rs_bound_port;
498 sinfo.connected_port = rs->rs_conn_port;
499 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
500
501 rds_info_copy(iter, &sinfo, sizeof(sinfo));
502 }
503
504out:
505 lens->nr = rds_sock_count;
506 lens->each = sizeof(struct rds_info_socket);
507
508 spin_unlock_irqrestore(&rds_sock_lock, flags);
509}
510
511static void __exit rds_exit(void)
512{
513 rds_rdma_exit();
514 sock_unregister(rds_family_ops.family);
515 proto_unregister(&rds_proto);
516 rds_conn_exit();
517 rds_cong_exit();
518 rds_sysctl_exit();
519 rds_threads_exit();
520 rds_stats_exit();
521 rds_page_exit();
522 rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
523 rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
524}
525module_exit(rds_exit);
526
527static int __init rds_init(void)
528{
529 int ret;
530
531 ret = rds_conn_init();
532 if (ret)
533 goto out;
534 ret = rds_threads_init();
535 if (ret)
536 goto out_conn;
537 ret = rds_sysctl_init();
538 if (ret)
539 goto out_threads;
540 ret = rds_stats_init();
541 if (ret)
542 goto out_sysctl;
543 ret = proto_register(&rds_proto, 1);
544 if (ret)
545 goto out_stats;
546 ret = sock_register(&rds_family_ops);
547 if (ret)
548 goto out_proto;
549
550 rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
551 rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
552
553 /* ib/iwarp transports currently compiled-in */
554 ret = rds_rdma_init();
555 if (ret)
556 goto out_sock;
557 goto out;
558
559out_sock:
560 sock_unregister(rds_family_ops.family);
561out_proto:
562 proto_unregister(&rds_proto);
563out_stats:
564 rds_stats_exit();
565out_sysctl:
566 rds_sysctl_exit();
567out_threads:
568 rds_threads_exit();
569out_conn:
570 rds_conn_exit();
571 rds_cong_exit();
572 rds_page_exit();
573out:
574 return ret;
575}
576module_init(rds_init);
577
578#define DRV_VERSION "4.0"
579#define DRV_RELDATE "Feb 12, 2009"
580
581MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
582MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
583 " v" DRV_VERSION " (" DRV_RELDATE ")");
584MODULE_VERSION(DRV_VERSION);
585MODULE_LICENSE("Dual BSD/GPL");
586MODULE_ALIAS_NETPROTO(PF_RDS);
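
As a rough illustration of the socket family this file registers (not part of the patch itself), a userspace program would drive AF_RDS along the lines of the sketch below, following the poll() semantics documented above rds_poll(). PF_RDS normally comes from the uapi headers added with this series; the fallback value here is the AF_RDS number reserved for it, and the bind address is a placeholder for a local IPoIB address.

/* Illustrative userspace sketch only; not part of this patch. */
#include <stdio.h>
#include <string.h>
#include <poll.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef PF_RDS
#define PF_RDS 21	/* assumed AF_RDS/PF_RDS value from the uapi headers */
#endif

int main(void)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	int fd;

	/* RDS sockets are SOCK_SEQPACKET with protocol 0 (see rds_create()) */
	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
	if (fd < 0) {
		perror("socket(PF_RDS)");
		return 1;
	}

	/* Bind to a local address; sin_port == 0 lets rds_bind() pick a port */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr("192.168.0.1");	/* placeholder IPoIB addr */
	sin.sin_port = 0;
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		perror("bind");
		return 1;
	}

	/* POLLIN: queued data, notifications, or congestion updates.
	 * POLLOUT: room on the send queue (a send may still hit ENOBUFS). */
	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	poll(&pfd, 1, 1000);
	return 0;
}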
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 000000000000..c17cc39160ce
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,199 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36#include <linux/if_arp.h>
37#include "rds.h"
38
39/*
40 * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
41 * particularly zippy.
42 *
43 * This is now called for every incoming frame so we arguably care much more
44 * about it than we used to.
45 */
46static DEFINE_SPINLOCK(rds_bind_lock);
47static struct rb_root rds_bind_tree = RB_ROOT;
48
49static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
50 struct rds_sock *insert)
51{
52 struct rb_node **p = &rds_bind_tree.rb_node;
53 struct rb_node *parent = NULL;
54 struct rds_sock *rs;
55 u64 cmp;
56 u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
57
58 while (*p) {
59 parent = *p;
60 rs = rb_entry(parent, struct rds_sock, rs_bound_node);
61
62 cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
63 be16_to_cpu(rs->rs_bound_port);
64
65 if (needle < cmp)
66 p = &(*p)->rb_left;
67 else if (needle > cmp)
68 p = &(*p)->rb_right;
69 else
70 return rs;
71 }
72
73 if (insert) {
74 rb_link_node(&insert->rs_bound_node, parent, p);
75 rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
76 }
77 return NULL;
78}
79
80/*
81 * Return the rds_sock bound at the given local address.
82 *
83 * The rx path can race with rds_release. We notice if rds_release() has
84 * marked this socket and don't return a rs ref to the rx path.
85 */
86struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
87{
88 struct rds_sock *rs;
89 unsigned long flags;
90
91 spin_lock_irqsave(&rds_bind_lock, flags);
92 rs = rds_bind_tree_walk(addr, port, NULL);
93 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
94 rds_sock_addref(rs);
95 else
96 rs = NULL;
97 spin_unlock_irqrestore(&rds_bind_lock, flags);
98
99 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
100 ntohs(port));
101 return rs;
102}
103
104/* returns -ve errno or +ve port */
105static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
106{
107 unsigned long flags;
108 int ret = -EADDRINUSE;
109 u16 rover, last;
110
111 if (*port != 0) {
112 rover = be16_to_cpu(*port);
113 last = rover;
114 } else {
115 rover = max_t(u16, net_random(), 2);
116 last = rover - 1;
117 }
118
119 spin_lock_irqsave(&rds_bind_lock, flags);
120
121 do {
122 if (rover == 0)
123 rover++;
124 if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
125 *port = cpu_to_be16(rover);
126 ret = 0;
127 break;
128 }
129 } while (rover++ != last);
130
131 if (ret == 0) {
132 rs->rs_bound_addr = addr;
133 rs->rs_bound_port = *port;
134 rds_sock_addref(rs);
135
136 rdsdebug("rs %p binding to %pI4:%d\n",
137 rs, &addr, (int)ntohs(*port));
138 }
139
140 spin_unlock_irqrestore(&rds_bind_lock, flags);
141
142 return ret;
143}
144
145void rds_remove_bound(struct rds_sock *rs)
146{
147 unsigned long flags;
148
149 spin_lock_irqsave(&rds_bind_lock, flags);
150
151 if (rs->rs_bound_addr) {
152 rdsdebug("rs %p unbinding from %pI4:%d\n",
153 rs, &rs->rs_bound_addr,
154 ntohs(rs->rs_bound_port));
155
156 rb_erase(&rs->rs_bound_node, &rds_bind_tree);
157 rds_sock_put(rs);
158 rs->rs_bound_addr = 0;
159 }
160
161 spin_unlock_irqrestore(&rds_bind_lock, flags);
162}
163
164int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
165{
166 struct sock *sk = sock->sk;
167 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
168 struct rds_sock *rs = rds_sk_to_rs(sk);
169 struct rds_transport *trans;
170 int ret = 0;
171
172 lock_sock(sk);
173
174 if (addr_len != sizeof(struct sockaddr_in) ||
175 sin->sin_family != AF_INET ||
176 rs->rs_bound_addr ||
177 sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
178 ret = -EINVAL;
179 goto out;
180 }
181
182 ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
183 if (ret)
184 goto out;
185
186 trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
187 if (trans == NULL) {
188 ret = -EADDRNOTAVAIL;
189 rds_remove_bound(rs);
190 goto out;
191 }
192
193 rs->rs_transport = trans;
194 ret = 0;
195
196out:
197 release_sock(sk);
198 return ret;
199}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 000000000000..710e4599d76c
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,404 @@
1/*
2 * Copyright (c) 2007 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/types.h>
34#include <linux/rbtree.h>
35
36#include <asm-generic/bitops/le.h>
37
38#include "rds.h"
39
40/*
41 * This file implements the receive side of the unconventional congestion
42 * management in RDS.
43 *
44 * Messages waiting in the receive queue on the receiving socket are accounted
 45 * against the socket's SO_RCVBUF option value. Only the payload bytes in the
46 * message are accounted for. If the number of bytes queued equals or exceeds
47 * rcvbuf then the socket is congested. All sends attempted to this socket's
48 * address should return block or return -EWOULDBLOCK.
49 *
50 * Applications are expected to be reasonably tuned such that this situation
51 * very rarely occurs. An application encountering this "back-pressure" is
52 * considered a bug.
53 *
54 * This is implemented by having each node maintain bitmaps which indicate
55 * which ports on bound addresses are congested. As the bitmap changes it is
56 * sent through all the connections which terminate in the local address of the
57 * bitmap which changed.
58 *
59 * The bitmaps are allocated as connections are brought up. This avoids
60 * allocation in the interrupt handling path which queues messages on sockets.
61 * The dense bitmaps let transports send the entire bitmap on any bitmap change
62 * reasonably efficiently. This is much easier to implement than some
63 * finer-grained communication of per-port congestion. The sender does a very
64 * inexpensive bit test to test if the port it's about to send to is congested
65 * or not.
66 */
67
68/*
69 * Interaction with poll is a tad tricky. We want all processes stuck in
70 * poll to wake up and check whether a congested destination became uncongested.
71 * The really sad thing is we have no idea which destinations the application
72 * wants to send to - we don't even know which rds_connections are involved.
73 * So until we implement a more flexible rds poll interface, we have to make
74 * do with this:
75 * We maintain a global counter that is incremented each time a congestion map
76 * update is received. Each rds socket tracks this value, and if rds_poll
77 * finds that the saved generation number is smaller than the global generation
78 * number, it wakes up the process.
79 */
80static atomic_t rds_cong_generation = ATOMIC_INIT(0);
81
82/*
83 * Congestion monitoring
84 */
85static LIST_HEAD(rds_cong_monitor);
86static DEFINE_RWLOCK(rds_cong_monitor_lock);
87
88/*
89 * Yes, a global lock. It's used so infrequently that it's worth keeping it
90 * global to simplify the locking. It's only used in the following
91 * circumstances:
92 *
93 * - on connection buildup to associate a conn with its maps
94 * - on map changes to inform conns of a new map to send
95 *
96 * It's sadly ordered under the socket callback lock and the connection lock.
97 * Receive paths can mark ports congested from interrupt context so the
98 * lock masks interrupts.
99 */
100static DEFINE_SPINLOCK(rds_cong_lock);
101static struct rb_root rds_cong_tree = RB_ROOT;
102
103static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
104 struct rds_cong_map *insert)
105{
106 struct rb_node **p = &rds_cong_tree.rb_node;
107 struct rb_node *parent = NULL;
108 struct rds_cong_map *map;
109
110 while (*p) {
111 parent = *p;
112 map = rb_entry(parent, struct rds_cong_map, m_rb_node);
113
114 if (addr < map->m_addr)
115 p = &(*p)->rb_left;
116 else if (addr > map->m_addr)
117 p = &(*p)->rb_right;
118 else
119 return map;
120 }
121
122 if (insert) {
123 rb_link_node(&insert->m_rb_node, parent, p);
124 rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
125 }
126 return NULL;
127}
128
129/*
130 * There is only ever one bitmap for any address. Connections try and allocate
131 * these bitmaps in the process getting pointers to them. The bitmaps are only
132 * ever freed as the module is removed after all connections have been freed.
133 */
134static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
135{
136 struct rds_cong_map *map;
137 struct rds_cong_map *ret = NULL;
138 unsigned long zp;
139 unsigned long i;
140 unsigned long flags;
141
142 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
143 if (map == NULL)
144 return NULL;
145
146 map->m_addr = addr;
147 init_waitqueue_head(&map->m_waitq);
148 INIT_LIST_HEAD(&map->m_conn_list);
149
150 for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
151 zp = get_zeroed_page(GFP_KERNEL);
152 if (zp == 0)
153 goto out;
154 map->m_page_addrs[i] = zp;
155 }
156
157 spin_lock_irqsave(&rds_cong_lock, flags);
158 ret = rds_cong_tree_walk(addr, map);
159 spin_unlock_irqrestore(&rds_cong_lock, flags);
160
161 if (ret == NULL) {
162 ret = map;
163 map = NULL;
164 }
165
166out:
167 if (map) {
168 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
169 free_page(map->m_page_addrs[i]);
170 kfree(map);
171 }
172
173 rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
174
175 return ret;
176}
177
178/*
179 * Put the conn on its local map's list. This is called when the conn is
180 * really added to the hash. It's nested under the rds_conn_lock, sadly.
181 */
182void rds_cong_add_conn(struct rds_connection *conn)
183{
184 unsigned long flags;
185
186 rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
187 spin_lock_irqsave(&rds_cong_lock, flags);
188 list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
189 spin_unlock_irqrestore(&rds_cong_lock, flags);
190}
191
192void rds_cong_remove_conn(struct rds_connection *conn)
193{
194 unsigned long flags;
195
196 rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
197 spin_lock_irqsave(&rds_cong_lock, flags);
198 list_del_init(&conn->c_map_item);
199 spin_unlock_irqrestore(&rds_cong_lock, flags);
200}
201
202int rds_cong_get_maps(struct rds_connection *conn)
203{
204 conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
205 conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
206
207 if (conn->c_lcong == NULL || conn->c_fcong == NULL)
208 return -ENOMEM;
209
210 return 0;
211}
212
213void rds_cong_queue_updates(struct rds_cong_map *map)
214{
215 struct rds_connection *conn;
216 unsigned long flags;
217
218 spin_lock_irqsave(&rds_cong_lock, flags);
219
220 list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
221 if (!test_and_set_bit(0, &conn->c_map_queued)) {
222 rds_stats_inc(s_cong_update_queued);
223 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
224 }
225 }
226
227 spin_unlock_irqrestore(&rds_cong_lock, flags);
228}
229
230void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
231{
232 rdsdebug("waking map %p for %pI4\n",
233 map, &map->m_addr);
234 rds_stats_inc(s_cong_update_received);
235 atomic_inc(&rds_cong_generation);
236 if (waitqueue_active(&map->m_waitq))
237 wake_up(&map->m_waitq);
238 if (waitqueue_active(&rds_poll_waitq))
239 wake_up_all(&rds_poll_waitq);
240
241 if (portmask && !list_empty(&rds_cong_monitor)) {
242 unsigned long flags;
243 struct rds_sock *rs;
244
245 read_lock_irqsave(&rds_cong_monitor_lock, flags);
246 list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
247 spin_lock(&rs->rs_lock);
248 rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
249 rs->rs_cong_mask &= ~portmask;
250 spin_unlock(&rs->rs_lock);
251 if (rs->rs_cong_notify)
252 rds_wake_sk_sleep(rs);
253 }
254 read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
255 }
256}
257
258int rds_cong_updated_since(unsigned long *recent)
259{
260 unsigned long gen = atomic_read(&rds_cong_generation);
261
262 if (likely(*recent == gen))
263 return 0;
264 *recent = gen;
265 return 1;
266}
267
268/*
269 * We're called under the locking that protects the socket's receive buffer
270 * consumption. This makes it a lot easier for the caller to only call us
271 * when it knows that an existing set bit needs to be cleared, and vice versa.
272 * We can't block and we need to deal with concurrent sockets working against
273 * the same per-address map.
274 */
275void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
276{
277 unsigned long i;
278 unsigned long off;
279
280 rdsdebug("setting congestion for %pI4:%u in map %p\n",
281 &map->m_addr, ntohs(port), map);
282
283 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
284 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
285
286 generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
287}
288
289void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
290{
291 unsigned long i;
292 unsigned long off;
293
294 rdsdebug("clearing congestion for %pI4:%u in map %p\n",
295 &map->m_addr, ntohs(port), map);
296
297 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
298 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
299
300 generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
301}
302
303static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
304{
305 unsigned long i;
306 unsigned long off;
307
308 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
309 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
310
311 return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
312}
313
314void rds_cong_add_socket(struct rds_sock *rs)
315{
316 unsigned long flags;
317
318 write_lock_irqsave(&rds_cong_monitor_lock, flags);
319 if (list_empty(&rs->rs_cong_list))
320 list_add(&rs->rs_cong_list, &rds_cong_monitor);
321 write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
322}
323
324void rds_cong_remove_socket(struct rds_sock *rs)
325{
326 unsigned long flags;
327 struct rds_cong_map *map;
328
329 write_lock_irqsave(&rds_cong_monitor_lock, flags);
330 list_del_init(&rs->rs_cong_list);
331 write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
332
333 /* update congestion map for now-closed port */
334 spin_lock_irqsave(&rds_cong_lock, flags);
335 map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
336 spin_unlock_irqrestore(&rds_cong_lock, flags);
337
338 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
339 rds_cong_clear_bit(map, rs->rs_bound_port);
340 rds_cong_queue_updates(map);
341 }
342}
343
344int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
345 struct rds_sock *rs)
346{
347 if (!rds_cong_test_bit(map, port))
348 return 0;
349 if (nonblock) {
350 if (rs && rs->rs_cong_monitor) {
351 unsigned long flags;
352
353 /* It would have been nice to have an atomic set_bit on
354 * a uint64_t. */
355 spin_lock_irqsave(&rs->rs_lock, flags);
356 rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
357 spin_unlock_irqrestore(&rs->rs_lock, flags);
358
359 /* Test again - a congestion update may have arrived in
360 * the meantime. */
361 if (!rds_cong_test_bit(map, port))
362 return 0;
363 }
364 rds_stats_inc(s_cong_send_error);
365 return -ENOBUFS;
366 }
367
368 rds_stats_inc(s_cong_send_blocked);
369 rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
370
371 return wait_event_interruptible(map->m_waitq,
372 !rds_cong_test_bit(map, port));
373}
374
375void rds_cong_exit(void)
376{
377 struct rb_node *node;
378 struct rds_cong_map *map;
379 unsigned long i;
380
381 while ((node = rb_first(&rds_cong_tree))) {
382 map = rb_entry(node, struct rds_cong_map, m_rb_node);
383 rdsdebug("freeing map %p\n", map);
384 rb_erase(&map->m_rb_node, &rds_cong_tree);
385 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
386 free_page(map->m_page_addrs[i]);
387 kfree(map);
388 }
389}
390
391/*
392 * Allocate a RDS message containing a congestion update.
393 */
394struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
395{
396 struct rds_cong_map *map = conn->c_lcong;
397 struct rds_message *rm;
398
399 rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
400 if (!IS_ERR(rm))
401 rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
402
403 return rm;
404}
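
The rds_cong_wait() path above returns -ENOBUFS to non-blocking senders and, when the socket has enabled congestion monitoring, arms a per-port mask so that a later congestion map update wakes the socket via POLLIN. A rough userspace sketch of that retry loop follows (not part of the patch); SOL_RDS and RDS_CONG_MONITOR are assumed to come from the uapi header added with this series, and the fallback values below are best-effort assumptions.

/* Illustrative userspace sketch only; not part of this patch. */
#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>

#ifndef SOL_RDS
#define SOL_RDS 276		/* assumed value from the uapi headers */
#endif
#ifndef RDS_CONG_MONITOR
#define RDS_CONG_MONITOR 6	/* assumed value from the uapi headers */
#endif

/* Send on an already-bound RDS socket without blocking.  On ENOBUFS the
 * destination port is congested; with RDS_CONG_MONITOR enabled a later
 * POLLIN signals a congestion map update, after which we retry. */
int rds_send_nonblock(int fd, const void *buf, size_t len,
		      const struct sockaddr *dest, socklen_t dest_len)
{
	int on = 1;

	if (setsockopt(fd, SOL_RDS, RDS_CONG_MONITOR, &on, sizeof(on)) < 0)
		perror("setsockopt(RDS_CONG_MONITOR)");

	for (;;) {
		ssize_t ret = sendto(fd, buf, len, MSG_DONTWAIT, dest, dest_len);

		if (ret >= 0)
			return 0;
		if (errno != ENOBUFS)
			return -1;

		/* Destination congested: wait for a congestion update
		 * (POLLIN may also mean ordinary incoming data). */
		struct pollfd pfd = { .fd = fd, .events = POLLIN };
		if (poll(&pfd, 1, -1) < 0)
			return -1;
	}
}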
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 000000000000..273f064930a8
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,487 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/list.h>
35#include <net/inet_hashtables.h>
36
37#include "rds.h"
38#include "loop.h"
39#include "rdma.h"
40
41#define RDS_CONNECTION_HASH_BITS 12
42#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
43#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
44
45/* converting this to RCU is a chore for another day.. */
46static DEFINE_SPINLOCK(rds_conn_lock);
47static unsigned long rds_conn_count;
48static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
49static struct kmem_cache *rds_conn_slab;
50
51static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
52{
53 /* Pass NULL, don't need struct net for hash */
54 unsigned long hash = inet_ehashfn(NULL,
55 be32_to_cpu(laddr), 0,
56 be32_to_cpu(faddr), 0);
57 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
58}
59
60#define rds_conn_info_set(var, test, suffix) do { \
61 if (test) \
62 var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
63} while (0)
64
65static inline int rds_conn_is_sending(struct rds_connection *conn)
66{
67 int ret = 0;
68
69 if (!mutex_trylock(&conn->c_send_lock))
70 ret = 1;
71 else
72 mutex_unlock(&conn->c_send_lock);
73
74 return ret;
75}
76
77static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
78 __be32 laddr, __be32 faddr,
79 struct rds_transport *trans)
80{
81 struct rds_connection *conn, *ret = NULL;
82 struct hlist_node *pos;
83
84 hlist_for_each_entry(conn, pos, head, c_hash_node) {
85 if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
86 conn->c_trans == trans) {
87 ret = conn;
88 break;
89 }
90 }
91 rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
92 &laddr, &faddr);
93 return ret;
94}
95
96/*
97 * This is called by transports as they're bringing down a connection.
98 * It clears partial message state so that the transport can start sending
99 * and receiving over this connection again in the future. It is up to
100 * the transport to have serialized this call with its send and recv.
101 */
102void rds_conn_reset(struct rds_connection *conn)
103{
104 rdsdebug("connection %pI4 to %pI4 reset\n",
105 &conn->c_laddr, &conn->c_faddr);
106
107 rds_stats_inc(s_conn_reset);
108 rds_send_reset(conn);
109 conn->c_flags = 0;
110
111 /* Do not clear next_rx_seq here, else we cannot distinguish
112 * retransmitted packets from new packets, and will hand all
113 * of them to the application. That is not consistent with the
114 * reliability guarantees of RDS. */
115}
116
117/*
118 * There is only ever one 'conn' for a given pair of addresses in the
119 * system at a time. They contain messages to be retransmitted and so
120 * span the lifetime of the actual underlying transport connections.
121 *
122 * For now they are not garbage collected once they're created. They
123 * are torn down as the module is removed, if ever.
124 */
125static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
126 struct rds_transport *trans, gfp_t gfp,
127 int is_outgoing)
128{
129 struct rds_connection *conn, *tmp, *parent = NULL;
130 struct hlist_head *head = rds_conn_bucket(laddr, faddr);
131 unsigned long flags;
132 int ret;
133
134 spin_lock_irqsave(&rds_conn_lock, flags);
135 conn = rds_conn_lookup(head, laddr, faddr, trans);
136 if (conn
137 && conn->c_loopback
138 && conn->c_trans != &rds_loop_transport
139 && !is_outgoing) {
140 /* This is a looped back IB connection, and we're
141 * called by the code handling the incoming connect.
142 * We need a second connection object into which we
143 * can stick the other QP. */
144 parent = conn;
145 conn = parent->c_passive;
146 }
147 spin_unlock_irqrestore(&rds_conn_lock, flags);
148 if (conn)
149 goto out;
150
151 conn = kmem_cache_alloc(rds_conn_slab, gfp);
152 if (conn == NULL) {
153 conn = ERR_PTR(-ENOMEM);
154 goto out;
155 }
156
157 memset(conn, 0, sizeof(*conn));
158
159 INIT_HLIST_NODE(&conn->c_hash_node);
160 conn->c_version = RDS_PROTOCOL_3_0;
161 conn->c_laddr = laddr;
162 conn->c_faddr = faddr;
163 spin_lock_init(&conn->c_lock);
164 conn->c_next_tx_seq = 1;
165
166 mutex_init(&conn->c_send_lock);
167 INIT_LIST_HEAD(&conn->c_send_queue);
168 INIT_LIST_HEAD(&conn->c_retrans);
169
170 ret = rds_cong_get_maps(conn);
171 if (ret) {
172 kmem_cache_free(rds_conn_slab, conn);
173 conn = ERR_PTR(ret);
174 goto out;
175 }
176
177 /*
178 * This is where a connection becomes loopback. If *any* RDS sockets
179 * can bind to the destination address then we'd rather the messages
180 * flow through loopback rather than either transport.
181 */
182 if (rds_trans_get_preferred(faddr)) {
183 conn->c_loopback = 1;
184 if (is_outgoing && trans->t_prefer_loopback) {
185 /* "outgoing" connection - and the transport
186 * says it wants the connection handled by the
187 * loopback transport. This is what TCP does.
188 */
189 trans = &rds_loop_transport;
190 }
191 }
192
193 conn->c_trans = trans;
194
195 ret = trans->conn_alloc(conn, gfp);
196 if (ret) {
197 kmem_cache_free(rds_conn_slab, conn);
198 conn = ERR_PTR(ret);
199 goto out;
200 }
201
202 atomic_set(&conn->c_state, RDS_CONN_DOWN);
203 conn->c_reconnect_jiffies = 0;
204 INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
205 INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
206 INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
207 INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
208 mutex_init(&conn->c_cm_lock);
209 conn->c_flags = 0;
210
211 rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
212 conn, &laddr, &faddr,
213 trans->t_name ? trans->t_name : "[unknown]",
214 is_outgoing ? "(outgoing)" : "");
215
216 spin_lock_irqsave(&rds_conn_lock, flags);
217 if (parent == NULL) {
218 tmp = rds_conn_lookup(head, laddr, faddr, trans);
219 if (tmp == NULL)
220 hlist_add_head(&conn->c_hash_node, head);
221 } else {
222 tmp = parent->c_passive;
223 if (!tmp)
224 parent->c_passive = conn;
225 }
226
227 if (tmp) {
228 trans->conn_free(conn->c_transport_data);
229 kmem_cache_free(rds_conn_slab, conn);
230 conn = tmp;
231 } else {
232 rds_cong_add_conn(conn);
233 rds_conn_count++;
234 }
235
236 spin_unlock_irqrestore(&rds_conn_lock, flags);
237
238out:
239 return conn;
240}
241
242struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
243 struct rds_transport *trans, gfp_t gfp)
244{
245 return __rds_conn_create(laddr, faddr, trans, gfp, 0);
246}
247
248struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
249 struct rds_transport *trans, gfp_t gfp)
250{
251 return __rds_conn_create(laddr, faddr, trans, gfp, 1);
252}
253
254void rds_conn_destroy(struct rds_connection *conn)
255{
256 struct rds_message *rm, *rtmp;
257
258 rdsdebug("freeing conn %p for %pI4 -> "
259 "%pI4\n", conn, &conn->c_laddr,
260 &conn->c_faddr);
261
262 hlist_del_init(&conn->c_hash_node);
263
264 /* wait for the rds thread to shut it down */
265 atomic_set(&conn->c_state, RDS_CONN_ERROR);
266 cancel_delayed_work(&conn->c_conn_w);
267 queue_work(rds_wq, &conn->c_down_w);
268 flush_workqueue(rds_wq);
269
270 /* tear down queued messages */
271 list_for_each_entry_safe(rm, rtmp,
272 &conn->c_send_queue,
273 m_conn_item) {
274 list_del_init(&rm->m_conn_item);
275 BUG_ON(!list_empty(&rm->m_sock_item));
276 rds_message_put(rm);
277 }
278 if (conn->c_xmit_rm)
279 rds_message_put(conn->c_xmit_rm);
280
281 conn->c_trans->conn_free(conn->c_transport_data);
282
283 /*
284 * The congestion maps aren't freed up here. They're
285 * freed by rds_cong_exit() after all the connections
286 * have been freed.
287 */
288 rds_cong_remove_conn(conn);
289
290 BUG_ON(!list_empty(&conn->c_retrans));
291 kmem_cache_free(rds_conn_slab, conn);
292
293 rds_conn_count--;
294}
295
296static void rds_conn_message_info(struct socket *sock, unsigned int len,
297 struct rds_info_iterator *iter,
298 struct rds_info_lengths *lens,
299 int want_send)
300{
301 struct hlist_head *head;
302 struct hlist_node *pos;
303 struct list_head *list;
304 struct rds_connection *conn;
305 struct rds_message *rm;
306 unsigned long flags;
307 unsigned int total = 0;
308 size_t i;
309
310 len /= sizeof(struct rds_info_message);
311
312 spin_lock_irqsave(&rds_conn_lock, flags);
313
314 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
315 i++, head++) {
316 hlist_for_each_entry(conn, pos, head, c_hash_node) {
317 if (want_send)
318 list = &conn->c_send_queue;
319 else
320 list = &conn->c_retrans;
321
322 spin_lock(&conn->c_lock);
323
324 /* XXX too lazy to maintain counts.. */
325 list_for_each_entry(rm, list, m_conn_item) {
326 total++;
327 if (total <= len)
328 rds_inc_info_copy(&rm->m_inc, iter,
329 conn->c_laddr,
330 conn->c_faddr, 0);
331 }
332
333 spin_unlock(&conn->c_lock);
334 }
335 }
336
337 spin_unlock_irqrestore(&rds_conn_lock, flags);
338
339 lens->nr = total;
340 lens->each = sizeof(struct rds_info_message);
341}
342
343static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
344 struct rds_info_iterator *iter,
345 struct rds_info_lengths *lens)
346{
347 rds_conn_message_info(sock, len, iter, lens, 1);
348}
349
350static void rds_conn_message_info_retrans(struct socket *sock,
351 unsigned int len,
352 struct rds_info_iterator *iter,
353 struct rds_info_lengths *lens)
354{
355 rds_conn_message_info(sock, len, iter, lens, 0);
356}
357
358void rds_for_each_conn_info(struct socket *sock, unsigned int len,
359 struct rds_info_iterator *iter,
360 struct rds_info_lengths *lens,
361 int (*visitor)(struct rds_connection *, void *),
362 size_t item_len)
363{
364 uint64_t buffer[(item_len + 7) / 8];
365 struct hlist_head *head;
366 struct hlist_node *pos;
367 struct hlist_node *tmp;
368 struct rds_connection *conn;
369 unsigned long flags;
370 size_t i;
371
372 spin_lock_irqsave(&rds_conn_lock, flags);
373
374 lens->nr = 0;
375 lens->each = item_len;
376
377 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
378 i++, head++) {
379 hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
380
381 /* XXX no c_lock usage.. */
382 if (!visitor(conn, buffer))
383 continue;
384
385 /* We copy as much as we can fit in the buffer,
386 * but we count all items so that the caller
387 * can resize the buffer. */
388 if (len >= item_len) {
389 rds_info_copy(iter, buffer, item_len);
390 len -= item_len;
391 }
392 lens->nr++;
393 }
394 }
395
396 spin_unlock_irqrestore(&rds_conn_lock, flags);
397}
398
399static int rds_conn_info_visitor(struct rds_connection *conn,
400 void *buffer)
401{
402 struct rds_info_connection *cinfo = buffer;
403
404 cinfo->next_tx_seq = conn->c_next_tx_seq;
405 cinfo->next_rx_seq = conn->c_next_rx_seq;
406 cinfo->laddr = conn->c_laddr;
407 cinfo->faddr = conn->c_faddr;
408 strncpy(cinfo->transport, conn->c_trans->t_name,
409 sizeof(cinfo->transport));
410 cinfo->flags = 0;
411
412 rds_conn_info_set(cinfo->flags,
413 rds_conn_is_sending(conn), SENDING);
414 /* XXX Future: return the state rather than these funky bits */
415 rds_conn_info_set(cinfo->flags,
416 atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
417 CONNECTING);
418 rds_conn_info_set(cinfo->flags,
419 atomic_read(&conn->c_state) == RDS_CONN_UP,
420 CONNECTED);
421 return 1;
422}
423
424static void rds_conn_info(struct socket *sock, unsigned int len,
425 struct rds_info_iterator *iter,
426 struct rds_info_lengths *lens)
427{
428 rds_for_each_conn_info(sock, len, iter, lens,
429 rds_conn_info_visitor,
430 sizeof(struct rds_info_connection));
431}
432
433int __init rds_conn_init(void)
434{
435 rds_conn_slab = kmem_cache_create("rds_connection",
436 sizeof(struct rds_connection),
437 0, 0, NULL);
438 if (rds_conn_slab == NULL)
439 return -ENOMEM;
440
441 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
442 rds_info_register_func(RDS_INFO_SEND_MESSAGES,
443 rds_conn_message_info_send);
444 rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
445 rds_conn_message_info_retrans);
446
447 return 0;
448}
449
450void rds_conn_exit(void)
451{
452 rds_loop_exit();
453
454 WARN_ON(!hlist_empty(rds_conn_hash));
455
456 kmem_cache_destroy(rds_conn_slab);
457
458 rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
459 rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
460 rds_conn_message_info_send);
461 rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
462 rds_conn_message_info_retrans);
463}
464
465/*
466 * Force a disconnect
467 */
468void rds_conn_drop(struct rds_connection *conn)
469{
470 atomic_set(&conn->c_state, RDS_CONN_ERROR);
471 queue_work(rds_wq, &conn->c_down_w);
472}
473
474/*
475 * An error occurred on the connection
476 */
477void
478__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
479{
480 va_list ap;
481
482 va_start(ap, fmt);
483 vprintk(fmt, ap);
484 va_end(ap);
485
486 rds_conn_drop(conn);
487}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 000000000000..06a7b798d9a7
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,323 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/if.h>
36#include <linux/netdevice.h>
37#include <linux/inetdevice.h>
38#include <linux/if_arp.h>
39#include <linux/delay.h>
40
41#include "rds.h"
42#include "ib.h"
43
44unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
45unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
46
47module_param(fmr_pool_size, int, 0444);
48MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
49module_param(fmr_message_size, int, 0444);
50MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
51
52struct list_head rds_ib_devices;
53
54DEFINE_SPINLOCK(ib_nodev_conns_lock);
55LIST_HEAD(ib_nodev_conns);
56
57void rds_ib_add_one(struct ib_device *device)
58{
59 struct rds_ib_device *rds_ibdev;
60 struct ib_device_attr *dev_attr;
61
62 /* Only handle IB (no iWARP) devices */
63 if (device->node_type != RDMA_NODE_IB_CA)
64 return;
65
66 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
67 if (!dev_attr)
68 return;
69
70 if (ib_query_device(device, dev_attr)) {
71 rdsdebug("Query device failed for %s\n", device->name);
72 goto free_attr;
73 }
74
75 rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
76 if (!rds_ibdev)
77 goto free_attr;
78
79 spin_lock_init(&rds_ibdev->spinlock);
80
81 rds_ibdev->max_wrs = dev_attr->max_qp_wr;
82 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
83
84 rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
85 rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
86 rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
87 rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
88 rds_ibdev->max_fmrs = dev_attr->max_fmr ?
89 min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
90 fmr_pool_size;
91
92 rds_ibdev->dev = device;
93 rds_ibdev->pd = ib_alloc_pd(device);
94 if (IS_ERR(rds_ibdev->pd))
95 goto free_dev;
96
97 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
98 IB_ACCESS_LOCAL_WRITE);
99 if (IS_ERR(rds_ibdev->mr))
100 goto err_pd;
101
102 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
103 if (IS_ERR(rds_ibdev->mr_pool)) {
104 rds_ibdev->mr_pool = NULL;
105 goto err_mr;
106 }
107
108 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
109 INIT_LIST_HEAD(&rds_ibdev->conn_list);
110 list_add_tail(&rds_ibdev->list, &rds_ib_devices);
111
112 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
113
114 goto free_attr;
115
116err_mr:
117 ib_dereg_mr(rds_ibdev->mr);
118err_pd:
119 ib_dealloc_pd(rds_ibdev->pd);
120free_dev:
121 kfree(rds_ibdev);
122free_attr:
123 kfree(dev_attr);
124}
125
126void rds_ib_remove_one(struct ib_device *device)
127{
128 struct rds_ib_device *rds_ibdev;
129 struct rds_ib_ipaddr *i_ipaddr, *i_next;
130
131 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
132 if (!rds_ibdev)
133 return;
134
135 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
136 list_del(&i_ipaddr->list);
137 kfree(i_ipaddr);
138 }
139
140 rds_ib_remove_conns(rds_ibdev);
141
142 if (rds_ibdev->mr_pool)
143 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
144
145 ib_dereg_mr(rds_ibdev->mr);
146
147 while (ib_dealloc_pd(rds_ibdev->pd)) {
148 rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
149 msleep(1);
150 }
151
152 list_del(&rds_ibdev->list);
153 kfree(rds_ibdev);
154}
155
156struct ib_client rds_ib_client = {
157 .name = "rds_ib",
158 .add = rds_ib_add_one,
159 .remove = rds_ib_remove_one
160};
161
162static int rds_ib_conn_info_visitor(struct rds_connection *conn,
163 void *buffer)
164{
165 struct rds_info_rdma_connection *iinfo = buffer;
166 struct rds_ib_connection *ic;
167
168 /* We will only ever look at IB transports */
169 if (conn->c_trans != &rds_ib_transport)
170 return 0;
171
172 iinfo->src_addr = conn->c_laddr;
173 iinfo->dst_addr = conn->c_faddr;
174
175 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
176 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
177 if (rds_conn_state(conn) == RDS_CONN_UP) {
178 struct rds_ib_device *rds_ibdev;
179 struct rdma_dev_addr *dev_addr;
180
181 ic = conn->c_transport_data;
182 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
183
184 ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
185 ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
186
187 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
188 iinfo->max_send_wr = ic->i_send_ring.w_nr;
189 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
190 iinfo->max_send_sge = rds_ibdev->max_sge;
191 rds_ib_get_mr_info(rds_ibdev, iinfo);
192 }
193 return 1;
194}
195
196static void rds_ib_ic_info(struct socket *sock, unsigned int len,
197 struct rds_info_iterator *iter,
198 struct rds_info_lengths *lens)
199{
200 rds_for_each_conn_info(sock, len, iter, lens,
201 rds_ib_conn_info_visitor,
202 sizeof(struct rds_info_rdma_connection));
203}
204
205
206/*
207 * Early RDS/IB was built to only bind to an address if there is an IPoIB
208 * device with that address set.
209 *
210 * If it were me, I'd advocate for something more flexible. Sending and
211 * receiving should be device-agnostic. Transports would try and maintain
212 * connections between peers who have messages queued. Userspace would be
213 * allowed to influence which paths have priority. We could call userspace
214 * asserting this policy "routing".
215 */
216static int rds_ib_laddr_check(__be32 addr)
217{
218 int ret;
219 struct rdma_cm_id *cm_id;
220 struct sockaddr_in sin;
221
222 /* Create a CMA ID and try to bind it. This catches both
223 * IB and iWARP capable NICs.
224 */
225 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
226 if (!cm_id)
227 return -EADDRNOTAVAIL;
228
229 memset(&sin, 0, sizeof(sin));
230 sin.sin_family = AF_INET;
231 sin.sin_addr.s_addr = addr;
232
233 /* rdma_bind_addr will only succeed for IB & iWARP devices */
234 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
235 /* due to this, we will claim to support iWARP devices unless we
236 check node_type. */
237 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
238 ret = -EADDRNOTAVAIL;
239
240 rdsdebug("addr %pI4 ret %d node type %d\n",
241 &addr, ret,
242 cm_id->device ? cm_id->device->node_type : -1);
243
244 rdma_destroy_id(cm_id);
245
246 return ret;
247}
248
249void rds_ib_exit(void)
250{
251 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
252 rds_ib_remove_nodev_conns();
253 ib_unregister_client(&rds_ib_client);
254 rds_ib_sysctl_exit();
255 rds_ib_recv_exit();
256 rds_trans_unregister(&rds_ib_transport);
257}
258
259struct rds_transport rds_ib_transport = {
260 .laddr_check = rds_ib_laddr_check,
261 .xmit_complete = rds_ib_xmit_complete,
262 .xmit = rds_ib_xmit,
263 .xmit_cong_map = NULL,
264 .xmit_rdma = rds_ib_xmit_rdma,
265 .recv = rds_ib_recv,
266 .conn_alloc = rds_ib_conn_alloc,
267 .conn_free = rds_ib_conn_free,
268 .conn_connect = rds_ib_conn_connect,
269 .conn_shutdown = rds_ib_conn_shutdown,
270 .inc_copy_to_user = rds_ib_inc_copy_to_user,
271 .inc_purge = rds_ib_inc_purge,
272 .inc_free = rds_ib_inc_free,
273 .cm_initiate_connect = rds_ib_cm_initiate_connect,
274 .cm_handle_connect = rds_ib_cm_handle_connect,
275 .cm_connect_complete = rds_ib_cm_connect_complete,
276 .stats_info_copy = rds_ib_stats_info_copy,
277 .exit = rds_ib_exit,
278 .get_mr = rds_ib_get_mr,
279 .sync_mr = rds_ib_sync_mr,
280 .free_mr = rds_ib_free_mr,
281 .flush_mrs = rds_ib_flush_mrs,
282 .t_owner = THIS_MODULE,
283 .t_name = "infiniband",
284};
285
286int __init rds_ib_init(void)
287{
288 int ret;
289
290 INIT_LIST_HEAD(&rds_ib_devices);
291
292 ret = ib_register_client(&rds_ib_client);
293 if (ret)
294 goto out;
295
296 ret = rds_ib_sysctl_init();
297 if (ret)
298 goto out_ibreg;
299
300 ret = rds_ib_recv_init();
301 if (ret)
302 goto out_sysctl;
303
304 ret = rds_trans_register(&rds_ib_transport);
305 if (ret)
306 goto out_recv;
307
308 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
309
310 goto out;
311
312out_recv:
313 rds_ib_recv_exit();
314out_sysctl:
315 rds_ib_sysctl_exit();
316out_ibreg:
317 ib_unregister_client(&rds_ib_client);
318out:
319 return ret;
320}
321
322MODULE_LICENSE("GPL");
323
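
For reference, a standalone user-space sketch (not part of the patch) of the FMR page-geometry math in rds_ib_add_one() above: page_size_cap is a bitmask of the page sizes the HCA supports, so ffs() picks the smallest supported size, which is clamped to at least 512 bytes (1 << 9), and the page size and mask follow from the resulting shift. The sample cap value and the main() driver are illustrative assumptions; only the shift/size/mask formulas come from the code above.

#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	uint32_t page_size_cap = 0x1000;	/* assume: HCA reports only 4K page support */
	int shift = ffs(page_size_cap) - 1;	/* lowest set bit = smallest supported page */
	int fmr_page_shift = shift > 9 ? shift : 9;	/* never go below 512-byte pages */
	uint64_t fmr_page_size = 1ULL << fmr_page_shift;
	uint64_t fmr_page_mask = ~(fmr_page_size - 1);

	printf("shift %d size %llu mask 0x%llx\n", fmr_page_shift,
	       (unsigned long long)fmr_page_size,
	       (unsigned long long)fmr_page_mask);

	/* a DMA address is page aligned for FMR purposes iff the low bits are clear */
	uint64_t dma_addr = 0x7f001200;
	printf("0x%llx is %saligned\n", (unsigned long long)dma_addr,
	       (dma_addr & ~fmr_page_mask) ? "not " : "");
	return 0;
}
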
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 000000000000..8be563a1363a
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,367 @@
1#ifndef _RDS_IB_H
2#define _RDS_IB_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7#include "rdma_transport.h"
8
9#define RDS_FMR_SIZE 256
10#define RDS_FMR_POOL_SIZE 4096
11
12#define RDS_IB_MAX_SGE 8
13#define RDS_IB_RECV_SGE 2
14
15#define RDS_IB_DEFAULT_RECV_WR 1024
16#define RDS_IB_DEFAULT_SEND_WR 256
17
18#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
19
20extern struct list_head rds_ib_devices;
21
22/*
23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
24 * try and minimize the amount of memory tied up in both the device and
25 * socket receive queues.
26 */
27/* page offset of the final full frag that fits in the page */
28#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
29struct rds_page_frag {
30 struct list_head f_item;
31 struct page *f_page;
32 unsigned long f_offset;
33 dma_addr_t f_mapped;
34};
35
36struct rds_ib_incoming {
37 struct list_head ii_frags;
38 struct rds_incoming ii_inc;
39};
40
41struct rds_ib_connect_private {
42 /* Add new fields at the end, and don't permute existing fields. */
43 __be32 dp_saddr;
44 __be32 dp_daddr;
45 u8 dp_protocol_major;
46 u8 dp_protocol_minor;
47 __be16 dp_protocol_minor_mask; /* bitmask */
48 __be32 dp_reserved1;
49 __be64 dp_ack_seq;
50 __be32 dp_credit; /* non-zero enables flow ctl */
51};
52
53struct rds_ib_send_work {
54 struct rds_message *s_rm;
55 struct rds_rdma_op *s_op;
56 struct ib_send_wr s_wr;
57 struct ib_sge s_sge[RDS_IB_MAX_SGE];
58 unsigned long s_queued;
59};
60
61struct rds_ib_recv_work {
62 struct rds_ib_incoming *r_ibinc;
63 struct rds_page_frag *r_frag;
64 struct ib_recv_wr r_wr;
65 struct ib_sge r_sge[2];
66};
67
68struct rds_ib_work_ring {
69 u32 w_nr;
70 u32 w_alloc_ptr;
71 u32 w_alloc_ctr;
72 u32 w_free_ptr;
73 atomic_t w_free_ctr;
74};
75
76struct rds_ib_device;
77
78struct rds_ib_connection {
79
80 struct list_head ib_node;
81 struct rds_ib_device *rds_ibdev;
82 struct rds_connection *conn;
83
84 /* alphabet soup, IBTA style */
85 struct rdma_cm_id *i_cm_id;
86 struct ib_pd *i_pd;
87 struct ib_mr *i_mr;
88 struct ib_cq *i_send_cq;
89 struct ib_cq *i_recv_cq;
90
91 /* tx */
92 struct rds_ib_work_ring i_send_ring;
93 struct rds_message *i_rm;
94 struct rds_header *i_send_hdrs;
95 u64 i_send_hdrs_dma;
96 struct rds_ib_send_work *i_sends;
97
98 /* rx */
99 struct mutex i_recv_mutex;
100 struct rds_ib_work_ring i_recv_ring;
101 struct rds_ib_incoming *i_ibinc;
102 u32 i_recv_data_rem;
103 struct rds_header *i_recv_hdrs;
104 u64 i_recv_hdrs_dma;
105 struct rds_ib_recv_work *i_recvs;
106 struct rds_page_frag i_frag;
107 u64 i_ack_recv; /* last ACK received */
108
109 /* sending acks */
110 unsigned long i_ack_flags;
111 u64 i_ack_next; /* next ACK to send */
112 struct rds_header *i_ack;
113 struct ib_send_wr i_ack_wr;
114 struct ib_sge i_ack_sge;
115 u64 i_ack_dma;
116 unsigned long i_ack_queued;
117
118 /* Flow control related information
119 *
120	 * Our algorithm uses a pair of variables that we need to access
121	 * atomically - one for the send credits, and one for the posted
122	 * recv credits we need to transfer to the remote.
123 * Rather than protect them using a slow spinlock, we put both into
124 * a single atomic_t and update it using cmpxchg
125 */
126 atomic_t i_credits;
127
128 /* Protocol version specific information */
129 unsigned int i_flowctl:1; /* enable/disable flow ctl */
130
131 /* Batched completions */
132 unsigned int i_unsignaled_wrs;
133 long i_unsignaled_bytes;
134};
135
136/* This assumes that atomic_t is at least 32 bits */
137#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
138#define IB_GET_POST_CREDITS(v) ((v) >> 16)
139#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
140#define IB_SET_POST_CREDITS(v) ((v) << 16)
141
142struct rds_ib_ipaddr {
143 struct list_head list;
144 __be32 ipaddr;
145};
146
147struct rds_ib_device {
148 struct list_head list;
149 struct list_head ipaddr_list;
150 struct list_head conn_list;
151 struct ib_device *dev;
152 struct ib_pd *pd;
153 struct ib_mr *mr;
154 struct rds_ib_mr_pool *mr_pool;
155 int fmr_page_shift;
156 int fmr_page_size;
157 u64 fmr_page_mask;
158 unsigned int fmr_max_remaps;
159 unsigned int max_fmrs;
160 int max_sge;
161 unsigned int max_wrs;
162 spinlock_t spinlock; /* protect the above */
163};
164
165/* bits for i_ack_flags */
166#define IB_ACK_IN_FLIGHT 0
167#define IB_ACK_REQUESTED 1
168
169/* Magic WR_ID for ACKs */
170#define RDS_IB_ACK_WR_ID (~(u64) 0)
171
172struct rds_ib_statistics {
173 uint64_t s_ib_connect_raced;
174 uint64_t s_ib_listen_closed_stale;
175 uint64_t s_ib_tx_cq_call;
176 uint64_t s_ib_tx_cq_event;
177 uint64_t s_ib_tx_ring_full;
178 uint64_t s_ib_tx_throttle;
179 uint64_t s_ib_tx_sg_mapping_failure;
180 uint64_t s_ib_tx_stalled;
181 uint64_t s_ib_tx_credit_updates;
182 uint64_t s_ib_rx_cq_call;
183 uint64_t s_ib_rx_cq_event;
184 uint64_t s_ib_rx_ring_empty;
185 uint64_t s_ib_rx_refill_from_cq;
186 uint64_t s_ib_rx_refill_from_thread;
187 uint64_t s_ib_rx_alloc_limit;
188 uint64_t s_ib_rx_credit_updates;
189 uint64_t s_ib_ack_sent;
190 uint64_t s_ib_ack_send_failure;
191 uint64_t s_ib_ack_send_delayed;
192 uint64_t s_ib_ack_send_piggybacked;
193 uint64_t s_ib_ack_received;
194 uint64_t s_ib_rdma_mr_alloc;
195 uint64_t s_ib_rdma_mr_free;
196 uint64_t s_ib_rdma_mr_used;
197 uint64_t s_ib_rdma_mr_pool_flush;
198 uint64_t s_ib_rdma_mr_pool_wait;
199 uint64_t s_ib_rdma_mr_pool_depleted;
200};
201
202extern struct workqueue_struct *rds_ib_wq;
203
204/*
205 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
206 * doesn't define it.
207 */
208static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
209 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
210{
211 unsigned int i;
212
213 for (i = 0; i < sg_dma_len; ++i) {
214 ib_dma_sync_single_for_cpu(dev,
215 ib_sg_dma_address(dev, &sg[i]),
216 ib_sg_dma_len(dev, &sg[i]),
217 direction);
218 }
219}
220#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
221
222static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
223 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
224{
225 unsigned int i;
226
227 for (i = 0; i < sg_dma_len; ++i) {
228 ib_dma_sync_single_for_device(dev,
229 ib_sg_dma_address(dev, &sg[i]),
230 ib_sg_dma_len(dev, &sg[i]),
231 direction);
232 }
233}
234#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
235
236
237/* ib.c */
238extern struct rds_transport rds_ib_transport;
239extern void rds_ib_add_one(struct ib_device *device);
240extern void rds_ib_remove_one(struct ib_device *device);
241extern struct ib_client rds_ib_client;
242
243extern unsigned int fmr_pool_size;
244extern unsigned int fmr_message_size;
245
246extern spinlock_t ib_nodev_conns_lock;
247extern struct list_head ib_nodev_conns;
248
249/* ib_cm.c */
250int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
251void rds_ib_conn_free(void *arg);
252int rds_ib_conn_connect(struct rds_connection *conn);
253void rds_ib_conn_shutdown(struct rds_connection *conn);
254void rds_ib_state_change(struct sock *sk);
255int __init rds_ib_listen_init(void);
256void rds_ib_listen_stop(void);
257void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
258int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
259 struct rdma_cm_event *event);
260int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
261void rds_ib_cm_connect_complete(struct rds_connection *conn,
262 struct rdma_cm_event *event);
263
264
265#define rds_ib_conn_error(conn, fmt...) \
266 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
267
268/* ib_rdma.c */
269int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
270int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
271void rds_ib_remove_nodev_conns(void);
272void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev);
273struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
274void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
275void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
276void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
277 struct rds_sock *rs, u32 *key_ret);
278void rds_ib_sync_mr(void *trans_private, int dir);
279void rds_ib_free_mr(void *trans_private, int invalidate);
280void rds_ib_flush_mrs(void);
281
282/* ib_recv.c */
283int __init rds_ib_recv_init(void);
284void rds_ib_recv_exit(void);
285int rds_ib_recv(struct rds_connection *conn);
286int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
287 gfp_t page_gfp, int prefill);
288void rds_ib_inc_purge(struct rds_incoming *inc);
289void rds_ib_inc_free(struct rds_incoming *inc);
290int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
291 size_t size);
292void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
293void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
294void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
295void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
296void rds_ib_attempt_ack(struct rds_ib_connection *ic);
297void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
298u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
299
300/* ib_ring.c */
301void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
302void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
303u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
304void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
305void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
306int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
307int rds_ib_ring_low(struct rds_ib_work_ring *ring);
308u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
309u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
310extern wait_queue_head_t rds_ib_ring_empty_wait;
311
312/* ib_send.c */
313void rds_ib_xmit_complete(struct rds_connection *conn);
314int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
315 unsigned int hdr_off, unsigned int sg, unsigned int off);
316void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
317void rds_ib_send_init_ring(struct rds_ib_connection *ic);
318void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
319int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
320void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
321void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
322int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
323 u32 *adv_credits, int need_posted);
324
325/* ib_stats.c */
326DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
327#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
328unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
329 unsigned int avail);
330
331/* ib_sysctl.c */
332int __init rds_ib_sysctl_init(void);
333void rds_ib_sysctl_exit(void);
334extern unsigned long rds_ib_sysctl_max_send_wr;
335extern unsigned long rds_ib_sysctl_max_recv_wr;
336extern unsigned long rds_ib_sysctl_max_unsig_wrs;
337extern unsigned long rds_ib_sysctl_max_unsig_bytes;
338extern unsigned long rds_ib_sysctl_max_recv_allocation;
339extern unsigned int rds_ib_sysctl_flow_control;
340extern ctl_table rds_ib_sysctl_table[];
341
342/*
343 * Helper functions for getting/setting the header and data SGEs in
344 * RDS packets (not RDMA)
345 */
346static inline struct ib_sge *
347rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
348{
349 return &sge[0];
350}
351
352static inline struct ib_sge *
353rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
354{
355 return &sge[1];
356}
357
358static inline void rds_ib_set_64bit(u64 *ptr, u64 val)
359{
360#if BITS_PER_LONG == 64
361 *ptr = val;
362#else
363 set_64bit(ptr, val);
364#endif
365}
366
367#endif
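
For reference, a standalone sketch (not part of the patch) of the credit encoding declared in ib.h above: send credits sit in the low 16 bits, posted-receive credits in the high 16 bits, and both are updated together with a single compare-and-swap, as described in the comment on i_credits. The GCC __sync builtin stands in for the kernel's atomic cmpxchg, and add_credits() plus the sample values are illustrative assumptions.

#include <stdio.h>

#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
#define IB_GET_POST_CREDITS(v)	((v) >> 16)
#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
#define IB_SET_POST_CREDITS(v)	((v) << 16)

static unsigned int i_credits;	/* stands in for ic->i_credits */

/* grant more send credits without losing a concurrent update to either half */
static void add_credits(unsigned int credits)
{
	unsigned int oldval, newval;

	do {
		oldval = i_credits;
		newval = oldval + IB_SET_SEND_CREDITS(credits);
	} while (__sync_val_compare_and_swap(&i_credits, oldval, newval) != oldval);
}

int main(void)
{
	i_credits = IB_SET_SEND_CREDITS(16) | IB_SET_POST_CREDITS(128);
	add_credits(8);

	printf("send %u post %u\n",
	       IB_GET_SEND_CREDITS(i_credits),
	       IB_GET_POST_CREDITS(i_credits));	/* prints: send 24 post 128 */
	return 0;
}
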
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 000000000000..0532237bd128
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,726 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/vmalloc.h>
36
37#include "rds.h"
38#include "ib.h"
39
40/*
41 * Set the selected protocol version
42 */
43static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
44{
45 conn->c_version = version;
46}
47
48/*
49 * Set up flow control
50 */
51static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
52{
53 struct rds_ib_connection *ic = conn->c_transport_data;
54
55 if (rds_ib_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */
57 ic->i_flowctl = 1;
58 rds_ib_send_add_credits(conn, credits);
59 } else {
60 ic->i_flowctl = 0;
61 }
62}
63
64/*
65 * Tune RNR behavior. Without flow control, we use a rather
66 * low timeout, but not the absolute minimum - this should
67 * be tunable.
68 *
69 * We already set the RNR retry count to 7 (which is the
70 * smallest infinite number :-) above.
71 * If flow control is off, we want to change this back to 0
72 * so that we learn quickly when our credit accounting is
73 * buggy.
74 *
75 * Caller passes in a qp_attr pointer - don't waste stack space
76 * by allocating this twice.
77 */
78static void
79rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
80{
81 int ret;
82
83 attr->min_rnr_timer = IB_RNR_TIMER_000_32;
84 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
85 if (ret)
86 printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
87}
88
89/*
90 * Connection established.
91 * We get here for both outgoing and incoming connections.
92 */
93void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
94{
95 const struct rds_ib_connect_private *dp = NULL;
96 struct rds_ib_connection *ic = conn->c_transport_data;
97 struct rds_ib_device *rds_ibdev;
98 struct ib_qp_attr qp_attr;
99 int err;
100
101 if (event->param.conn.private_data_len) {
102 dp = event->param.conn.private_data;
103
104 rds_ib_set_protocol(conn,
105 RDS_PROTOCOL(dp->dp_protocol_major,
106 dp->dp_protocol_minor));
107 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
108 }
109
110 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
111 &conn->c_laddr,
112 RDS_PROTOCOL_MAJOR(conn->c_version),
113 RDS_PROTOCOL_MINOR(conn->c_version),
114 ic->i_flowctl ? ", flow control" : "");
115
116 /* Tune RNR behavior */
117 rds_ib_tune_rnr(ic, &qp_attr);
118
119 qp_attr.qp_state = IB_QPS_RTS;
120 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
121 if (err)
122 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
123
124 /* update ib_device with this local ipaddr & conn */
125 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
126 err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
127 if (err)
128 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
129 err = rds_ib_add_conn(rds_ibdev, conn);
130 if (err)
131 printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err);
132
133 /* If the peer gave us the last packet it saw, process this as if
134 * we had received a regular ACK. */
135 if (dp && dp->dp_ack_seq)
136 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
137
138 rds_connect_complete(conn);
139}
140
141static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
142 struct rdma_conn_param *conn_param,
143 struct rds_ib_connect_private *dp,
144 u32 protocol_version)
145{
146 memset(conn_param, 0, sizeof(struct rdma_conn_param));
147 /* XXX tune these? */
148 conn_param->responder_resources = 1;
149 conn_param->initiator_depth = 1;
150 conn_param->retry_count = 7;
151 conn_param->rnr_retry_count = 7;
152
153 if (dp) {
154 struct rds_ib_connection *ic = conn->c_transport_data;
155
156 memset(dp, 0, sizeof(*dp));
157 dp->dp_saddr = conn->c_laddr;
158 dp->dp_daddr = conn->c_faddr;
159 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
160 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
161 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
162 dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
163
164 /* Advertise flow control */
165 if (ic->i_flowctl) {
166 unsigned int credits;
167
168 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
169 dp->dp_credit = cpu_to_be32(credits);
170 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
171 }
172
173 conn_param->private_data = dp;
174 conn_param->private_data_len = sizeof(*dp);
175 }
176}
177
178static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
179{
180 rdsdebug("event %u data %p\n", event->event, data);
181}
182
183static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
184{
185 struct rds_connection *conn = data;
186 struct rds_ib_connection *ic = conn->c_transport_data;
187
188 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
189
190 switch (event->event) {
191 case IB_EVENT_COMM_EST:
192 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
193 break;
194 default:
195 printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
196 "on connection to %pI4\n", event->event,
197 &conn->c_faddr);
198 break;
199 }
200}
201
202/*
203 * This needs to be very careful to not leave IS_ERR pointers around for
204 * cleanup to trip over.
205 */
206static int rds_ib_setup_qp(struct rds_connection *conn)
207{
208 struct rds_ib_connection *ic = conn->c_transport_data;
209 struct ib_device *dev = ic->i_cm_id->device;
210 struct ib_qp_init_attr attr;
211 struct rds_ib_device *rds_ibdev;
212 int ret;
213
214 /* rds_ib_add_one creates a rds_ib_device object per IB device,
215 * and allocates a protection domain, memory range and FMR pool
216 * for each. If that fails for any reason, it will not register
217 * the rds_ibdev at all.
218 */
219 rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
220 if (rds_ibdev == NULL) {
221 if (printk_ratelimit())
222 printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
223 dev->name);
224 return -EOPNOTSUPP;
225 }
226
227 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
228 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
229 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
230 rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
231
232 /* Protection domain and memory range */
233 ic->i_pd = rds_ibdev->pd;
234 ic->i_mr = rds_ibdev->mr;
235
236 ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
237 rds_ib_cq_event_handler, conn,
238 ic->i_send_ring.w_nr + 1, 0);
239 if (IS_ERR(ic->i_send_cq)) {
240 ret = PTR_ERR(ic->i_send_cq);
241 ic->i_send_cq = NULL;
242 rdsdebug("ib_create_cq send failed: %d\n", ret);
243 goto out;
244 }
245
246 ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
247 rds_ib_cq_event_handler, conn,
248 ic->i_recv_ring.w_nr, 0);
249 if (IS_ERR(ic->i_recv_cq)) {
250 ret = PTR_ERR(ic->i_recv_cq);
251 ic->i_recv_cq = NULL;
252 rdsdebug("ib_create_cq recv failed: %d\n", ret);
253 goto out;
254 }
255
256 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
257 if (ret) {
258 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
259 goto out;
260 }
261
262 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
263 if (ret) {
264 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
265 goto out;
266 }
267
268 /* XXX negotiate max send/recv with remote? */
269 memset(&attr, 0, sizeof(attr));
270 attr.event_handler = rds_ib_qp_event_handler;
271 attr.qp_context = conn;
272 /* + 1 to allow for the single ack message */
273 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
274 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
275 attr.cap.max_send_sge = rds_ibdev->max_sge;
276 attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
277 attr.sq_sig_type = IB_SIGNAL_REQ_WR;
278 attr.qp_type = IB_QPT_RC;
279 attr.send_cq = ic->i_send_cq;
280 attr.recv_cq = ic->i_recv_cq;
281
282 /*
283 * XXX this can fail if max_*_wr is too large? Are we supposed
284 * to back off until we get a value that the hardware can support?
285 */
286 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
287 if (ret) {
288 rdsdebug("rdma_create_qp failed: %d\n", ret);
289 goto out;
290 }
291
292 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
293 ic->i_send_ring.w_nr *
294 sizeof(struct rds_header),
295 &ic->i_send_hdrs_dma, GFP_KERNEL);
296 if (ic->i_send_hdrs == NULL) {
297 ret = -ENOMEM;
298 rdsdebug("ib_dma_alloc_coherent send failed\n");
299 goto out;
300 }
301
302 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
303 ic->i_recv_ring.w_nr *
304 sizeof(struct rds_header),
305 &ic->i_recv_hdrs_dma, GFP_KERNEL);
306 if (ic->i_recv_hdrs == NULL) {
307 ret = -ENOMEM;
308 rdsdebug("ib_dma_alloc_coherent recv failed\n");
309 goto out;
310 }
311
312 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
313 &ic->i_ack_dma, GFP_KERNEL);
314 if (ic->i_ack == NULL) {
315 ret = -ENOMEM;
316 rdsdebug("ib_dma_alloc_coherent ack failed\n");
317 goto out;
318 }
319
320 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
321 if (ic->i_sends == NULL) {
322 ret = -ENOMEM;
323 rdsdebug("send allocation failed\n");
324 goto out;
325 }
326 rds_ib_send_init_ring(ic);
327
328 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
329 if (ic->i_recvs == NULL) {
330 ret = -ENOMEM;
331 rdsdebug("recv allocation failed\n");
332 goto out;
333 }
334
335 rds_ib_recv_init_ring(ic);
336 rds_ib_recv_init_ack(ic);
337
338 /* Post receive buffers - as a side effect, this will update
339 * the posted credit count. */
340 rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
341
342 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
343 ic->i_send_cq, ic->i_recv_cq);
344
345out:
346 return ret;
347}
348
349static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
350{
351 u16 common;
352 u32 version = 0;
353
354 /* rdma_cm private data is odd - when there is any private data in the
355 * request, we will be given a pretty large buffer without being told the
356 * original size. The only way to tell the difference is by looking at
357 * the contents, which are initialized to zero.
358 * If the protocol version fields aren't set, this is a connection attempt
359 * from an older version. This could be 3.0 or 2.0 - we can't tell.
360 * We really should have changed this for OFED 1.3 :-( */
361 if (dp->dp_protocol_major == 0)
362 return RDS_PROTOCOL_3_0;
363
364 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
365 if (dp->dp_protocol_major == 3 && common) {
366 version = RDS_PROTOCOL_3_0;
367 while ((common >>= 1) != 0)
368 version++;
369 } else if (printk_ratelimit()) {
370 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
371 "incompatible protocol version %u.%u\n",
372 &dp->dp_saddr,
373 dp->dp_protocol_major,
374 dp->dp_protocol_minor);
375 }
376 return version;
377}
378
379int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
380 struct rdma_cm_event *event)
381{
382 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
383 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
384 const struct rds_ib_connect_private *dp = event->param.conn.private_data;
385 struct rds_ib_connect_private dp_rep;
386 struct rds_connection *conn = NULL;
387 struct rds_ib_connection *ic = NULL;
388 struct rdma_conn_param conn_param;
389 u32 version;
390 int err, destroy = 1;
391
392 /* Check whether the remote protocol version matches ours. */
393 version = rds_ib_protocol_compatible(dp);
394 if (!version)
395 goto out;
396
397 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
398 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
399 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
400 (unsigned long long)be64_to_cpu(lguid),
401 (unsigned long long)be64_to_cpu(fguid));
402
403 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
404 GFP_KERNEL);
405 if (IS_ERR(conn)) {
406 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
407 conn = NULL;
408 goto out;
409 }
410
411 /*
412 * The connection request may occur while the
413	 * previous connection exists, e.g. in case of failover.
414 * But as connections may be initiated simultaneously
415 * by both hosts, we have a random backoff mechanism -
416 * see the comment above rds_queue_reconnect()
417 */
418 mutex_lock(&conn->c_cm_lock);
419 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
420 if (rds_conn_state(conn) == RDS_CONN_UP) {
421 rdsdebug("incoming connect while connecting\n");
422 rds_conn_drop(conn);
423 rds_ib_stats_inc(s_ib_listen_closed_stale);
424 } else
425 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
426 /* Wait and see - our connect may still be succeeding */
427 rds_ib_stats_inc(s_ib_connect_raced);
428 }
429 mutex_unlock(&conn->c_cm_lock);
430 goto out;
431 }
432
433 ic = conn->c_transport_data;
434
435 rds_ib_set_protocol(conn, version);
436 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
437
438 /* If the peer gave us the last packet it saw, process this as if
439 * we had received a regular ACK. */
440 if (dp->dp_ack_seq)
441 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
442
443 BUG_ON(cm_id->context);
444 BUG_ON(ic->i_cm_id);
445
446 ic->i_cm_id = cm_id;
447 cm_id->context = conn;
448
449 /* We got halfway through setting up the ib_connection, if we
450 * fail now, we have to take the long route out of this mess. */
451 destroy = 0;
452
453 err = rds_ib_setup_qp(conn);
454 if (err) {
455 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
456 goto out;
457 }
458
459 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
460
461 /* rdma_accept() calls rdma_reject() internally if it fails */
462 err = rdma_accept(cm_id, &conn_param);
463 mutex_unlock(&conn->c_cm_lock);
464 if (err) {
465 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
466 goto out;
467 }
468
469 return 0;
470
471out:
472 rdma_reject(cm_id, NULL, 0);
473 return destroy;
474}
475
476
477int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
478{
479 struct rds_connection *conn = cm_id->context;
480 struct rds_ib_connection *ic = conn->c_transport_data;
481 struct rdma_conn_param conn_param;
482 struct rds_ib_connect_private dp;
483 int ret;
484
485 /* If the peer doesn't do protocol negotiation, we must
486 * default to RDSv3.0 */
487 rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
488 ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
489
490 ret = rds_ib_setup_qp(conn);
491 if (ret) {
492 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
493 goto out;
494 }
495
496 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
497
498 ret = rdma_connect(cm_id, &conn_param);
499 if (ret)
500 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
501
502out:
503 /* Beware - returning non-zero tells the rdma_cm to destroy
504 * the cm_id. We should certainly not do it as long as we still
505 * "own" the cm_id. */
506 if (ret) {
507 if (ic->i_cm_id == cm_id)
508 ret = 0;
509 }
510 return ret;
511}
512
513int rds_ib_conn_connect(struct rds_connection *conn)
514{
515 struct rds_ib_connection *ic = conn->c_transport_data;
516 struct sockaddr_in src, dest;
517 int ret;
518
519	/* XXX I wonder what effect the port space has */
520 /* delegate cm event handler to rdma_transport */
521 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
522 RDMA_PS_TCP);
523 if (IS_ERR(ic->i_cm_id)) {
524 ret = PTR_ERR(ic->i_cm_id);
525 ic->i_cm_id = NULL;
526 rdsdebug("rdma_create_id() failed: %d\n", ret);
527 goto out;
528 }
529
530 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
531
532 src.sin_family = AF_INET;
533 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
534 src.sin_port = (__force u16)htons(0);
535
536 dest.sin_family = AF_INET;
537 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
538 dest.sin_port = (__force u16)htons(RDS_PORT);
539
540 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
541 (struct sockaddr *)&dest,
542 RDS_RDMA_RESOLVE_TIMEOUT_MS);
543 if (ret) {
544 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
545 ret);
546 rdma_destroy_id(ic->i_cm_id);
547 ic->i_cm_id = NULL;
548 }
549
550out:
551 return ret;
552}
553
554/*
555 * This is so careful about only cleaning up resources that were built up
556 * so that it can be called at any point during startup. In fact it
557 * can be called multiple times for a given connection.
558 */
559void rds_ib_conn_shutdown(struct rds_connection *conn)
560{
561 struct rds_ib_connection *ic = conn->c_transport_data;
562 int err = 0;
563
564 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
565 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
566 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
567
568 if (ic->i_cm_id) {
569 struct ib_device *dev = ic->i_cm_id->device;
570
571 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
572 err = rdma_disconnect(ic->i_cm_id);
573 if (err) {
574 /* Actually this may happen quite frequently, when
575 * an outgoing connect raced with an incoming connect.
576 */
577 rdsdebug("failed to disconnect, cm: %p err %d\n",
578 ic->i_cm_id, err);
579 }
580
581 wait_event(rds_ib_ring_empty_wait,
582 rds_ib_ring_empty(&ic->i_send_ring) &&
583 rds_ib_ring_empty(&ic->i_recv_ring));
584
585 if (ic->i_send_hdrs)
586 ib_dma_free_coherent(dev,
587 ic->i_send_ring.w_nr *
588 sizeof(struct rds_header),
589 ic->i_send_hdrs,
590 ic->i_send_hdrs_dma);
591
592 if (ic->i_recv_hdrs)
593 ib_dma_free_coherent(dev,
594 ic->i_recv_ring.w_nr *
595 sizeof(struct rds_header),
596 ic->i_recv_hdrs,
597 ic->i_recv_hdrs_dma);
598
599 if (ic->i_ack)
600 ib_dma_free_coherent(dev, sizeof(struct rds_header),
601 ic->i_ack, ic->i_ack_dma);
602
603 if (ic->i_sends)
604 rds_ib_send_clear_ring(ic);
605 if (ic->i_recvs)
606 rds_ib_recv_clear_ring(ic);
607
608 if (ic->i_cm_id->qp)
609 rdma_destroy_qp(ic->i_cm_id);
610 if (ic->i_send_cq)
611 ib_destroy_cq(ic->i_send_cq);
612 if (ic->i_recv_cq)
613 ib_destroy_cq(ic->i_recv_cq);
614 rdma_destroy_id(ic->i_cm_id);
615
616 /*
617 * Move connection back to the nodev list.
618 */
619 if (ic->rds_ibdev) {
620
621 spin_lock_irq(&ic->rds_ibdev->spinlock);
622 BUG_ON(list_empty(&ic->ib_node));
623 list_del(&ic->ib_node);
624 spin_unlock_irq(&ic->rds_ibdev->spinlock);
625
626 spin_lock_irq(&ib_nodev_conns_lock);
627 list_add_tail(&ic->ib_node, &ib_nodev_conns);
628 spin_unlock_irq(&ib_nodev_conns_lock);
629 ic->rds_ibdev = NULL;
630 }
631
632 ic->i_cm_id = NULL;
633 ic->i_pd = NULL;
634 ic->i_mr = NULL;
635 ic->i_send_cq = NULL;
636 ic->i_recv_cq = NULL;
637 ic->i_send_hdrs = NULL;
638 ic->i_recv_hdrs = NULL;
639 ic->i_ack = NULL;
640 }
641 BUG_ON(ic->rds_ibdev);
642
643 /* Clear pending transmit */
644 if (ic->i_rm) {
645 rds_message_put(ic->i_rm);
646 ic->i_rm = NULL;
647 }
648
649 /* Clear the ACK state */
650 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
651 rds_ib_set_64bit(&ic->i_ack_next, 0);
652 ic->i_ack_recv = 0;
653
654 /* Clear flow control state */
655 ic->i_flowctl = 0;
656 atomic_set(&ic->i_credits, 0);
657
658 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
659 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
660
661 if (ic->i_ibinc) {
662 rds_inc_put(&ic->i_ibinc->ii_inc);
663 ic->i_ibinc = NULL;
664 }
665
666 vfree(ic->i_sends);
667 ic->i_sends = NULL;
668 vfree(ic->i_recvs);
669 ic->i_recvs = NULL;
670}
671
672int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
673{
674 struct rds_ib_connection *ic;
675 unsigned long flags;
676
677 /* XXX too lazy? */
678 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
679 if (ic == NULL)
680 return -ENOMEM;
681
682 INIT_LIST_HEAD(&ic->ib_node);
683 mutex_init(&ic->i_recv_mutex);
684
685 /*
686 * rds_ib_conn_shutdown() waits for these to be emptied so they
687 * must be initialized before it can be called.
688 */
689 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
690 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
691
692 ic->conn = conn;
693 conn->c_transport_data = ic;
694
695 spin_lock_irqsave(&ib_nodev_conns_lock, flags);
696 list_add_tail(&ic->ib_node, &ib_nodev_conns);
697 spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
698
699
700 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
701 return 0;
702}
703
704void rds_ib_conn_free(void *arg)
705{
706 struct rds_ib_connection *ic = arg;
707 rdsdebug("ic %p\n", ic);
708 list_del(&ic->ib_node);
709 kfree(ic);
710}
711
712
713/*
714 * An error occurred on the connection
715 */
716void
717__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
718{
719 va_list ap;
720
721 rds_conn_drop(conn);
722
723 va_start(ap, fmt);
724 vprintk(fmt, ap);
725 va_end(ap);
726}
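
For reference, a standalone sketch (not part of the patch) of the minor-version negotiation in rds_ib_protocol_compatible() above: the peer advertises a bitmask of the minor versions it speaks, that mask is ANDed with RDS_IB_SUPPORTED_PROTOCOLS, and the position of the highest common bit becomes the negotiated minor. negotiate_minor() and the sample masks are illustrative assumptions; the shift loop mirrors the one in the code above.

#include <stdio.h>
#include <stdint.h>

#define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* we speak minors 0 and 1 */

static unsigned int negotiate_minor(uint16_t peer_minor_mask)
{
	uint16_t common = peer_minor_mask & RDS_IB_SUPPORTED_PROTOCOLS;
	unsigned int minor = 0;

	/* count up to the highest bit set in the common mask */
	while ((common >>= 1) != 0)
		minor++;
	return minor;
}

int main(void)
{
	/* peer claims minors 0, 1 and 2; we only speak 0 and 1 -> pick 1 */
	printf("negotiated 3.%u\n", negotiate_minor(0x0007));
	/* peer only speaks minor 0 -> pick 0 */
	printf("negotiated 3.%u\n", negotiate_minor(0x0001));
	return 0;
}
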
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 000000000000..69a6289ed672
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,641 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37#include "ib.h"
38
39
40/*
41 * This is stored as mr->r_trans_private.
42 */
43struct rds_ib_mr {
44 struct rds_ib_device *device;
45 struct rds_ib_mr_pool *pool;
46 struct ib_fmr *fmr;
47 struct list_head list;
48 unsigned int remap_count;
49
50 struct scatterlist *sg;
51 unsigned int sg_len;
52 u64 *dma;
53 int sg_dma_len;
54};
55
56/*
57 * Our own little FMR pool
58 */
59struct rds_ib_mr_pool {
60 struct mutex flush_lock; /* serialize fmr invalidate */
61 struct work_struct flush_worker; /* flush worker */
62
63 spinlock_t list_lock; /* protect variables below */
64 atomic_t item_count; /* total # of MRs */
65	atomic_t		dirty_count;		/* # of dirty MRs */
66 struct list_head drop_list; /* MRs that have reached their max_maps limit */
67 struct list_head free_list; /* unused MRs */
68	struct list_head	clean_list;		/* unused & unmapped MRs */
69 atomic_t free_pinned; /* memory pinned by free MRs */
70 unsigned long max_items;
71 unsigned long max_items_soft;
72 unsigned long max_free_pinned;
73 struct ib_fmr_attr fmr_attr;
74};
75
76static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
77static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
78static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
79
80static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
81{
82 struct rds_ib_device *rds_ibdev;
83 struct rds_ib_ipaddr *i_ipaddr;
84
85 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
86 spin_lock_irq(&rds_ibdev->spinlock);
87 list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
88 if (i_ipaddr->ipaddr == ipaddr) {
89 spin_unlock_irq(&rds_ibdev->spinlock);
90 return rds_ibdev;
91 }
92 }
93 spin_unlock_irq(&rds_ibdev->spinlock);
94 }
95
96 return NULL;
97}
98
99static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
100{
101 struct rds_ib_ipaddr *i_ipaddr;
102
103 i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
104 if (!i_ipaddr)
105 return -ENOMEM;
106
107 i_ipaddr->ipaddr = ipaddr;
108
109 spin_lock_irq(&rds_ibdev->spinlock);
110 list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
111 spin_unlock_irq(&rds_ibdev->spinlock);
112
113 return 0;
114}
115
116static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
117{
118 struct rds_ib_ipaddr *i_ipaddr, *next;
119
120 spin_lock_irq(&rds_ibdev->spinlock);
121 list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
122 if (i_ipaddr->ipaddr == ipaddr) {
123 list_del(&i_ipaddr->list);
124 kfree(i_ipaddr);
125 break;
126 }
127 }
128 spin_unlock_irq(&rds_ibdev->spinlock);
129}
130
131int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
132{
133 struct rds_ib_device *rds_ibdev_old;
134
135 rds_ibdev_old = rds_ib_get_device(ipaddr);
136 if (rds_ibdev_old)
137 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
138
139 return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
140}
141
142int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
143{
144 struct rds_ib_connection *ic = conn->c_transport_data;
145
146 /* conn was previously on the nodev_conns_list */
147 spin_lock_irq(&ib_nodev_conns_lock);
148 BUG_ON(list_empty(&ib_nodev_conns));
149 BUG_ON(list_empty(&ic->ib_node));
150 list_del(&ic->ib_node);
151 spin_unlock_irq(&ib_nodev_conns_lock);
152
153 spin_lock_irq(&rds_ibdev->spinlock);
154 list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
155 spin_unlock_irq(&rds_ibdev->spinlock);
156
157 ic->rds_ibdev = rds_ibdev;
158
159 return 0;
160}
161
162void rds_ib_remove_nodev_conns(void)
163{
164 struct rds_ib_connection *ic, *_ic;
165 LIST_HEAD(tmp_list);
166
167 /* avoid calling conn_destroy with irqs off */
168 spin_lock_irq(&ib_nodev_conns_lock);
169 list_splice(&ib_nodev_conns, &tmp_list);
170 INIT_LIST_HEAD(&ib_nodev_conns);
171 spin_unlock_irq(&ib_nodev_conns_lock);
172
173 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
174 if (ic->conn->c_passive)
175 rds_conn_destroy(ic->conn->c_passive);
176 rds_conn_destroy(ic->conn);
177 }
178}
179
180void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev)
181{
182 struct rds_ib_connection *ic, *_ic;
183 LIST_HEAD(tmp_list);
184
185 /* avoid calling conn_destroy with irqs off */
186 spin_lock_irq(&rds_ibdev->spinlock);
187 list_splice(&rds_ibdev->conn_list, &tmp_list);
188 INIT_LIST_HEAD(&rds_ibdev->conn_list);
189 spin_unlock_irq(&rds_ibdev->spinlock);
190
191 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
192 if (ic->conn->c_passive)
193 rds_conn_destroy(ic->conn->c_passive);
194 rds_conn_destroy(ic->conn);
195 }
196}
197
198struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
199{
200 struct rds_ib_mr_pool *pool;
201
202 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
203 if (!pool)
204 return ERR_PTR(-ENOMEM);
205
206 INIT_LIST_HEAD(&pool->free_list);
207 INIT_LIST_HEAD(&pool->drop_list);
208 INIT_LIST_HEAD(&pool->clean_list);
209 mutex_init(&pool->flush_lock);
210 spin_lock_init(&pool->list_lock);
211 INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
212
213 pool->fmr_attr.max_pages = fmr_message_size;
214 pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
215 pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
216 pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
217
218 /* We never allow more than max_items MRs to be allocated.
219	 * When we exceed max_items_soft, we start freeing
220 * items more aggressively.
221 * Make sure that max_items > max_items_soft > max_items / 2
222 */
223 pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
224 pool->max_items = rds_ibdev->max_fmrs;
225
226 return pool;
227}
228
229void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
230{
231 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
232
233 iinfo->rdma_mr_max = pool->max_items;
234 iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
235}
236
237void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
238{
239 flush_workqueue(rds_wq);
240 rds_ib_flush_mr_pool(pool, 1);
241 BUG_ON(atomic_read(&pool->item_count));
242 BUG_ON(atomic_read(&pool->free_pinned));
243 kfree(pool);
244}
245
246static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
247{
248 struct rds_ib_mr *ibmr = NULL;
249 unsigned long flags;
250
251 spin_lock_irqsave(&pool->list_lock, flags);
252 if (!list_empty(&pool->clean_list)) {
253 ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
254 list_del_init(&ibmr->list);
255 }
256 spin_unlock_irqrestore(&pool->list_lock, flags);
257
258 return ibmr;
259}
260
261static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
262{
263 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
264 struct rds_ib_mr *ibmr = NULL;
265 int err = 0, iter = 0;
266
267 while (1) {
268 ibmr = rds_ib_reuse_fmr(pool);
269 if (ibmr)
270 return ibmr;
271
272 /* No clean MRs - now we have the choice of either
273 * allocating a fresh MR up to the limit imposed by the
274		 * driver, or flushing any dirty unused MRs.
275 * We try to avoid stalling in the send path if possible,
276 * so we allocate as long as we're allowed to.
277 *
278 * We're fussy with enforcing the FMR limit, though. If the driver
279 * tells us we can't use more than N fmrs, we shouldn't start
280 * arguing with it */
281 if (atomic_inc_return(&pool->item_count) <= pool->max_items)
282 break;
283
284 atomic_dec(&pool->item_count);
285
286 if (++iter > 2) {
287 rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
288 return ERR_PTR(-EAGAIN);
289 }
290
291 /* We do have some empty MRs. Flush them out. */
292 rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
293 rds_ib_flush_mr_pool(pool, 0);
294 }
295
296 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
297 if (!ibmr) {
298 err = -ENOMEM;
299 goto out_no_cigar;
300 }
301
302 ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
303 (IB_ACCESS_LOCAL_WRITE |
304 IB_ACCESS_REMOTE_READ |
305 IB_ACCESS_REMOTE_WRITE),
306 &pool->fmr_attr);
307 if (IS_ERR(ibmr->fmr)) {
308 err = PTR_ERR(ibmr->fmr);
309 ibmr->fmr = NULL;
310 printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
311 goto out_no_cigar;
312 }
313
314 rds_ib_stats_inc(s_ib_rdma_mr_alloc);
315 return ibmr;
316
317out_no_cigar:
318 if (ibmr) {
319 if (ibmr->fmr)
320 ib_dealloc_fmr(ibmr->fmr);
321 kfree(ibmr);
322 }
323 atomic_dec(&pool->item_count);
324 return ERR_PTR(err);
325}
326
327static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
328 struct scatterlist *sg, unsigned int nents)
329{
330 struct ib_device *dev = rds_ibdev->dev;
331 struct scatterlist *scat = sg;
332 u64 io_addr = 0;
333 u64 *dma_pages;
334 u32 len;
335 int page_cnt, sg_dma_len;
336 int i, j;
337 int ret;
338
339 sg_dma_len = ib_dma_map_sg(dev, sg, nents,
340 DMA_BIDIRECTIONAL);
341 if (unlikely(!sg_dma_len)) {
342 printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
343 return -EBUSY;
344 }
345
346 len = 0;
347 page_cnt = 0;
348
349 for (i = 0; i < sg_dma_len; ++i) {
350 unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
351 u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
352
353 if (dma_addr & ~rds_ibdev->fmr_page_mask) {
354 if (i > 0)
355 return -EINVAL;
356 else
357 ++page_cnt;
358 }
359 if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
360 if (i < sg_dma_len - 1)
361 return -EINVAL;
362 else
363 ++page_cnt;
364 }
365
366 len += dma_len;
367 }
368
369 page_cnt += len >> rds_ibdev->fmr_page_shift;
370 if (page_cnt > fmr_message_size)
371 return -EINVAL;
372
373 dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
374 if (!dma_pages)
375 return -ENOMEM;
376
377 page_cnt = 0;
378 for (i = 0; i < sg_dma_len; ++i) {
379 unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
380 u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
381
382 for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
383 dma_pages[page_cnt++] =
384 (dma_addr & rds_ibdev->fmr_page_mask) + j;
385 }
386
387 ret = ib_map_phys_fmr(ibmr->fmr,
388 dma_pages, page_cnt, io_addr);
389 if (ret)
390 goto out;
391
392 /* Success - we successfully remapped the MR, so we can
393 * safely tear down the old mapping. */
394 rds_ib_teardown_mr(ibmr);
395
396 ibmr->sg = scat;
397 ibmr->sg_len = nents;
398 ibmr->sg_dma_len = sg_dma_len;
399 ibmr->remap_count++;
400
401 rds_ib_stats_inc(s_ib_rdma_mr_used);
402 ret = 0;
403
404out:
405 kfree(dma_pages);
406
407 return ret;
408}
409
410void rds_ib_sync_mr(void *trans_private, int direction)
411{
412 struct rds_ib_mr *ibmr = trans_private;
413 struct rds_ib_device *rds_ibdev = ibmr->device;
414
415 switch (direction) {
416 case DMA_FROM_DEVICE:
417 ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
418 ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
419 break;
420 case DMA_TO_DEVICE:
421 ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
422 ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
423 break;
424 }
425}
426
427static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
428{
429 struct rds_ib_device *rds_ibdev = ibmr->device;
430
431 if (ibmr->sg_dma_len) {
432 ib_dma_unmap_sg(rds_ibdev->dev,
433 ibmr->sg, ibmr->sg_len,
434 DMA_BIDIRECTIONAL);
435 ibmr->sg_dma_len = 0;
436 }
437
438 /* Release the s/g list */
439 if (ibmr->sg_len) {
440 unsigned int i;
441
442 for (i = 0; i < ibmr->sg_len; ++i) {
443 struct page *page = sg_page(&ibmr->sg[i]);
444
445 /* FIXME we need a way to tell a r/w MR
446 * from a r/o MR */
447 set_page_dirty(page);
448 put_page(page);
449 }
450 kfree(ibmr->sg);
451
452 ibmr->sg = NULL;
453 ibmr->sg_len = 0;
454 }
455}
456
457static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
458{
459 unsigned int pinned = ibmr->sg_len;
460
461 __rds_ib_teardown_mr(ibmr);
462 if (pinned) {
463 struct rds_ib_device *rds_ibdev = ibmr->device;
464 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
465
466 atomic_sub(pinned, &pool->free_pinned);
467 }
468}
469
470static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
471{
472 unsigned int item_count;
473
474 item_count = atomic_read(&pool->item_count);
475 if (free_all)
476 return item_count;
477
478 return 0;
479}
480
481/*
482 * Flush our pool of MRs.
483 * At a minimum, all currently unused MRs are unmapped.
484 * If the number of MRs allocated exceeds the limit, we also try
485 * to free as many MRs as needed to get back to this limit.
486 */
487static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
488{
489 struct rds_ib_mr *ibmr, *next;
490 LIST_HEAD(unmap_list);
491 LIST_HEAD(fmr_list);
492 unsigned long unpinned = 0;
493 unsigned long flags;
494 unsigned int nfreed = 0, ncleaned = 0, free_goal;
495 int ret = 0;
496
497 rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
498
499 mutex_lock(&pool->flush_lock);
500
501 spin_lock_irqsave(&pool->list_lock, flags);
502 /* Get the list of all MRs to be dropped. Ordering matters -
503 * we want to put drop_list ahead of free_list. */
504 list_splice_init(&pool->free_list, &unmap_list);
505 list_splice_init(&pool->drop_list, &unmap_list);
506 if (free_all)
507 list_splice_init(&pool->clean_list, &unmap_list);
508 spin_unlock_irqrestore(&pool->list_lock, flags);
509
510 free_goal = rds_ib_flush_goal(pool, free_all);
511
512 if (list_empty(&unmap_list))
513 goto out;
514
515 /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
516 list_for_each_entry(ibmr, &unmap_list, list)
517 list_add(&ibmr->fmr->list, &fmr_list);
518 ret = ib_unmap_fmr(&fmr_list);
519 if (ret)
520 printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
521
522 /* Now we can destroy the DMA mapping and unpin any pages */
523 list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
524 unpinned += ibmr->sg_len;
525 __rds_ib_teardown_mr(ibmr);
526 if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
527 rds_ib_stats_inc(s_ib_rdma_mr_free);
528 list_del(&ibmr->list);
529 ib_dealloc_fmr(ibmr->fmr);
530 kfree(ibmr);
531 nfreed++;
532 }
533 ncleaned++;
534 }
535
536 spin_lock_irqsave(&pool->list_lock, flags);
537 list_splice(&unmap_list, &pool->clean_list);
538 spin_unlock_irqrestore(&pool->list_lock, flags);
539
540 atomic_sub(unpinned, &pool->free_pinned);
541 atomic_sub(ncleaned, &pool->dirty_count);
542 atomic_sub(nfreed, &pool->item_count);
543
544out:
545 mutex_unlock(&pool->flush_lock);
546 return ret;
547}
548
549static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
550{
551 struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
552
553 rds_ib_flush_mr_pool(pool, 0);
554}
555
556void rds_ib_free_mr(void *trans_private, int invalidate)
557{
558 struct rds_ib_mr *ibmr = trans_private;
559 struct rds_ib_device *rds_ibdev = ibmr->device;
560 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
561 unsigned long flags;
562
563 rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
564
565 /* Return it to the pool's free list */
566 spin_lock_irqsave(&pool->list_lock, flags);
567 if (ibmr->remap_count >= pool->fmr_attr.max_maps)
568 list_add(&ibmr->list, &pool->drop_list);
569 else
570 list_add(&ibmr->list, &pool->free_list);
571
572 atomic_add(ibmr->sg_len, &pool->free_pinned);
573 atomic_inc(&pool->dirty_count);
574 spin_unlock_irqrestore(&pool->list_lock, flags);
575
576 /* If we've pinned too many pages, request a flush */
577 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
578 || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
579 queue_work(rds_wq, &pool->flush_worker);
580
581 if (invalidate) {
582 if (likely(!in_interrupt())) {
583 rds_ib_flush_mr_pool(pool, 0);
584 } else {
585			/* We get here if the user created an MR marked
586 * as use_once and invalidate at the same time. */
587 queue_work(rds_wq, &pool->flush_worker);
588 }
589 }
590}
591
592void rds_ib_flush_mrs(void)
593{
594 struct rds_ib_device *rds_ibdev;
595
596 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
597 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
598
599 if (pool)
600 rds_ib_flush_mr_pool(pool, 0);
601 }
602}
603
604void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
605 struct rds_sock *rs, u32 *key_ret)
606{
607 struct rds_ib_device *rds_ibdev;
608 struct rds_ib_mr *ibmr = NULL;
609 int ret;
610
611 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
612 if (!rds_ibdev) {
613 ret = -ENODEV;
614 goto out;
615 }
616
617 if (!rds_ibdev->mr_pool) {
618 ret = -ENODEV;
619 goto out;
620 }
621
622 ibmr = rds_ib_alloc_fmr(rds_ibdev);
623 if (IS_ERR(ibmr))
624 return ibmr;
625
626 ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
627 if (ret == 0)
628 *key_ret = ibmr->fmr->rkey;
629 else
630 printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
631
632 ibmr->device = rds_ibdev;
633
634 out:
635 if (ret) {
636 if (ibmr)
637 rds_ib_free_mr(ibmr, 0);
638 ibmr = ERR_PTR(ret);
639 }
640 return ibmr;
641}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 000000000000..5061b5502162
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,869 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/dma-mapping.h>
36#include <rdma/rdma_cm.h>
37
38#include "rds.h"
39#include "ib.h"
40
41static struct kmem_cache *rds_ib_incoming_slab;
42static struct kmem_cache *rds_ib_frag_slab;
43static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
44
45static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
46{
47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page);
49 frag->f_page = NULL;
50}
51
52static void rds_ib_frag_free(struct rds_page_frag *frag)
53{
54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_ib_frag_slab, frag);
57}
58
59/*
60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get receive completion events.
62 * Only the last frag in the page performs the unmapping.
63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags.
67 */
68static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
69 struct rds_ib_recv_work *recv)
70{
71 struct rds_page_frag *frag = recv->r_frag;
72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0;
79}
80
81void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
82{
83 struct rds_ib_recv_work *recv;
84 u32 i;
85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge;
88
89 recv->r_ibinc = NULL;
90 recv->r_frag = NULL;
91
92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IB_RECV_SGE;
96
97 sge = rds_ib_data_sge(ic, recv->r_sge);
98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = ic->i_mr->lkey;
101
102 sge = rds_ib_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header);
105 sge->lkey = ic->i_mr->lkey;
106 }
107}
108
109static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
110 struct rds_ib_recv_work *recv)
111{
112 if (recv->r_ibinc) {
113 rds_inc_put(&recv->r_ibinc->ii_inc);
114 recv->r_ibinc = NULL;
115 }
116 if (recv->r_frag) {
117 rds_ib_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page)
119 rds_ib_frag_drop_page(recv->r_frag);
120 rds_ib_frag_free(recv->r_frag);
121 recv->r_frag = NULL;
122 }
123}
124
125void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
126{
127 u32 i;
128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
131
132 if (ic->i_frag.f_page)
133 rds_ib_frag_drop_page(&ic->i_frag);
134}
135
136static int rds_ib_recv_refill_one(struct rds_connection *conn,
137 struct rds_ib_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp)
139{
140 struct rds_ib_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr;
142 struct ib_sge *sge;
143 int ret = -ENOMEM;
144
145 if (recv->r_ibinc == NULL) {
146 if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
147 rds_ib_stats_inc(s_ib_rx_alloc_limit);
148 goto out;
149 }
150 recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
151 kptr_gfp);
152 if (recv->r_ibinc == NULL)
153 goto out;
154 atomic_inc(&rds_ib_allocation);
155 INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
156 rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
157 }
158
159 if (recv->r_frag == NULL) {
160 recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
161 if (recv->r_frag == NULL)
162 goto out;
163 INIT_LIST_HEAD(&recv->r_frag->f_item);
164 recv->r_frag->f_page = NULL;
165 }
166
167 if (ic->i_frag.f_page == NULL) {
168 ic->i_frag.f_page = alloc_page(page_gfp);
169 if (ic->i_frag.f_page == NULL)
170 goto out;
171 ic->i_frag.f_offset = 0;
172 }
173
174 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
175 ic->i_frag.f_page,
176 ic->i_frag.f_offset,
177 RDS_FRAG_SIZE,
178 DMA_FROM_DEVICE);
179 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
180 goto out;
181
182 /*
183 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
184 * must be called on this recv. This happens as completions hit
185 * in order or on connection shutdown.
186 */
187 recv->r_frag->f_page = ic->i_frag.f_page;
188 recv->r_frag->f_offset = ic->i_frag.f_offset;
189 recv->r_frag->f_mapped = dma_addr;
190
191 sge = rds_ib_data_sge(ic, recv->r_sge);
192 sge->addr = dma_addr;
193 sge->length = RDS_FRAG_SIZE;
194
195 sge = rds_ib_header_sge(ic, recv->r_sge);
196 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
197 sge->length = sizeof(struct rds_header);
198
199 get_page(recv->r_frag->f_page);
200
201 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
202 ic->i_frag.f_offset += RDS_FRAG_SIZE;
203 } else {
204 put_page(ic->i_frag.f_page);
205 ic->i_frag.f_page = NULL;
206 ic->i_frag.f_offset = 0;
207 }
208
209 ret = 0;
210out:
211 return ret;
212}
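/*
 * A minimal walk-through of the rolling ic->i_frag above (RDS_FRAG_SIZE is
 * assumed to be 4K and PAGE_SIZE 16K here purely for illustration, giving
 * RDS_PAGE_LAST_OFF of 12K; on 4K-page systems every page holds exactly one
 * fragment and the staging reference is dropped on each refill):
 *
 *	refill #1: f_offset  0K -> map [ 0K,  4K), get_page(), offset ->  4K
 *	refill #2: f_offset  4K -> map [ 4K,  8K), get_page(), offset ->  8K
 *	refill #3: f_offset  8K -> map [ 8K, 12K), get_page(), offset -> 12K
 *	refill #4: f_offset 12K -> map [12K, 16K), get_page(), then put_page()
 *	           drops the connection's staging reference and clears
 *	           i_frag.f_page so the next refill allocates a fresh page.
 *
 * Each posted fragment therefore holds its own page reference until the
 * frag is freed, independent of the connection's staging reference.
 */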
213
214/*
215 * This tries to allocate and post unused work requests after making sure that
216 * they have all the allocations they need to queue received fragments into
217 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
218 * pairs don't go unmatched.
219 *
220 * -1 is returned if posting fails due to temporary resource exhaustion.
221 */
222int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
223 gfp_t page_gfp, int prefill)
224{
225 struct rds_ib_connection *ic = conn->c_transport_data;
226 struct rds_ib_recv_work *recv;
227 struct ib_recv_wr *failed_wr;
228 unsigned int posted = 0;
229 int ret = 0;
230 u32 pos;
231
232 while ((prefill || rds_conn_up(conn))
233 && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
234 if (pos >= ic->i_recv_ring.w_nr) {
235 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
236 pos);
237 ret = -EINVAL;
238 break;
239 }
240
241 recv = &ic->i_recvs[pos];
242 ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
243 if (ret) {
244 ret = -1;
245 break;
246 }
247
248 /* XXX when can this fail? */
249 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
250 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
251 recv->r_ibinc, recv->r_frag->f_page,
252 (long) recv->r_frag->f_mapped, ret);
253 if (ret) {
254 rds_ib_conn_error(conn, "recv post on "
255 "%pI4 returned %d, disconnecting and "
256 "reconnecting\n", &conn->c_faddr,
257 ret);
258 ret = -1;
259 break;
260 }
261
262 posted++;
263 }
264
265 /* We're doing flow control - update the window. */
266 if (ic->i_flowctl && posted)
267 rds_ib_advertise_credits(conn, posted);
268
269 if (ret)
270 rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
271 return ret;
272}
273
274void rds_ib_inc_purge(struct rds_incoming *inc)
275{
276 struct rds_ib_incoming *ibinc;
277 struct rds_page_frag *frag;
278 struct rds_page_frag *pos;
279
280 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
281 rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
282
283 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
284 list_del_init(&frag->f_item);
285 rds_ib_frag_drop_page(frag);
286 rds_ib_frag_free(frag);
287 }
288}
289
290void rds_ib_inc_free(struct rds_incoming *inc)
291{
292 struct rds_ib_incoming *ibinc;
293
294 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
295
296 rds_ib_inc_purge(inc);
297 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
298 BUG_ON(!list_empty(&ibinc->ii_frags));
299 kmem_cache_free(rds_ib_incoming_slab, ibinc);
300 atomic_dec(&rds_ib_allocation);
301 BUG_ON(atomic_read(&rds_ib_allocation) < 0);
302}
303
304int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
305 size_t size)
306{
307 struct rds_ib_incoming *ibinc;
308 struct rds_page_frag *frag;
309 struct iovec *iov = first_iov;
310 unsigned long to_copy;
311 unsigned long frag_off = 0;
312 unsigned long iov_off = 0;
313 int copied = 0;
314 int ret;
315 u32 len;
316
317 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
318 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
319 len = be32_to_cpu(inc->i_hdr.h_len);
320
321 while (copied < size && copied < len) {
322 if (frag_off == RDS_FRAG_SIZE) {
323 frag = list_entry(frag->f_item.next,
324 struct rds_page_frag, f_item);
325 frag_off = 0;
326 }
327 while (iov_off == iov->iov_len) {
328 iov_off = 0;
329 iov++;
330 }
331
332 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
333 to_copy = min_t(size_t, to_copy, size - copied);
334 to_copy = min_t(unsigned long, to_copy, len - copied);
335
336 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
337 "[%p, %lu] + %lu\n",
338 to_copy, iov->iov_base, iov->iov_len, iov_off,
339 frag->f_page, frag->f_offset, frag_off);
340
341 /* XXX needs + offset for multiple recvs per page */
342 ret = rds_page_copy_to_user(frag->f_page,
343 frag->f_offset + frag_off,
344 iov->iov_base + iov_off,
345 to_copy);
346 if (ret) {
347 copied = ret;
348 break;
349 }
350
351 iov_off += to_copy;
352 frag_off += to_copy;
353 copied += to_copy;
354 }
355
356 return copied;
357}
358
359/* ic starts out kzalloc()ed */
360void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
361{
362 struct ib_send_wr *wr = &ic->i_ack_wr;
363 struct ib_sge *sge = &ic->i_ack_sge;
364
365 sge->addr = ic->i_ack_dma;
366 sge->length = sizeof(struct rds_header);
367 sge->lkey = ic->i_mr->lkey;
368
369 wr->sg_list = sge;
370 wr->num_sge = 1;
371 wr->opcode = IB_WR_SEND;
372 wr->wr_id = RDS_IB_ACK_WR_ID;
373 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
374}
375
376/*
377 * You'd think that with reliable IB connections you wouldn't need to ack
378 * messages that have been received. The problem is that IB hardware generates
379 * an ack message before it has DMAed the message into memory. This creates a
380 * potential message loss if the HCA is disabled for any reason between when it
381 * sends the ack and before the message is DMAed and processed. This is only a
382 * potential issue if another HCA is available for fail-over.
383 *
384 * When the remote host receives our ack they'll free the sent message from
385 * their send queue. To decrease the latency of this we always send an ack
386 * immediately after we've received messages.
387 *
388 * For simplicity, we only have one ack in flight at a time. This puts
389 * pressure on senders to have deep enough send queues to absorb the latency of
390 * a single ack frame being in flight. This might not be good enough.
391 *
392 * This is implemented by having a long-lived send_wr and sge which point to a
393 * statically allocated ack frame. This ack wr does not fall under the ring
394 * accounting that the tx and rx wrs do. The QP attribute specifically makes
395 * room for it beyond the ring size. Send completion notices its special
396 * wr_id and avoids working with the ring in that case.
397 */
398static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
399 int ack_required)
400{
401 rds_ib_set_64bit(&ic->i_ack_next, seq);
402 if (ack_required) {
403 smp_mb__before_clear_bit();
404 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
405 }
406}
407
408static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
409{
410 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
411 smp_mb__after_clear_bit();
412
413 return ic->i_ack_next;
414}
415
416static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
417{
418 struct rds_header *hdr = ic->i_ack;
419 struct ib_send_wr *failed_wr;
420 u64 seq;
421 int ret;
422
423 seq = rds_ib_get_ack(ic);
424
425 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
426 rds_message_populate_header(hdr, 0, 0, 0);
427 hdr->h_ack = cpu_to_be64(seq);
428 hdr->h_credit = adv_credits;
429 rds_message_make_checksum(hdr);
430 ic->i_ack_queued = jiffies;
431
432 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
433 if (unlikely(ret)) {
434 /* Failed to send. Release the WR, and
435 * force another ACK.
436 */
437 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
438 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439
440 rds_ib_stats_inc(s_ib_ack_send_failure);
441 /* Need to finesse this later. */
442 BUG();
443 } else
444 rds_ib_stats_inc(s_ib_ack_sent);
445}
446
447/*
448 * There are 3 ways of getting acknowledgements to the peer:
449 * 1. We call rds_ib_attempt_ack from the recv completion handler
450 * to send an ACK-only frame.
451 * However, there can be only one such frame in the send queue
452 * at any time, so we may have to postpone it.
453 * 2. When another (data) packet is transmitted while there's
454 * an ACK in the queue, we piggyback the ACK sequence number
455 * on the data packet.
456 * 3. If the ACK WR is done sending, we get called from the
457 * send queue completion handler, and check whether there's
458 * another ACK pending (postponed because the WR was on the
459 * queue). If so, we transmit it.
460 *
461 * We maintain 2 variables:
462 * - i_ack_flags, which keeps track of whether the ACK WR
463 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
464 * - i_ack_next, which is the last sequence number we received
465 *
466 * Potentially, send queue and receive queue handlers can run concurrently.
467 *
468 * Reconnecting complicates this picture just slightly. When we
469 * reconnect, we may be seeing duplicate packets. The peer
470 * is retransmitting them, because it hasn't seen an ACK for
471 * them. It is important that we ACK these.
472 *
473 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
474 * this flag set *MUST* be acknowledged immediately.
475 */
476
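/*
 * A sketch of the common sequence in terms of the two flag bits used below
 * (illustrative only):
 *
 *	recv completion:   rds_ib_set_ack() records i_ack_next and sets
 *	                   IB_ACK_REQUESTED
 *	rds_ib_attempt_ack(): REQUESTED set, IN_FLIGHT clear -> grab one send
 *	                   credit, clear REQUESTED and post the ACK WR
 *	                   (IN_FLIGHT is now set)
 *	send completion:   wr_id == RDS_IB_ACK_WR_ID -> clear IN_FLIGHT and
 *	                   call rds_ib_attempt_ack() again in case another
 *	                   ACK was requested while the WR was in flight
 *
 * If a data packet goes out in the meantime, rds_ib_piggyb_ack() clears
 * IB_ACK_REQUESTED instead and the standalone ACK frame is not needed.
 */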
477/*
478 * When we get here, we're called from the recv queue handler.
479 * Check whether we ought to transmit an ACK.
480 */
481void rds_ib_attempt_ack(struct rds_ib_connection *ic)
482{
483 unsigned int adv_credits;
484
485 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
486 return;
487
488 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
489 rds_ib_stats_inc(s_ib_ack_send_delayed);
490 return;
491 }
492
493 /* Can we get a send credit? */
494 if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
495 rds_ib_stats_inc(s_ib_tx_throttle);
496 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
497 return;
498 }
499
500 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
501 rds_ib_send_ack(ic, adv_credits);
502}
503
504/*
505 * We get here from the send completion handler, when the
506 * adapter tells us the ACK frame was sent.
507 */
508void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
509{
510 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
511 rds_ib_attempt_ack(ic);
512}
513
514/*
515 * This is called by the regular xmit code when it wants to piggyback
516 * an ACK on an outgoing frame.
517 */
518u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
519{
520 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
521 rds_ib_stats_inc(s_ib_ack_send_piggybacked);
522 return rds_ib_get_ack(ic);
523}
524
525/*
526 * It's kind of lame that we're copying from the posted receive pages into
527 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
528 * them. But receiving new congestion bitmaps should be a *rare* event, so
529 * hopefully we won't need to invest that complexity in making it more
530 * efficient. By copying we can share a simpler core with TCP which has to
531 * copy.
532 */
533static void rds_ib_cong_recv(struct rds_connection *conn,
534 struct rds_ib_incoming *ibinc)
535{
536 struct rds_cong_map *map;
537 unsigned int map_off;
538 unsigned int map_page;
539 struct rds_page_frag *frag;
540 unsigned long frag_off;
541 unsigned long to_copy;
542 unsigned long copied;
543 uint64_t uncongested = 0;
544 void *addr;
545
546 /* catch completely corrupt packets */
547 if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
548 return;
549
550 map = conn->c_fcong;
551 map_page = 0;
552 map_off = 0;
553
554 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
555 frag_off = 0;
556
557 copied = 0;
558
559 while (copied < RDS_CONG_MAP_BYTES) {
560 uint64_t *src, *dst;
561 unsigned int k;
562
563 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
564 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
565
566 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
567
568 src = addr + frag_off;
569 dst = (void *)map->m_page_addrs[map_page] + map_off;
570 for (k = 0; k < to_copy; k += 8) {
571 /* Record ports that became uncongested, ie
572 * bits that changed from 0 to 1. */
573 uncongested |= ~(*src) & *dst;
574 *dst++ = *src++;
575 }
576 kunmap_atomic(addr, KM_SOFTIRQ0);
577
578 copied += to_copy;
579
580 map_off += to_copy;
581 if (map_off == PAGE_SIZE) {
582 map_off = 0;
583 map_page++;
584 }
585
586 frag_off += to_copy;
587 if (frag_off == RDS_FRAG_SIZE) {
588 frag = list_entry(frag->f_item.next,
589 struct rds_page_frag, f_item);
590 frag_off = 0;
591 }
592 }
593
594 /* the congestion map is in little endian order */
595 uncongested = le64_to_cpu(uncongested);
596
597 rds_cong_map_updated(map, uncongested);
598}
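/*
 * The bit trick in the copy loop above, worked on a single word with
 * illustrative values: with the stored map word *dst == 0b1100 and the
 * incoming word *src == 0b0100,
 *
 *	~(*src) & *dst == 0b1011 & 0b1100 == 0b1000
 *
 * i.e. exactly the ports whose congestion bit was set in the old map and is
 * clear in the new one. Those bits accumulate in 'uncongested' and are
 * passed to rds_cong_map_updated() above.
 */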
599
600/*
601 * Rings are posted with all the allocations they'll need to queue the
602 * incoming message to the receiving socket so this can't fail.
603 * All fragments start with a header, so we can make sure we're not receiving
604 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
605 */
606struct rds_ib_ack_state {
607 u64 ack_next;
608 u64 ack_recv;
609 unsigned int ack_required:1;
610 unsigned int ack_next_valid:1;
611 unsigned int ack_recv_valid:1;
612};
613
614static void rds_ib_process_recv(struct rds_connection *conn,
615 struct rds_ib_recv_work *recv, u32 byte_len,
616 struct rds_ib_ack_state *state)
617{
618 struct rds_ib_connection *ic = conn->c_transport_data;
619 struct rds_ib_incoming *ibinc = ic->i_ibinc;
620 struct rds_header *ihdr, *hdr;
621
622 /* XXX shut down the connection if port 0,0 are seen? */
623
624 rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
625 byte_len);
626
627 if (byte_len < sizeof(struct rds_header)) {
628 rds_ib_conn_error(conn, "incoming message "
629 "from %pI4 didn't inclue a "
630 "header, disconnecting and "
631 "reconnecting\n",
632 &conn->c_faddr);
633 return;
634 }
635 byte_len -= sizeof(struct rds_header);
636
637 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
638
639 /* Validate the checksum. */
640 if (!rds_message_verify_checksum(ihdr)) {
641 rds_ib_conn_error(conn, "incoming message "
642 "from %pI4 has corrupted header - "
643 "forcing a reconnect\n",
644 &conn->c_faddr);
645 rds_stats_inc(s_recv_drop_bad_checksum);
646 return;
647 }
648
649 /* Process the ACK sequence which comes with every packet */
650 state->ack_recv = be64_to_cpu(ihdr->h_ack);
651 state->ack_recv_valid = 1;
652
653 /* Process the credits update if there was one */
654 if (ihdr->h_credit)
655 rds_ib_send_add_credits(conn, ihdr->h_credit);
656
657 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
658 /* This is an ACK-only packet. The reason it gets
659 * special treatment here is that historically, ACKs
660 * were rather special beasts.
661 */
662 rds_ib_stats_inc(s_ib_ack_received);
663
664 /*
665 * Usually the frags make their way on to incs and are then freed as
666 * the inc is freed. We don't go that route, so we have to drop the
667 * page ref ourselves. We can't just leave the page on the recv
668 * because that confuses the dma mapping of pages and each recv's use
669 * of a partial page. We can leave the frag, though, it will be
670 * reused.
671 *
672 * FIXME: Fold this into the code path below.
673 */
674 rds_ib_frag_drop_page(recv->r_frag);
675 return;
676 }
677
678 /*
679 * If we don't already have an inc on the connection then this
680 * fragment has a header and starts a message; copy its header
681 * into the inc and save the inc so we can hang upcoming fragments
682 * off its list.
683 */
684 if (ibinc == NULL) {
685 ibinc = recv->r_ibinc;
686 recv->r_ibinc = NULL;
687 ic->i_ibinc = ibinc;
688
689 hdr = &ibinc->ii_inc.i_hdr;
690 memcpy(hdr, ihdr, sizeof(*hdr));
691 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
692
693 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
694 ic->i_recv_data_rem, hdr->h_flags);
695 } else {
696 hdr = &ibinc->ii_inc.i_hdr;
697 /* We can't just use memcmp here; fragments of a
698 * single message may carry different ACKs */
699 if (hdr->h_sequence != ihdr->h_sequence
700 || hdr->h_len != ihdr->h_len
701 || hdr->h_sport != ihdr->h_sport
702 || hdr->h_dport != ihdr->h_dport) {
703 rds_ib_conn_error(conn,
704 "fragment header mismatch; forcing reconnect\n");
705 return;
706 }
707 }
708
709 list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
710 recv->r_frag = NULL;
711
712 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
713 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
714 else {
715 ic->i_recv_data_rem = 0;
716 ic->i_ibinc = NULL;
717
718 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
719 rds_ib_cong_recv(conn, ibinc);
720 else {
721 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
722 &ibinc->ii_inc, GFP_ATOMIC,
723 KM_SOFTIRQ0);
724 state->ack_next = be64_to_cpu(hdr->h_sequence);
725 state->ack_next_valid = 1;
726 }
727
728 /* Evaluate the ACK_REQUIRED flag *after* we received
729 * the complete frame, and after bumping the next_rx
730 * sequence. */
731 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
732 rds_stats_inc(s_recv_ack_required);
733 state->ack_required = 1;
734 }
735
736 rds_inc_put(&ibinc->ii_inc);
737 }
738}
739
740/*
741 * Plucking the oldest entry from the ring can be done concurrently with
742 * the thread refilling the ring. Each ring operation is protected by
743 * spinlocks and the transient state of refilling doesn't change the
744 * recording of which entry is oldest.
745 *
746 * This relies on IB only calling one cq comp_handler for each cq so that
747 * there will only be one caller of rds_recv_incoming() per RDS connection.
748 */
749void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
750{
751 struct rds_connection *conn = context;
752 struct rds_ib_connection *ic = conn->c_transport_data;
753 struct ib_wc wc;
754 struct rds_ib_ack_state state = { 0, };
755 struct rds_ib_recv_work *recv;
756
757 rdsdebug("conn %p cq %p\n", conn, cq);
758
759 rds_ib_stats_inc(s_ib_rx_cq_call);
760
761 ib_req_notify_cq(cq, IB_CQ_SOLICITED);
762
763 while (ib_poll_cq(cq, 1, &wc) > 0) {
764 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
765 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
766 be32_to_cpu(wc.ex.imm_data));
767 rds_ib_stats_inc(s_ib_rx_cq_event);
768
769 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
770
771 rds_ib_recv_unmap_page(ic, recv);
772
773 /*
774 * Also process recvs in connecting state because it is possible
775 * to get a recv completion _before_ the rdmacm ESTABLISHED
776 * event is processed.
777 */
778 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
779 /* We expect errors as the qp is drained during shutdown */
780 if (wc.status == IB_WC_SUCCESS) {
781 rds_ib_process_recv(conn, recv, wc.byte_len, &state);
782 } else {
783 rds_ib_conn_error(conn, "recv completion on "
784 "%pI4 had status %u, disconnecting and "
785 "reconnecting\n", &conn->c_faddr,
786 wc.status);
787 }
788 }
789
790 rds_ib_ring_free(&ic->i_recv_ring, 1);
791 }
792
793 if (state.ack_next_valid)
794 rds_ib_set_ack(ic, state.ack_next, state.ack_required);
795 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
796 rds_send_drop_acked(conn, state.ack_recv, NULL);
797 ic->i_ack_recv = state.ack_recv;
798 }
799 if (rds_conn_up(conn))
800 rds_ib_attempt_ack(ic);
801
802 /* If we ever end up with a really empty receive ring, we're
803 * in deep trouble, as the sender will definitely see RNR
804 * timeouts. */
805 if (rds_ib_ring_empty(&ic->i_recv_ring))
806 rds_ib_stats_inc(s_ib_rx_ring_empty);
807
808 /*
809 * If the ring is running low, then schedule the thread to refill.
810 */
811 if (rds_ib_ring_low(&ic->i_recv_ring))
812 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
813}
814
815int rds_ib_recv(struct rds_connection *conn)
816{
817 struct rds_ib_connection *ic = conn->c_transport_data;
818 int ret = 0;
819
820 rdsdebug("conn %p\n", conn);
821
822 /*
823 * If we get a temporary posting failure in this context then
824 * we're really low and we want the caller to back off for a bit.
825 */
826 mutex_lock(&ic->i_recv_mutex);
827 if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
828 ret = -ENOMEM;
829 else
830 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
831 mutex_unlock(&ic->i_recv_mutex);
832
833 if (rds_conn_up(conn))
834 rds_ib_attempt_ack(ic);
835
836 return ret;
837}
838
839int __init rds_ib_recv_init(void)
840{
841 struct sysinfo si;
842 int ret = -ENOMEM;
843
844 /* Default to roughly a third of all available RAM for recv memory */
845 si_meminfo(&si);
846 rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
847
848 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
849 sizeof(struct rds_ib_incoming),
850 0, 0, NULL);
851 if (rds_ib_incoming_slab == NULL)
852 goto out;
853
854 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
855 sizeof(struct rds_page_frag),
856 0, 0, NULL);
857 if (rds_ib_frag_slab == NULL)
858 kmem_cache_destroy(rds_ib_incoming_slab);
859 else
860 ret = 0;
861out:
862 return ret;
863}
864
865void rds_ib_recv_exit(void)
866{
867 kmem_cache_destroy(rds_ib_incoming_slab);
868 kmem_cache_destroy(rds_ib_frag_slab);
869}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 000000000000..99a6ccae964c
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "ib.h"
37
38/*
39 * Locking for IB rings.
40 * We assume that allocation is always protected by a mutex
41 * in the caller (this is a valid assumption for the current
42 * implementation).
43 *
44 * Freeing always happens in an interrupt, and hence only
45 * races with allocations, but not with other free()s.
46 *
47 * The interaction between allocation and freeing is that
48 * the alloc code has to determine the number of free entries.
49 * To this end, we maintain two counters; an allocation counter
50 * and a free counter. Both are allowed to run freely, and wrap
51 * around.
52 * The number of used entries is always (alloc_ctr - free_ctr) % NR.
53 *
54 * The current implementation makes free_ctr atomic. When the
55 * caller finds an allocation fails, it should set an "alloc fail"
56 * bit and retry the allocation. The "alloc fail" bit essentially tells
57 * the CQ completion handlers to wake it up after freeing some
58 * more entries.
59 */
60
61/*
62 * This only happens on shutdown.
63 */
64DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
65
66void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
67{
68 memset(ring, 0, sizeof(*ring));
69 ring->w_nr = nr;
70 rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
71}
72
73static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
74{
75 u32 diff;
76
77 /* This assumes that atomic_t has at least as many bits as u32 */
78 diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
79 BUG_ON(diff > ring->w_nr);
80
81 return diff;
82}
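/*
 * Worked example of the free-running counters (numbers are illustrative):
 * with w_nr == 256, w_alloc_ctr == 0x00000010 and w_free_ctr == 0xfffffff8
 * (the alloc counter has already wrapped around zero, the free counter has
 * not), the u32 subtraction above yields
 *
 *	0x00000010 - 0xfffffff8 == 0x18 == 24
 *
 * i.e. 24 entries in use, which is the right answer despite the wrap.
 */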
83
84void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
85{
86 /* We only ever get called from the connection setup code,
87 * prior to creating the QP. */
88 BUG_ON(__rds_ib_ring_used(ring));
89 ring->w_nr = nr;
90}
91
92static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
93{
94 return __rds_ib_ring_used(ring) == 0;
95}
96
97u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
98{
99 u32 ret = 0, avail;
100
101 avail = ring->w_nr - __rds_ib_ring_used(ring);
102
103 rdsdebug("ring %p val %u next %u free %u\n", ring, val,
104 ring->w_alloc_ptr, avail);
105
106 if (val && avail) {
107 ret = min(val, avail);
108 *pos = ring->w_alloc_ptr;
109
110 ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
111 ring->w_alloc_ctr += ret;
112 }
113
114 return ret;
115}
116
117void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
118{
119 ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
120 atomic_add(val, &ring->w_free_ctr);
121
122 if (__rds_ib_ring_empty(ring) &&
123 waitqueue_active(&rds_ib_ring_empty_wait))
124 wake_up(&rds_ib_ring_empty_wait);
125}
126
127void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
128{
129 ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
130 ring->w_alloc_ctr -= val;
131}
132
133int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
134{
135 return __rds_ib_ring_empty(ring);
136}
137
138int rds_ib_ring_low(struct rds_ib_work_ring *ring)
139{
140 return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
141}
142
143/*
144 * returns the oldest alloced ring entry. This will be the next one
145 * freed. This can't be called if there are none allocated.
146 */
147u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
148{
149 return ring->w_free_ptr;
150}
151
152/*
153 * returns the number of completed work requests.
154 */
155
156u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
157{
158 u32 ret;
159
160 if (oldest <= (unsigned long long)wr_id)
161 ret = (unsigned long long)wr_id - oldest + 1;
162 else
163 ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
164
165 rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
166 wr_id, oldest);
167 return ret;
168}
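/*
 * Worked example for the wrap case above (numbers are illustrative): with
 * w_nr == 256, oldest == 250 and a completion for wr_id == 3,
 *
 *	completed = 256 - 250 + 3 + 1 == 10
 *
 * i.e. this single completion retires ring entries 250..255 and 0..3.
 */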
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 000000000000..cb6c52cb1c4c
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,874 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/device.h>
36#include <linux/dmapool.h>
37
38#include "rds.h"
39#include "rdma.h"
40#include "ib.h"
41
42static void rds_ib_send_rdma_complete(struct rds_message *rm,
43 int wc_status)
44{
45 int notify_status;
46
47 switch (wc_status) {
48 case IB_WC_WR_FLUSH_ERR:
49 return;
50
51 case IB_WC_SUCCESS:
52 notify_status = RDS_RDMA_SUCCESS;
53 break;
54
55 case IB_WC_REM_ACCESS_ERR:
56 notify_status = RDS_RDMA_REMOTE_ERROR;
57 break;
58
59 default:
60 notify_status = RDS_RDMA_OTHER_ERROR;
61 break;
62 }
63 rds_rdma_send_complete(rm, notify_status);
64}
65
66static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
67 struct rds_rdma_op *op)
68{
69 if (op->r_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0;
74 }
75}
76
77static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
78 struct rds_ib_send_work *send,
79 int wc_status)
80{
81 struct rds_message *rm = send->s_rm;
82
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84
85 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents,
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we received the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_ib_send_rdma_complete(rm, wc_status);
113
114 if (rm->m_rdma_op->r_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
116 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
118 }
119
120 /* If anyone waited for this message to get flushed out, wake
121 * them up now */
122 rds_message_unmapped(rm);
123
124 rds_message_put(rm);
125 send->s_rm = NULL;
126}
127
128void rds_ib_send_init_ring(struct rds_ib_connection *ic)
129{
130 struct rds_ib_send_work *send;
131 u32 i;
132
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge;
135
136 send->s_rm = NULL;
137 send->s_op = NULL;
138
139 send->s_wr.wr_id = i;
140 send->s_wr.sg_list = send->s_sge;
141 send->s_wr.num_sge = 1;
142 send->s_wr.opcode = IB_WR_SEND;
143 send->s_wr.send_flags = 0;
144 send->s_wr.ex.imm_data = 0;
145
146 sge = rds_ib_data_sge(ic, send->s_sge);
147 sge->lkey = ic->i_mr->lkey;
148
149 sge = rds_ib_header_sge(ic, send->s_sge);
150 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
151 sge->length = sizeof(struct rds_header);
152 sge->lkey = ic->i_mr->lkey;
153 }
154}
155
156void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
157{
158 struct rds_ib_send_work *send;
159 u32 i;
160
161 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
162 if (send->s_wr.opcode == 0xdead)
163 continue;
164 if (send->s_rm)
165 rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
166 if (send->s_op)
167 rds_ib_send_unmap_rdma(ic, send->s_op);
168 }
169}
170
171/*
172 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
173 * operations performed in the send path. As the sender allocs and potentially
174 * unallocs the next free entry in the ring, it doesn't alter which entry is
175 * next to be freed, which is all this code is concerned with.
176 */
177void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
178{
179 struct rds_connection *conn = context;
180 struct rds_ib_connection *ic = conn->c_transport_data;
181 struct ib_wc wc;
182 struct rds_ib_send_work *send;
183 u32 completed;
184 u32 oldest;
185 u32 i = 0;
186 int ret;
187
188 rdsdebug("cq %p conn %p\n", cq, conn);
189 rds_ib_stats_inc(s_ib_tx_cq_call);
190 ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
191 if (ret)
192 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
193
194 while (ib_poll_cq(cq, 1, &wc) > 0) {
195 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
196 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
197 be32_to_cpu(wc.ex.imm_data));
198 rds_ib_stats_inc(s_ib_tx_cq_event);
199
200 if (wc.wr_id == RDS_IB_ACK_WR_ID) {
201 if (ic->i_ack_queued + HZ/2 < jiffies)
202 rds_ib_stats_inc(s_ib_tx_stalled);
203 rds_ib_ack_send_complete(ic);
204 continue;
205 }
206
207 oldest = rds_ib_ring_oldest(&ic->i_send_ring);
208
209 completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
210
211 for (i = 0; i < completed; i++) {
212 send = &ic->i_sends[oldest];
213
214 /* In the error case, wc.opcode sometimes contains garbage */
215 switch (send->s_wr.opcode) {
216 case IB_WR_SEND:
217 if (send->s_rm)
218 rds_ib_send_unmap_rm(ic, send, wc.status);
219 break;
220 case IB_WR_RDMA_WRITE:
221 case IB_WR_RDMA_READ:
222 /* Nothing to be done - the SG list will be unmapped
223 * when the SEND completes. */
224 break;
225 default:
226 if (printk_ratelimit())
227 printk(KERN_NOTICE
228 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
229 __func__, send->s_wr.opcode);
230 break;
231 }
232
233 send->s_wr.opcode = 0xdead;
234 send->s_wr.num_sge = 1;
235 if (send->s_queued + HZ/2 < jiffies)
236 rds_ib_stats_inc(s_ib_tx_stalled);
237
238 /* If an RDMA operation produced an error, signal this right
239 * away. If we don't, the subsequent SEND that goes with this
240 * RDMA will be canceled with ERR_WFLUSH, and the application
241 * will never learn that the RDMA failed. */
242 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
243 struct rds_message *rm;
244
245 rm = rds_send_get_message(conn, send->s_op);
246 if (rm)
247 rds_ib_send_rdma_complete(rm, wc.status);
248 }
249
250 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
251 }
252
253 rds_ib_ring_free(&ic->i_send_ring, completed);
254
255 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
256 || test_bit(0, &conn->c_map_queued))
257 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
258
259 /* We expect errors as the qp is drained during shutdown */
260 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
261 rds_ib_conn_error(conn,
262 "send completion on %pI4 "
263 "had status %u, disconnecting and reconnecting\n",
264 &conn->c_faddr, wc.status);
265 }
266 }
267}
268
269/*
270 * This is the main function for allocating credits when sending
271 * messages.
272 *
273 * Conceptually, we have two counters:
274 * - send credits: this tells us how many WRs we're allowed
275 * to submit without overrunning the receiver's queue. For
276 * each SEND WR we post, we decrement this by one.
277 *
278 * - posted credits: this tells us how many WRs we recently
279 * posted to the receive queue. This value is transferred
280 * to the peer as a "credit update" in a RDS header field.
281 * Every time we transmit credits to the peer, we subtract
282 * the amount of transferred credits from this counter.
283 *
284 * It is essential that we avoid situations where both sides have
285 * exhausted their send credits, and are unable to send new credits
286 * to the peer. We achieve this by requiring that we send at least
287 * one credit update to the peer before exhausting our credits.
288 * When new credits arrive, we subtract one credit that is withheld
289 * until we've posted new buffers and are ready to transmit these
290 * credits (see rds_ib_send_add_credits below).
291 *
292 * The RDS send code is essentially single-threaded; rds_send_xmit
293 * grabs c_send_lock to ensure exclusive access to the send ring.
294 * However, the ACK sending code is independent and can race with
295 * message SENDs.
296 *
297 * In the send path, we need to update the counters for send credits
298 * and the counter of posted buffers atomically - when we use the
299 * last available credit, we cannot allow another thread to race us
300 * and grab the posted credits counter. Hence, we have to use a
301 * spinlock to protect the credit counter, or use atomics.
302 *
303 * Spinlocks shared between the send and the receive path are bad,
304 * because they create unnecessary delays. An early implementation
305 * using a spinlock showed a 5% degradation in throughput at some
306 * loads.
307 *
308 * This implementation avoids spinlocks completely, putting both
309 * counters into a single atomic, and updating that atomic using
310 * atomic_add (in the receive path, when receiving fresh credits),
311 * and using atomic_cmpxchg when updating the two counters.
312 */
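/*
 * A minimal sketch of the packing this relies on. The IB_GET/IB_SET macros
 * themselves live in ib.h; the 16/16 bit split shown here is an assumption
 * made only for illustration:
 *
 *	val  = atomic_read(&ic->i_credits);
 *	send = IB_GET_SEND_CREDITS(val);	(assumed low 16 bits)
 *	post = IB_GET_POST_CREDITS(val);	(assumed high 16 bits)
 *
 *	newval = oldval - IB_SET_SEND_CREDITS(got)
 *			- IB_SET_POST_CREDITS(advertise);
 *	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
 *		goto try_again;
 *
 * Billing both counters in a single cmpxchg is what lets the send path take
 * the last send credit and the pending posted credits atomically, without a
 * spinlock shared with the receive path.
 */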
313int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
314 u32 wanted, u32 *adv_credits, int need_posted)
315{
316 unsigned int avail, posted, got = 0, advertise;
317 long oldval, newval;
318
319 *adv_credits = 0;
320 if (!ic->i_flowctl)
321 return wanted;
322
323try_again:
324 advertise = 0;
325 oldval = newval = atomic_read(&ic->i_credits);
326 posted = IB_GET_POST_CREDITS(oldval);
327 avail = IB_GET_SEND_CREDITS(oldval);
328
329 rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
330 wanted, avail, posted);
331
332 /* The last credit must be used to send a credit update. */
333 if (avail && !posted)
334 avail--;
335
336 if (avail < wanted) {
337 struct rds_connection *conn = ic->i_cm_id->context;
338
339 /* Oops, there aren't that many credits left! */
340 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
341 got = avail;
342 } else {
343 /* Sometimes you get what you want, lalala. */
344 got = wanted;
345 }
346 newval -= IB_SET_SEND_CREDITS(got);
347
348 /*
349 * If need_posted is non-zero, then the caller wants the posted
350 * credits advertised regardless of whether any send credits are
351 * available.
352 */
353 if (posted && (got || need_posted)) {
354 advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
355 newval -= IB_SET_POST_CREDITS(advertise);
356 }
357
358 /* Finally bill everything */
359 if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
360 goto try_again;
361
362 *adv_credits = advertise;
363 return got;
364}
365
366void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
367{
368 struct rds_ib_connection *ic = conn->c_transport_data;
369
370 if (credits == 0)
371 return;
372
373 rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
374 credits,
375 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
376 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
377
378 atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
379 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
380 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
381
382 WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
383
384 rds_ib_stats_inc(s_ib_rx_credit_updates);
385}
386
387void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
388{
389 struct rds_ib_connection *ic = conn->c_transport_data;
390
391 if (posted == 0)
392 return;
393
394 atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
395
396 /* Decide whether to send an update to the peer now.
397 * If we would send a credit update for every single buffer we
398 * post, we would end up with an ACK storm (ACK arrives,
399 * consumes buffer, we refill the ring, send ACK to remote
400 * advertising the newly posted buffer... ad inf)
401 *
402 * Performance pretty much depends on how often we send
403 * credit updates - too frequent updates mean lots of ACKs.
404 * Too infrequent updates, and the peer will run out of
405 * credits and have to throttle.
406 * For the time being, 16 seems to be a good compromise.
407 */
408 if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
409 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
410}
411
412static inline void
413rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
414 struct rds_ib_send_work *send, unsigned int pos,
415 unsigned long buffer, unsigned int length,
416 int send_flags)
417{
418 struct ib_sge *sge;
419
420 WARN_ON(pos != send - ic->i_sends);
421
422 send->s_wr.send_flags = send_flags;
423 send->s_wr.opcode = IB_WR_SEND;
424 send->s_wr.num_sge = 2;
425 send->s_wr.next = NULL;
426 send->s_queued = jiffies;
427 send->s_op = NULL;
428
429 if (length != 0) {
430 sge = rds_ib_data_sge(ic, send->s_sge);
431 sge->addr = buffer;
432 sge->length = length;
433 sge->lkey = ic->i_mr->lkey;
434
435 sge = rds_ib_header_sge(ic, send->s_sge);
436 } else {
437 /* We're sending a packet with no payload. There is only
438 * one SGE */
439 send->s_wr.num_sge = 1;
440 sge = &send->s_sge[0];
441 }
442
443 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
444 sge->length = sizeof(struct rds_header);
445 sge->lkey = ic->i_mr->lkey;
446}
447
448/*
449 * This can be called multiple times for a given message. The first time
450 * we see a message we map its scatterlist into the IB device so that
451 * we can provide that mapped address to the IB scatter gather entries
452 * in the IB work requests. We translate the scatterlist into a series
453 * of work requests that fragment the message. These work requests complete
454 * in order so we pass ownership of the message to the completion handler
455 * once we send the final fragment.
456 *
457 * The RDS core uses the c_send_lock to only enter this function once
458 * per connection. This makes sure that the tx ring alloc/unalloc pairs
459 * don't get out of sync and confuse the ring.
460 */
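/*
 * Sizing example for the work request allocation below (RDS_FRAG_SIZE is
 * assumed to be 4096 here purely for illustration): a message with
 * h_len == 10000 needs
 *
 *	ceil(10000, 4096) == 3
 *
 * send WRs, carrying 4096 + 4096 + 1808 bytes of payload respectively; each
 * WR also gets a second SGE pointing at the long-lived ring of mapped
 * headers, so the header travels after the data instead of being copied
 * into the data pages.
 */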
461int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
462 unsigned int hdr_off, unsigned int sg, unsigned int off)
463{
464 struct rds_ib_connection *ic = conn->c_transport_data;
465 struct ib_device *dev = ic->i_cm_id->device;
466 struct rds_ib_send_work *send = NULL;
467 struct rds_ib_send_work *first;
468 struct rds_ib_send_work *prev;
469 struct ib_send_wr *failed_wr;
470 struct scatterlist *scat;
471 u32 pos;
472 u32 i;
473 u32 work_alloc;
474 u32 credit_alloc;
475 u32 posted;
476 u32 adv_credits = 0;
477 int send_flags = 0;
478 int sent;
479 int ret;
480 int flow_controlled = 0;
481
482 BUG_ON(off % RDS_FRAG_SIZE);
483 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
484
485 /* FIXME we may overallocate here */
486 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
487 i = 1;
488 else
489 i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
490
491 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
492 if (work_alloc == 0) {
493 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
494 rds_ib_stats_inc(s_ib_tx_ring_full);
495 ret = -ENOMEM;
496 goto out;
497 }
498
499 credit_alloc = work_alloc;
500 if (ic->i_flowctl) {
501 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
502 adv_credits += posted;
503 if (credit_alloc < work_alloc) {
504 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
505 work_alloc = credit_alloc;
506 flow_controlled++;
507 }
508 if (work_alloc == 0) {
509 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
510 rds_ib_stats_inc(s_ib_tx_throttle);
511 ret = -ENOMEM;
512 goto out;
513 }
514 }
515
516 /* map the message the first time we see it */
517 if (ic->i_rm == NULL) {
518 /*
519 printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
520 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
521 rm->m_inc.i_hdr.h_flags,
522 be32_to_cpu(rm->m_inc.i_hdr.h_len));
523 */
524 if (rm->m_nents) {
525 rm->m_count = ib_dma_map_sg(dev,
526 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
527 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
528 if (rm->m_count == 0) {
529 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
530 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
531 ret = -ENOMEM; /* XXX ? */
532 goto out;
533 }
534 } else {
535 rm->m_count = 0;
536 }
537
538 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
539 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
540 rds_message_addref(rm);
541 ic->i_rm = rm;
542
543 /* Finalize the header */
544 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
545 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
546 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
547 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
548
549 /* If it has a RDMA op, tell the peer we did it. This is
550 * used by the peer to release use-once RDMA MRs. */
551 if (rm->m_rdma_op) {
552 struct rds_ext_header_rdma ext_hdr;
553
554 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
555 rds_message_add_extension(&rm->m_inc.i_hdr,
556 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
557 }
558 if (rm->m_rdma_cookie) {
559 rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
560 rds_rdma_cookie_key(rm->m_rdma_cookie),
561 rds_rdma_cookie_offset(rm->m_rdma_cookie));
562 }
563
564 /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
565 * we should not do this unless we have a chance of at least
566 * sticking the header into the send ring. Which is why we
567 * should call rds_ib_ring_alloc first. */
568 rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
569 rds_message_make_checksum(&rm->m_inc.i_hdr);
570
571 /*
572 * Update adv_credits since we reset the ACK_REQUIRED bit.
573 */
574 rds_ib_send_grab_credits(ic, 0, &posted, 1);
575 adv_credits += posted;
576 BUG_ON(adv_credits > 255);
577 } else if (ic->i_rm != rm)
578 BUG();
579
580 send = &ic->i_sends[pos];
581 first = send;
582 prev = NULL;
583 scat = &rm->m_sg[sg];
584 sent = 0;
585 i = 0;
586
587 /* Sometimes you want to put a fence between an RDMA
588 * READ and the following SEND.
589 * We could either do this all the time
590 * or when requested by the user. Right now, we let
591 * the application choose.
592 */
593 if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
594 send_flags = IB_SEND_FENCE;
595
596 /*
597 * We could be copying the header into the unused tail of the page.
598 * That would need to be changed in the future when those pages might
599 * be mapped userspace pages or page cache pages. So instead we always
600 * use a second sge and our long-lived ring of mapped headers. We send
601 * the header after the data so that the data payload can be aligned on
602 * the receiver.
603 */
604
605 /* handle a 0-len message */
606 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
607 rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
608 goto add_header;
609 }
610
611 /* if there's data reference it with a chain of work reqs */
612 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
613 unsigned int len;
614
615 send = &ic->i_sends[pos];
616
617 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
618 rds_ib_xmit_populate_wr(ic, send, pos,
619 ib_sg_dma_address(dev, scat) + off, len,
620 send_flags);
621
622 /*
623 * We want to delay signaling completions just enough to get
624 * the batching benefits but not so much that we create dead time
625 * on the wire.
626 */
627 if (ic->i_unsignaled_wrs-- == 0) {
628 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
629 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
630 }
631
632 ic->i_unsignaled_bytes -= len;
633 if (ic->i_unsignaled_bytes <= 0) {
634 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
635 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
636 }
637
638 /*
639 * Always signal the last one if we're stopping due to flow control.
640 */
641 if (flow_controlled && i == (work_alloc-1))
642 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
643
644 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
645 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
646
647 sent += len;
648 off += len;
649 if (off == ib_sg_dma_len(dev, scat)) {
650 scat++;
651 off = 0;
652 }
653
654add_header:
655 /* Tack on the header after the data. The header SGE should already
656 * have been set up to point to the right header buffer. */
657 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
658
659 if (0) {
660 struct rds_header *hdr = &ic->i_send_hdrs[pos];
661
662 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
663 be16_to_cpu(hdr->h_dport),
664 hdr->h_flags,
665 be32_to_cpu(hdr->h_len));
666 }
667 if (adv_credits) {
668 struct rds_header *hdr = &ic->i_send_hdrs[pos];
669
670 /* add credit and redo the header checksum */
671 hdr->h_credit = adv_credits;
672 rds_message_make_checksum(hdr);
673 adv_credits = 0;
674 rds_ib_stats_inc(s_ib_tx_credit_updates);
675 }
676
677 if (prev)
678 prev->s_wr.next = &send->s_wr;
679 prev = send;
680
681 pos = (pos + 1) % ic->i_send_ring.w_nr;
682 }
683
684 /* Account the RDS header in the number of bytes we sent, but just once.
685 * The caller has no concept of fragmentation. */
686 if (hdr_off == 0)
687 sent += sizeof(struct rds_header);
688
689 /* if we finished the message then send completion owns it */
690 if (scat == &rm->m_sg[rm->m_count]) {
691 prev->s_rm = ic->i_rm;
692 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
693 ic->i_rm = NULL;
694 }
695
696 if (i < work_alloc) {
697 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
698 work_alloc = i;
699 }
700 if (ic->i_flowctl && i < credit_alloc)
701 rds_ib_send_add_credits(conn, credit_alloc - i);
702
703 /* XXX need to worry about failed_wr and partial sends. */
704 failed_wr = &first->s_wr;
705 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
706 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
707 first, &first->s_wr, ret, failed_wr);
708 BUG_ON(failed_wr != &first->s_wr);
709 if (ret) {
710 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
711 "returned %d\n", &conn->c_faddr, ret);
712 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
713 if (prev->s_rm) {
714 ic->i_rm = prev->s_rm;
715 prev->s_rm = NULL;
716 }
717 /* Finesse this later */
718 BUG();
719 goto out;
720 }
721
722 ret = sent;
723out:
724 BUG_ON(adv_credits);
725 return ret;
726}
727
728int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
729{
730 struct rds_ib_connection *ic = conn->c_transport_data;
731 struct rds_ib_send_work *send = NULL;
732 struct rds_ib_send_work *first;
733 struct rds_ib_send_work *prev;
734 struct ib_send_wr *failed_wr;
735 struct rds_ib_device *rds_ibdev;
736 struct scatterlist *scat;
737 unsigned long len;
738 u64 remote_addr = op->r_remote_addr;
739 u32 pos;
740 u32 work_alloc;
741 u32 i;
742 u32 j;
743 int sent;
744 int ret;
745 int num_sge;
746
747 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
748
749 /* map the message the first time we see it */
750 if (!op->r_mapped) {
751 op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
752 op->r_sg, op->r_nents, (op->r_write) ?
753 DMA_TO_DEVICE : DMA_FROM_DEVICE);
754 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
755 if (op->r_count == 0) {
756 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
757 ret = -ENOMEM; /* XXX ? */
758 goto out;
759 }
760
761 op->r_mapped = 1;
762 }
763
764 /*
765 * Instead of knowing how to return a partial rdma read/write we insist that there
766 * be enough work requests to send the entire message.
767 */
768 i = ceil(op->r_count, rds_ibdev->max_sge);
769
770 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
771 if (work_alloc != i) {
772 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
773 rds_ib_stats_inc(s_ib_tx_ring_full);
774 ret = -ENOMEM;
775 goto out;
776 }
777
778 send = &ic->i_sends[pos];
779 first = send;
780 prev = NULL;
781 scat = &op->r_sg[0];
782 sent = 0;
783 num_sge = op->r_count;
784
785 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
786 send->s_wr.send_flags = 0;
787 send->s_queued = jiffies;
788 /*
789 * We want to delay signaling completions just enough to get
790 * the batching benefits but not so much that we create dead time on the wire.
791 */
792 if (ic->i_unsignaled_wrs-- == 0) {
793 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
794 send->s_wr.send_flags = IB_SEND_SIGNALED;
795 }
796
797 send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
798 send->s_wr.wr.rdma.remote_addr = remote_addr;
799 send->s_wr.wr.rdma.rkey = op->r_key;
800 send->s_op = op;
801
802 if (num_sge > rds_ibdev->max_sge) {
803 send->s_wr.num_sge = rds_ibdev->max_sge;
804 num_sge -= rds_ibdev->max_sge;
805 } else {
806 send->s_wr.num_sge = num_sge;
807 }
808
809 send->s_wr.next = NULL;
810
811 if (prev)
812 prev->s_wr.next = &send->s_wr;
813
814 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
815 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
816 send->s_sge[j].addr =
817 ib_sg_dma_address(ic->i_cm_id->device, scat);
818 send->s_sge[j].length = len;
819 send->s_sge[j].lkey = ic->i_mr->lkey;
820
821 sent += len;
822 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
823
824 remote_addr += len;
825 scat++;
826 }
827
828 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
829 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
830
831 prev = send;
832 if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
833 send = ic->i_sends;
834 }
835
836 /* if we finished the message then send completion owns it */
837 if (scat == &op->r_sg[op->r_count])
838 prev->s_wr.send_flags = IB_SEND_SIGNALED;
839
840 if (i < work_alloc) {
841 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
842 work_alloc = i;
843 }
844
845 failed_wr = &first->s_wr;
846 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
847 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
848 first, &first->s_wr, ret, failed_wr);
849 BUG_ON(failed_wr != &first->s_wr);
850 if (ret) {
851 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
852 "returned %d\n", &conn->c_faddr, ret);
853 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
854 goto out;
855 }
856
857 if (unlikely(failed_wr != &first->s_wr)) {
858 printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
859 BUG_ON(failed_wr != &first->s_wr);
860 }
861
862
863out:
864 return ret;
865}
866
867void rds_ib_xmit_complete(struct rds_connection *conn)
868{
869 struct rds_ib_connection *ic = conn->c_transport_data;
870
871 /* We may have a pending ACK or window update we were unable
872 * to send previously (due to flow control). Try again. */
873 rds_ib_attempt_ack(ic);
874}
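
The send path above only asks for a completion every rds_ib_sysctl_max_unsig_wrs work requests or rds_ib_sysctl_max_unsig_bytes bytes, plus always on the last fragment, so most sends run unsignaled. A minimal stand-alone sketch of that batching counter logic follows; the names, thresholds and printf output are illustrative only, not the kernel code itself.

/* Sketch of the unsignaled-completion batching used in rds_ib_xmit();
 * names and thresholds here are hypothetical. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_UNSIG_WRS   16          /* mirrors rds_ib_sysctl_max_unsig_wrs */
#define MAX_UNSIG_BYTES (16 << 20)  /* mirrors rds_ib_sysctl_max_unsig_bytes */

struct tx_state {
	int unsignaled_wrs;
	long unsignaled_bytes;
};

/* Returns true when this work request should ask for a completion. */
static bool want_signal(struct tx_state *s, unsigned int len, bool last)
{
	bool signal_wr = last;       /* the final fragment is always signaled */

	if (s->unsignaled_wrs-- == 0) {
		s->unsignaled_wrs = MAX_UNSIG_WRS;
		signal_wr = true;
	}
	s->unsignaled_bytes -= len;
	if (s->unsignaled_bytes <= 0) {
		s->unsignaled_bytes = MAX_UNSIG_BYTES;
		signal_wr = true;
	}
	return signal_wr;
}

int main(void)
{
	struct tx_state s = { MAX_UNSIG_WRS, MAX_UNSIG_BYTES };
	for (int i = 0; i < 40; i++)
		if (want_signal(&s, 4096, i == 39))
			printf("WR %d requests a completion\n", i);
	return 0;
}
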
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 000000000000..02e3e3d50d4a
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,95 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38#include "ib.h"
39
40DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
41
42static char *rds_ib_stat_names[] = {
43 "ib_connect_raced",
44 "ib_listen_closed_stale",
45 "ib_tx_cq_call",
46 "ib_tx_cq_event",
47 "ib_tx_ring_full",
48 "ib_tx_throttle",
49 "ib_tx_sg_mapping_failure",
50 "ib_tx_stalled",
51 "ib_tx_credit_updates",
52 "ib_rx_cq_call",
53 "ib_rx_cq_event",
54 "ib_rx_ring_empty",
55 "ib_rx_refill_from_cq",
56 "ib_rx_refill_from_thread",
57 "ib_rx_alloc_limit",
58 "ib_rx_credit_updates",
59 "ib_ack_sent",
60 "ib_ack_send_failure",
61 "ib_ack_send_delayed",
62 "ib_ack_send_piggybacked",
63 "ib_ack_received",
64 "ib_rdma_mr_alloc",
65 "ib_rdma_mr_free",
66 "ib_rdma_mr_used",
67 "ib_rdma_mr_pool_flush",
68 "ib_rdma_mr_pool_wait",
69 "ib_rdma_mr_pool_depleted",
70};
71
72unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
73 unsigned int avail)
74{
75 struct rds_ib_statistics stats = {0, };
76 uint64_t *src;
77 uint64_t *sum;
78 size_t i;
79 int cpu;
80
81 if (avail < ARRAY_SIZE(rds_ib_stat_names))
82 goto out;
83
84 for_each_online_cpu(cpu) {
85 src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
86 sum = (uint64_t *)&stats;
87 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
88 *(sum++) += *(src++);
89 }
90
91 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
92 ARRAY_SIZE(rds_ib_stat_names));
93out:
94 return ARRAY_SIZE(rds_ib_stat_names);
95}
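
rds_ib_stats_info_copy() folds the per-CPU counters by treating each struct rds_ib_statistics as a flat array of 64-bit values and summing element by element. A small stand-alone sketch of the same summation pattern, with made-up counters:

/* Illustrative per-CPU counter folding; the struct and values are invented. */
#include <stdint.h>
#include <stdio.h>

struct stats { uint64_t tx; uint64_t rx; uint64_t acks; };

#define NR_CPUS 4

int main(void)
{
	struct stats percpu[NR_CPUS] = {
		{ 10, 7, 3 }, { 4, 9, 1 }, { 0, 2, 5 }, { 6, 6, 6 },
	};
	struct stats sum = { 0, 0, 0 };

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		uint64_t *src = (uint64_t *)&percpu[cpu];
		uint64_t *dst = (uint64_t *)&sum;
		for (size_t i = 0; i < sizeof(sum) / sizeof(uint64_t); i++)
			dst[i] += src[i];
	}
	printf("tx=%llu rx=%llu acks=%llu\n",
	       (unsigned long long)sum.tx, (unsigned long long)sum.rx,
	       (unsigned long long)sum.acks);
	return 0;
}
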
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 000000000000..d87830db93a0
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "ib.h"
38
39static struct ctl_table_header *rds_ib_sysctl_hdr;
40
41unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
42unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
43unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
44static unsigned long rds_ib_sysctl_max_wr_min = 1;
45/* hardware will fail CQ creation long before this */
46static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
47
48unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
51
52unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
55
56unsigned int rds_ib_sysctl_flow_control = 1;
57
58ctl_table rds_ib_sysctl_table[] = {
59 {
60 .ctl_name = CTL_UNNUMBERED,
61 .procname = "max_send_wr",
62 .data = &rds_ib_sysctl_max_send_wr,
63 .maxlen = sizeof(unsigned long),
64 .mode = 0644,
65 .proc_handler = &proc_doulongvec_minmax,
66 .extra1 = &rds_ib_sysctl_max_wr_min,
67 .extra2 = &rds_ib_sysctl_max_wr_max,
68 },
69 {
70 .ctl_name = CTL_UNNUMBERED,
71 .procname = "max_recv_wr",
72 .data = &rds_ib_sysctl_max_recv_wr,
73 .maxlen = sizeof(unsigned long),
74 .mode = 0644,
75 .proc_handler = &proc_doulongvec_minmax,
76 .extra1 = &rds_ib_sysctl_max_wr_min,
77 .extra2 = &rds_ib_sysctl_max_wr_max,
78 },
79 {
80 .ctl_name = CTL_UNNUMBERED,
81 .procname = "max_unsignaled_wr",
82 .data = &rds_ib_sysctl_max_unsig_wrs,
83 .maxlen = sizeof(unsigned long),
84 .mode = 0644,
85 .proc_handler = &proc_doulongvec_minmax,
86 .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
87 .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "max_unsignaled_bytes",
92 .data = &rds_ib_sysctl_max_unsig_bytes,
93 .maxlen = sizeof(unsigned long),
94 .mode = 0644,
95 .proc_handler = &proc_doulongvec_minmax,
96 .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
97 .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
98 },
99 {
100 .ctl_name = CTL_UNNUMBERED,
101 .procname = "max_recv_allocation",
102 .data = &rds_ib_sysctl_max_recv_allocation,
103 .maxlen = sizeof(unsigned long),
104 .mode = 0644,
105 .proc_handler = &proc_doulongvec_minmax,
106 },
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "flow_control",
110 .data = &rds_ib_sysctl_flow_control,
111 .maxlen = sizeof(rds_ib_sysctl_flow_control),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec,
114 },
115 { .ctl_name = 0}
116};
117
118static struct ctl_path rds_ib_sysctl_path[] = {
119 { .procname = "net", .ctl_name = CTL_NET, },
120 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
121 { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
122 { }
123};
124
125void rds_ib_sysctl_exit(void)
126{
127 if (rds_ib_sysctl_hdr)
128 unregister_sysctl_table(rds_ib_sysctl_hdr);
129}
130
131int __init rds_ib_sysctl_init(void)
132{
133 rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
134 if (rds_ib_sysctl_hdr == NULL)
135 return -ENOMEM;
136 return 0;
137}
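
register_sysctl_paths() above exposes these tunables under /proc/sys/net/rds/ib/. A minimal user-space sketch that reads one of them back; it assumes the RDS module is loaded so the file exists.

/* Read the rds_ib flow_control tunable registered by the table above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/rds/ib/flow_control", "r");
	char buf[32];

	if (!f) {
		perror("flow_control");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("rds_ib flow control: %s", buf);
	fclose(f);
	return 0;
}
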
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 000000000000..1d885535214d
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,241 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39/*
40 * This file implements a getsockopt() call which copies a set of fixed
41 * sized structs into a user-specified buffer as a means of providing
42 * read-only information about RDS.
43 *
44 * For a given information source there are a given number of fixed sized
45 * structs at a given time. The structs are only copied if the user-specified
46 * buffer is big enough. The destination pages that make up the buffer
47 * are pinned for the duration of the copy.
48 *
49 * This gives us the following benefits:
50 *
51 * - simple implementation, no copy "position" across multiple calls
52 * - consistent snapshot of an info source
53 * - atomic copy works well with whatever locking info source has
54 * - one portable tool to get rds info across implementations
55 * - long-lived tool can get info without allocating
56 *
57 * at the following costs:
58 *
59 * - info source copy must be pinned, may be "large"
60 */
61
62struct rds_info_iterator {
63 struct page **pages;
64 void *addr;
65 unsigned long offset;
66};
67
68static DEFINE_SPINLOCK(rds_info_lock);
69static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
70
71void rds_info_register_func(int optname, rds_info_func func)
72{
73 int offset = optname - RDS_INFO_FIRST;
74
75 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
76
77 spin_lock(&rds_info_lock);
78 BUG_ON(rds_info_funcs[offset] != NULL);
79 rds_info_funcs[offset] = func;
80 spin_unlock(&rds_info_lock);
81}
82
83void rds_info_deregister_func(int optname, rds_info_func func)
84{
85 int offset = optname - RDS_INFO_FIRST;
86
87 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
88
89 spin_lock(&rds_info_lock);
90 BUG_ON(rds_info_funcs[offset] != func);
91 rds_info_funcs[offset] = NULL;
92 spin_unlock(&rds_info_lock);
93}
94
95/*
96 * Typically we hold an atomic kmap across multiple rds_info_copy() calls
97 * because the kmap is so expensive. This must be called before using blocking
98 * operations while holding the mapping and as the iterator is torn down.
99 */
100void rds_info_iter_unmap(struct rds_info_iterator *iter)
101{
102 if (iter->addr != NULL) {
103 kunmap_atomic(iter->addr, KM_USER0);
104 iter->addr = NULL;
105 }
106}
107
108/*
109 * get_user_pages() called flush_dcache_page() on the pages for us.
110 */
111void rds_info_copy(struct rds_info_iterator *iter, void *data,
112 unsigned long bytes)
113{
114 unsigned long this;
115
116 while (bytes) {
117 if (iter->addr == NULL)
118 iter->addr = kmap_atomic(*iter->pages, KM_USER0);
119
120 this = min(bytes, PAGE_SIZE - iter->offset);
121
122 rdsdebug("page %p addr %p offset %lu this %lu data %p "
123 "bytes %lu\n", *iter->pages, iter->addr,
124 iter->offset, this, data, bytes);
125
126 memcpy(iter->addr + iter->offset, data, this);
127
128 data += this;
129 bytes -= this;
130 iter->offset += this;
131
132 if (iter->offset == PAGE_SIZE) {
133 kunmap_atomic(iter->addr, KM_USER0);
134 iter->addr = NULL;
135 iter->offset = 0;
136 iter->pages++;
137 }
138 }
139}
140
141/*
142 * @optval points to the userspace buffer that the information snapshot
143 * will be copied into.
144 *
145 * @optlen on input is the size of the buffer in userspace. @optlen
146 * on output is the size of the requested snapshot in bytes.
147 *
148 * This function returns -errno if there is a failure, particularly -ENOSPC
149 * if the given userspace buffer was not large enough to fit the snapshot.
150 * On success it returns the positive number of bytes of each array element
151 * in the snapshot.
152 */
153int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
154 int __user *optlen)
155{
156 struct rds_info_iterator iter;
157 struct rds_info_lengths lens;
158 unsigned long nr_pages = 0;
159 unsigned long start;
160 unsigned long i;
161 rds_info_func func;
162 struct page **pages = NULL;
163 int ret;
164 int len;
165 int total;
166
167 if (get_user(len, optlen)) {
168 ret = -EFAULT;
169 goto out;
170 }
171
172 /* check for all kinds of wrapping and the like */
173 start = (unsigned long)optval;
174 if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
175 ret = -EINVAL;
176 goto out;
177 }
178
179 /* a 0 len call is just trying to probe its length */
180 if (len == 0)
181 goto call_func;
182
183 nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
184 >> PAGE_SHIFT;
185
186 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
187 if (pages == NULL) {
188 ret = -ENOMEM;
189 goto out;
190 }
191 down_read(&current->mm->mmap_sem);
192 ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
193 pages, NULL);
194 up_read(&current->mm->mmap_sem);
195 if (ret != nr_pages) {
196 if (ret > 0)
197 nr_pages = ret;
198 else
199 nr_pages = 0;
200 ret = -EAGAIN; /* XXX ? */
201 goto out;
202 }
203
204 rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
205
206call_func:
207 func = rds_info_funcs[optname - RDS_INFO_FIRST];
208 if (func == NULL) {
209 ret = -ENOPROTOOPT;
210 goto out;
211 }
212
213 iter.pages = pages;
214 iter.addr = NULL;
215 iter.offset = start & (PAGE_SIZE - 1);
216
217 func(sock, len, &iter, &lens);
218 BUG_ON(lens.each == 0);
219
220 total = lens.nr * lens.each;
221
222 rds_info_iter_unmap(&iter);
223
224 if (total > len) {
225 len = total;
226 ret = -ENOSPC;
227 } else {
228 len = total;
229 ret = lens.each;
230 }
231
232 if (put_user(len, optlen))
233 ret = -EFAULT;
234
235out:
236 for (i = 0; pages != NULL && i < nr_pages; i++)
237 put_page(pages[i]);
238 kfree(pages);
239
240 return ret;
241}
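
From user space, rds_info_getsockopt() is driven as a probe-then-fetch sequence: a zero-length getsockopt() fails with ENOSPC and reports the required size through optlen, and the retry with a big enough buffer returns the per-element size. The sketch below shows only that calling convention; the PF_RDS/SOL_RDS/RDS_INFO_* constants are placeholders for the values in the RDS headers and should be treated as assumptions.

/* Probe-then-fetch against an RDS info source; constants are guesses. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>

#define PF_RDS_GUESS            21     /* assumption: RDS address family   */
#define SOL_RDS_GUESS           276    /* assumption: RDS sockopt level    */
#define RDS_INFO_COUNTERS_GUESS 10000  /* assumption: one RDS_INFO_* value */

int main(void)
{
	int fd = socket(PF_RDS_GUESS, SOCK_SEQPACKET, 0);
	socklen_t len = 0;
	void *buf = NULL;
	int each;

	if (fd < 0) { perror("socket"); return 1; }

	/* First call probes the snapshot size; -ENOSPC is expected. */
	if (getsockopt(fd, SOL_RDS_GUESS, RDS_INFO_COUNTERS_GUESS, NULL, &len) < 0
	    && errno != ENOSPC) {
		perror("probe");
		goto out;
	}

	buf = malloc(len ? len : 1);
	if (!buf)
		goto out;

	/* Second call copies the snapshot; a positive return value is the
	 * size of each array element in the snapshot. */
	each = getsockopt(fd, SOL_RDS_GUESS, RDS_INFO_COUNTERS_GUESS, buf, &len);
	if (each < 0)
		perror("fetch");
	else
		printf("%u bytes total, %u bytes per element\n",
		       (unsigned)len, (unsigned)each);
out:
	free(buf);
	close(fd);
	return 0;
}
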
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 000000000000..b6c052ca7d22
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
1#ifndef _RDS_INFO_H
2#define _RDS_INFO_H
3
4struct rds_info_lengths {
5 unsigned int nr;
6 unsigned int each;
7};
8
9struct rds_info_iterator;
10
11/*
12 * These functions must fill in the fields of @lens to reflect the size
13 * of the available info source. If the snapshot fits in @len then it
14 * should be copied using @iter. The caller will deduce if it was copied
15 * or not by comparing the lengths.
16 */
17typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
18 struct rds_info_iterator *iter,
19 struct rds_info_lengths *lens);
20
21void rds_info_register_func(int optname, rds_info_func func);
22void rds_info_deregister_func(int optname, rds_info_func func);
23int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
24 int __user *optlen);
25void rds_info_copy(struct rds_info_iterator *iter, void *data,
26 unsigned long bytes);
27void rds_info_iter_unmap(struct rds_info_iterator *iter);
28
29
30#endif
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 000000000000..1b56905c4c08
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,333 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/if.h>
36#include <linux/netdevice.h>
37#include <linux/inetdevice.h>
38#include <linux/if_arp.h>
39#include <linux/delay.h>
40
41#include "rds.h"
42#include "iw.h"
43
44unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
45unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
46
47module_param(fastreg_pool_size, int, 0444);
48MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
49module_param(fastreg_message_size, int, 0444);
50MODULE_PARM_DESC(fastreg_message_size, " Max size of an RDMA transfer (fastreg MRs)");
51
52struct list_head rds_iw_devices;
53
54DEFINE_SPINLOCK(iw_nodev_conns_lock);
55LIST_HEAD(iw_nodev_conns);
56
57void rds_iw_add_one(struct ib_device *device)
58{
59 struct rds_iw_device *rds_iwdev;
60 struct ib_device_attr *dev_attr;
61
62 /* Only handle iwarp devices */
63 if (device->node_type != RDMA_NODE_RNIC)
64 return;
65
66 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
67 if (!dev_attr)
68 return;
69
70 if (ib_query_device(device, dev_attr)) {
71 rdsdebug("Query device failed for %s\n", device->name);
72 goto free_attr;
73 }
74
75 rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
76 if (!rds_iwdev)
77 goto free_attr;
78
79 spin_lock_init(&rds_iwdev->spinlock);
80
81 rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
82 rds_iwdev->max_wrs = dev_attr->max_qp_wr;
83 rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
84
85 rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
86
87 rds_iwdev->dev = device;
88 rds_iwdev->pd = ib_alloc_pd(device);
89 if (IS_ERR(rds_iwdev->pd))
90 goto free_dev;
91
92 if (!rds_iwdev->dma_local_lkey) {
93 if (device->node_type != RDMA_NODE_RNIC) {
94 rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
95 IB_ACCESS_LOCAL_WRITE);
96 } else {
97 rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
98 IB_ACCESS_REMOTE_READ |
99 IB_ACCESS_REMOTE_WRITE |
100 IB_ACCESS_LOCAL_WRITE);
101 }
102 if (IS_ERR(rds_iwdev->mr))
103 goto err_pd;
104 } else
105 rds_iwdev->mr = NULL;
106
107 rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
108 if (IS_ERR(rds_iwdev->mr_pool)) {
109 rds_iwdev->mr_pool = NULL;
110 goto err_mr;
111 }
112
113 INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
114 INIT_LIST_HEAD(&rds_iwdev->conn_list);
115 list_add_tail(&rds_iwdev->list, &rds_iw_devices);
116
117 ib_set_client_data(device, &rds_iw_client, rds_iwdev);
118
119 goto free_attr;
120
121err_mr:
122 if (rds_iwdev->mr)
123 ib_dereg_mr(rds_iwdev->mr);
124err_pd:
125 ib_dealloc_pd(rds_iwdev->pd);
126free_dev:
127 kfree(rds_iwdev);
128free_attr:
129 kfree(dev_attr);
130}
131
132void rds_iw_remove_one(struct ib_device *device)
133{
134 struct rds_iw_device *rds_iwdev;
135 struct rds_iw_cm_id *i_cm_id, *next;
136
137 rds_iwdev = ib_get_client_data(device, &rds_iw_client);
138 if (!rds_iwdev)
139 return;
140
141 spin_lock_irq(&rds_iwdev->spinlock);
142 list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
143 list_del(&i_cm_id->list);
144 kfree(i_cm_id);
145 }
146 spin_unlock_irq(&rds_iwdev->spinlock);
147
148 rds_iw_remove_conns(rds_iwdev);
149
150 if (rds_iwdev->mr_pool)
151 rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
152
153 if (rds_iwdev->mr)
154 ib_dereg_mr(rds_iwdev->mr);
155
156 while (ib_dealloc_pd(rds_iwdev->pd)) {
157 rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
158 msleep(1);
159 }
160
161 list_del(&rds_iwdev->list);
162 kfree(rds_iwdev);
163}
164
165struct ib_client rds_iw_client = {
166 .name = "rds_iw",
167 .add = rds_iw_add_one,
168 .remove = rds_iw_remove_one
169};
170
171static int rds_iw_conn_info_visitor(struct rds_connection *conn,
172 void *buffer)
173{
174 struct rds_info_rdma_connection *iinfo = buffer;
175 struct rds_iw_connection *ic;
176
177 /* We will only ever look at iWARP transports */
178 if (conn->c_trans != &rds_iw_transport)
179 return 0;
180
181 iinfo->src_addr = conn->c_laddr;
182 iinfo->dst_addr = conn->c_faddr;
183
184 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
185 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
186 if (rds_conn_state(conn) == RDS_CONN_UP) {
187 struct rds_iw_device *rds_iwdev;
188 struct rdma_dev_addr *dev_addr;
189
190 ic = conn->c_transport_data;
191 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
192
193 ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
194 ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
195
196 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
197 iinfo->max_send_wr = ic->i_send_ring.w_nr;
198 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
199 iinfo->max_send_sge = rds_iwdev->max_sge;
200 rds_iw_get_mr_info(rds_iwdev, iinfo);
201 }
202 return 1;
203}
204
205static void rds_iw_ic_info(struct socket *sock, unsigned int len,
206 struct rds_info_iterator *iter,
207 struct rds_info_lengths *lens)
208{
209 rds_for_each_conn_info(sock, len, iter, lens,
210 rds_iw_conn_info_visitor,
211 sizeof(struct rds_info_rdma_connection));
212}
213
214
215/*
216 * Early RDS/IB was built to only bind to an address if there is an IPoIB
217 * device with that address set.
218 *
219 * If it were me, I'd advocate for something more flexible. Sending and
220 * receiving should be device-agnostic. Transports would try and maintain
221 * connections between peers who have messages queued. Userspace would be
222 * allowed to influence which paths have priority. We could call userspace
223 * asserting this policy "routing".
224 */
225static int rds_iw_laddr_check(__be32 addr)
226{
227 int ret;
228 struct rdma_cm_id *cm_id;
229 struct sockaddr_in sin;
230
231 /* Create a CMA ID and try to bind it. This catches both
232 * IB and iWARP capable NICs.
233 */
234 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
235 if (!cm_id)
236 return -EADDRNOTAVAIL;
237
238 memset(&sin, 0, sizeof(sin));
239 sin.sin_family = AF_INET;
240 sin.sin_addr.s_addr = addr;
241
242 /* rdma_bind_addr will only succeed for IB & iWARP devices */
243 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
244 /* due to this, we will claim to support IB devices unless we
245 check node_type. */
246 if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
247 ret = -EADDRNOTAVAIL;
248
249 rdsdebug("addr %pI4 ret %d node type %d\n",
250 &addr, ret,
251 cm_id->device ? cm_id->device->node_type : -1);
252
253 rdma_destroy_id(cm_id);
254
255 return ret;
256}
257
258void rds_iw_exit(void)
259{
260 rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
261 rds_iw_remove_nodev_conns();
262 ib_unregister_client(&rds_iw_client);
263 rds_iw_sysctl_exit();
264 rds_iw_recv_exit();
265 rds_trans_unregister(&rds_iw_transport);
266}
267
268struct rds_transport rds_iw_transport = {
269 .laddr_check = rds_iw_laddr_check,
270 .xmit_complete = rds_iw_xmit_complete,
271 .xmit = rds_iw_xmit,
272 .xmit_cong_map = NULL,
273 .xmit_rdma = rds_iw_xmit_rdma,
274 .recv = rds_iw_recv,
275 .conn_alloc = rds_iw_conn_alloc,
276 .conn_free = rds_iw_conn_free,
277 .conn_connect = rds_iw_conn_connect,
278 .conn_shutdown = rds_iw_conn_shutdown,
279 .inc_copy_to_user = rds_iw_inc_copy_to_user,
280 .inc_purge = rds_iw_inc_purge,
281 .inc_free = rds_iw_inc_free,
282 .cm_initiate_connect = rds_iw_cm_initiate_connect,
283 .cm_handle_connect = rds_iw_cm_handle_connect,
284 .cm_connect_complete = rds_iw_cm_connect_complete,
285 .stats_info_copy = rds_iw_stats_info_copy,
286 .exit = rds_iw_exit,
287 .get_mr = rds_iw_get_mr,
288 .sync_mr = rds_iw_sync_mr,
289 .free_mr = rds_iw_free_mr,
290 .flush_mrs = rds_iw_flush_mrs,
291 .t_owner = THIS_MODULE,
292 .t_name = "iwarp",
293 .t_prefer_loopback = 1,
294};
295
296int __init rds_iw_init(void)
297{
298 int ret;
299
300 INIT_LIST_HEAD(&rds_iw_devices);
301
302 ret = ib_register_client(&rds_iw_client);
303 if (ret)
304 goto out;
305
306 ret = rds_iw_sysctl_init();
307 if (ret)
308 goto out_ibreg;
309
310 ret = rds_iw_recv_init();
311 if (ret)
312 goto out_sysctl;
313
314 ret = rds_trans_register(&rds_iw_transport);
315 if (ret)
316 goto out_recv;
317
318 rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
319
320 goto out;
321
322out_recv:
323 rds_iw_recv_exit();
324out_sysctl:
325 rds_iw_sysctl_exit();
326out_ibreg:
327 ib_unregister_client(&rds_iw_client);
328out:
329 return ret;
330}
331
332MODULE_LICENSE("GPL");
333
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 000000000000..0ddda34f2a1c
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,395 @@
1#ifndef _RDS_IW_H
2#define _RDS_IW_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7#include "rdma_transport.h"
8
9#define RDS_FASTREG_SIZE 20
10#define RDS_FASTREG_POOL_SIZE 2048
11
12#define RDS_IW_MAX_SGE 8
13#define RDS_IW_RECV_SGE 2
14
15#define RDS_IW_DEFAULT_RECV_WR 1024
16#define RDS_IW_DEFAULT_SEND_WR 256
17
18#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
19
20extern struct list_head rds_iw_devices;
21
22/*
23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
24 * try to minimize the amount of memory tied up in both the device and
25 * socket receive queues.
26 */
27/* page offset of the final full frag that fits in the page */
28#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
29struct rds_page_frag {
30 struct list_head f_item;
31 struct page *f_page;
32 unsigned long f_offset;
33 dma_addr_t f_mapped;
34};
35
36struct rds_iw_incoming {
37 struct list_head ii_frags;
38 struct rds_incoming ii_inc;
39};
40
41struct rds_iw_connect_private {
42 /* Add new fields at the end, and don't permute existing fields. */
43 __be32 dp_saddr;
44 __be32 dp_daddr;
45 u8 dp_protocol_major;
46 u8 dp_protocol_minor;
47 __be16 dp_protocol_minor_mask; /* bitmask */
48 __be32 dp_reserved1;
49 __be64 dp_ack_seq;
50 __be32 dp_credit; /* non-zero enables flow ctl */
51};
52
53struct rds_iw_scatterlist {
54 struct scatterlist *list;
55 unsigned int len;
56 int dma_len;
57 unsigned int dma_npages;
58 unsigned int bytes;
59};
60
61struct rds_iw_mapping {
62 spinlock_t m_lock; /* protect the mapping struct */
63 struct list_head m_list;
64 struct rds_iw_mr *m_mr;
65 uint32_t m_rkey;
66 struct rds_iw_scatterlist m_sg;
67};
68
69struct rds_iw_send_work {
70 struct rds_message *s_rm;
71
72 /* We should really put these into a union: */
73 struct rds_rdma_op *s_op;
74 struct rds_iw_mapping *s_mapping;
75 struct ib_mr *s_mr;
76 struct ib_fast_reg_page_list *s_page_list;
77 unsigned char s_remap_count;
78
79 struct ib_send_wr s_wr;
80 struct ib_sge s_sge[RDS_IW_MAX_SGE];
81 unsigned long s_queued;
82};
83
84struct rds_iw_recv_work {
85 struct rds_iw_incoming *r_iwinc;
86 struct rds_page_frag *r_frag;
87 struct ib_recv_wr r_wr;
88 struct ib_sge r_sge[2];
89};
90
91struct rds_iw_work_ring {
92 u32 w_nr;
93 u32 w_alloc_ptr;
94 u32 w_alloc_ctr;
95 u32 w_free_ptr;
96 atomic_t w_free_ctr;
97};
98
99struct rds_iw_device;
100
101struct rds_iw_connection {
102
103 struct list_head iw_node;
104 struct rds_iw_device *rds_iwdev;
105 struct rds_connection *conn;
106
107 /* alphabet soup, IBTA style */
108 struct rdma_cm_id *i_cm_id;
109 struct ib_pd *i_pd;
110 struct ib_mr *i_mr;
111 struct ib_cq *i_send_cq;
112 struct ib_cq *i_recv_cq;
113
114 /* tx */
115 struct rds_iw_work_ring i_send_ring;
116 struct rds_message *i_rm;
117 struct rds_header *i_send_hdrs;
118 u64 i_send_hdrs_dma;
119 struct rds_iw_send_work *i_sends;
120
121 /* rx */
122 struct mutex i_recv_mutex;
123 struct rds_iw_work_ring i_recv_ring;
124 struct rds_iw_incoming *i_iwinc;
125 u32 i_recv_data_rem;
126 struct rds_header *i_recv_hdrs;
127 u64 i_recv_hdrs_dma;
128 struct rds_iw_recv_work *i_recvs;
129 struct rds_page_frag i_frag;
130 u64 i_ack_recv; /* last ACK received */
131
132 /* sending acks */
133 unsigned long i_ack_flags;
134 u64 i_ack_next; /* next ACK to send */
135 struct rds_header *i_ack;
136 struct ib_send_wr i_ack_wr;
137 struct ib_sge i_ack_sge;
138 u64 i_ack_dma;
139 unsigned long i_ack_queued;
140
141 /* Flow control related information
142 *
143 * Our algorithm uses a pair of variables that we need to access
144 * atomically - one for the send credits, and one for the posted
145 * recv credits we need to transfer to the remote.
146 * Rather than protect them using a slow spinlock, we put both into
147 * a single atomic_t and update it using cmpxchg
148 */
149 atomic_t i_credits;
150
151 /* Protocol version specific information */
152 unsigned int i_flowctl:1; /* enable/disable flow ctl */
153 unsigned int i_dma_local_lkey:1;
154 unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
155 /* Batched completions */
156 unsigned int i_unsignaled_wrs;
157 long i_unsignaled_bytes;
158};
159
160/* This assumes that atomic_t is at least 32 bits */
161#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
162#define IB_GET_POST_CREDITS(v) ((v) >> 16)
163#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
164#define IB_SET_POST_CREDITS(v) ((v) << 16)
165
166struct rds_iw_cm_id {
167 struct list_head list;
168 struct rdma_cm_id *cm_id;
169};
170
171struct rds_iw_device {
172 struct list_head list;
173 struct list_head cm_id_list;
174 struct list_head conn_list;
175 struct ib_device *dev;
176 struct ib_pd *pd;
177 struct ib_mr *mr;
178 struct rds_iw_mr_pool *mr_pool;
179 int page_shift;
180 int max_sge;
181 unsigned int max_wrs;
182 unsigned int dma_local_lkey:1;
183 spinlock_t spinlock; /* protect the above */
184};
185
186/* bits for i_ack_flags */
187#define IB_ACK_IN_FLIGHT 0
188#define IB_ACK_REQUESTED 1
189
190/* Magic WR_ID for ACKs */
191#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
192#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
193#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
194
195struct rds_iw_statistics {
196 uint64_t s_iw_connect_raced;
197 uint64_t s_iw_listen_closed_stale;
198 uint64_t s_iw_tx_cq_call;
199 uint64_t s_iw_tx_cq_event;
200 uint64_t s_iw_tx_ring_full;
201 uint64_t s_iw_tx_throttle;
202 uint64_t s_iw_tx_sg_mapping_failure;
203 uint64_t s_iw_tx_stalled;
204 uint64_t s_iw_tx_credit_updates;
205 uint64_t s_iw_rx_cq_call;
206 uint64_t s_iw_rx_cq_event;
207 uint64_t s_iw_rx_ring_empty;
208 uint64_t s_iw_rx_refill_from_cq;
209 uint64_t s_iw_rx_refill_from_thread;
210 uint64_t s_iw_rx_alloc_limit;
211 uint64_t s_iw_rx_credit_updates;
212 uint64_t s_iw_ack_sent;
213 uint64_t s_iw_ack_send_failure;
214 uint64_t s_iw_ack_send_delayed;
215 uint64_t s_iw_ack_send_piggybacked;
216 uint64_t s_iw_ack_received;
217 uint64_t s_iw_rdma_mr_alloc;
218 uint64_t s_iw_rdma_mr_free;
219 uint64_t s_iw_rdma_mr_used;
220 uint64_t s_iw_rdma_mr_pool_flush;
221 uint64_t s_iw_rdma_mr_pool_wait;
222 uint64_t s_iw_rdma_mr_pool_depleted;
223};
224
225extern struct workqueue_struct *rds_iw_wq;
226
227/*
228 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
229 * doesn't define it.
230 */
231static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
232 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
233{
234 unsigned int i;
235
236 for (i = 0; i < sg_dma_len; ++i) {
237 ib_dma_sync_single_for_cpu(dev,
238 ib_sg_dma_address(dev, &sg[i]),
239 ib_sg_dma_len(dev, &sg[i]),
240 direction);
241 }
242}
243#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
244
245static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
246 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
247{
248 unsigned int i;
249
250 for (i = 0; i < sg_dma_len; ++i) {
251 ib_dma_sync_single_for_device(dev,
252 ib_sg_dma_address(dev, &sg[i]),
253 ib_sg_dma_len(dev, &sg[i]),
254 direction);
255 }
256}
257#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
258
259static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
260{
261 return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
262}
263
264/* ib.c */
265extern struct rds_transport rds_iw_transport;
266extern void rds_iw_add_one(struct ib_device *device);
267extern void rds_iw_remove_one(struct ib_device *device);
268extern struct ib_client rds_iw_client;
269
270extern unsigned int fastreg_pool_size;
271extern unsigned int fastreg_message_size;
272
273extern spinlock_t iw_nodev_conns_lock;
274extern struct list_head iw_nodev_conns;
275
276/* ib_cm.c */
277int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
278void rds_iw_conn_free(void *arg);
279int rds_iw_conn_connect(struct rds_connection *conn);
280void rds_iw_conn_shutdown(struct rds_connection *conn);
281void rds_iw_state_change(struct sock *sk);
282int __init rds_iw_listen_init(void);
283void rds_iw_listen_stop(void);
284void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
285int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
286 struct rdma_cm_event *event);
287int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
288void rds_iw_cm_connect_complete(struct rds_connection *conn,
289 struct rdma_cm_event *event);
290
291
292#define rds_iw_conn_error(conn, fmt...) \
293 __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
294
295/* ib_rdma.c */
296int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
297int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
298void rds_iw_remove_nodev_conns(void);
299void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev);
300struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
301void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
302void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
303void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
304 struct rds_sock *rs, u32 *key_ret);
305void rds_iw_sync_mr(void *trans_private, int dir);
306void rds_iw_free_mr(void *trans_private, int invalidate);
307void rds_iw_flush_mrs(void);
308void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
309
310/* ib_recv.c */
311int __init rds_iw_recv_init(void);
312void rds_iw_recv_exit(void);
313int rds_iw_recv(struct rds_connection *conn);
314int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
315 gfp_t page_gfp, int prefill);
316void rds_iw_inc_purge(struct rds_incoming *inc);
317void rds_iw_inc_free(struct rds_incoming *inc);
318int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
319 size_t size);
320void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
321void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
322void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
323void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
324void rds_iw_attempt_ack(struct rds_iw_connection *ic);
325void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
326u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
327
328/* ib_ring.c */
329void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
330void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
331u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
332void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
333void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
334int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
335int rds_iw_ring_low(struct rds_iw_work_ring *ring);
336u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
337u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
338extern wait_queue_head_t rds_iw_ring_empty_wait;
339
340/* ib_send.c */
341void rds_iw_xmit_complete(struct rds_connection *conn);
342int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
343 unsigned int hdr_off, unsigned int sg, unsigned int off);
344void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
345void rds_iw_send_init_ring(struct rds_iw_connection *ic);
346void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
347int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
348void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
349void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
350int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
351 u32 *adv_credits, int need_posted);
352
353/* ib_stats.c */
354DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
355#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
356unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
357 unsigned int avail);
358
359/* ib_sysctl.c */
360int __init rds_iw_sysctl_init(void);
361void rds_iw_sysctl_exit(void);
362extern unsigned long rds_iw_sysctl_max_send_wr;
363extern unsigned long rds_iw_sysctl_max_recv_wr;
364extern unsigned long rds_iw_sysctl_max_unsig_wrs;
365extern unsigned long rds_iw_sysctl_max_unsig_bytes;
366extern unsigned long rds_iw_sysctl_max_recv_allocation;
367extern unsigned int rds_iw_sysctl_flow_control;
368extern ctl_table rds_iw_sysctl_table[];
369
370/*
371 * Helper functions for getting/setting the header and data SGEs in
372 * RDS packets (not RDMA)
373 */
374static inline struct ib_sge *
375rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
376{
377 return &sge[0];
378}
379
380static inline struct ib_sge *
381rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
382{
383 return &sge[1];
384}
385
386static inline void rds_iw_set_64bit(u64 *ptr, u64 val)
387{
388#if BITS_PER_LONG == 64
389 *ptr = val;
390#else
391 set_64bit(ptr, val);
392#endif
393}
394
395#endif
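
The flow-control comment in struct rds_iw_connection and the IB_{GET,SET}_{SEND,POST}_CREDITS macros above pack the send credits and the posted-receive credits into one 32-bit value that is updated with cmpxchg. A stand-alone sketch of that encoding, using C11 atomics in place of the kernel's atomic_t:

/* Credit packing sketch: send credits in the low 16 bits, posted-receive
 * credits in the high 16 bits, both updated with a compare-and-swap. */
#include <stdatomic.h>
#include <stdio.h>

#define GET_SEND_CREDITS(v) ((v) & 0xffff)
#define GET_POST_CREDITS(v) ((v) >> 16)
#define SET_SEND_CREDITS(v) ((v) & 0xffff)
#define SET_POST_CREDITS(v) ((v) << 16)

static atomic_uint credits;

/* Atomically take one send credit; returns how many were left before. */
static unsigned int take_send_credit(void)
{
	unsigned int old = atomic_load(&credits), newval;

	do {
		if (GET_SEND_CREDITS(old) == 0)
			return 0;               /* throttled: no credits left */
		newval = old - SET_SEND_CREDITS(1);
	} while (!atomic_compare_exchange_weak(&credits, &old, newval));

	return GET_SEND_CREDITS(old);
}

int main(void)
{
	atomic_store(&credits, SET_SEND_CREDITS(3) | SET_POST_CREDITS(8));
	while (take_send_credit())
		printf("send credits now %u, post credits %u\n",
		       GET_SEND_CREDITS(atomic_load(&credits)),
		       GET_POST_CREDITS(atomic_load(&credits)));
	return 0;
}
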
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 000000000000..57ecb3d4b8a5
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,750 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/vmalloc.h>
36
37#include "rds.h"
38#include "iw.h"
39
40/*
41 * Set the selected protocol version
42 */
43static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
44{
45 conn->c_version = version;
46}
47
48/*
49 * Set up flow control
50 */
51static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
52{
53 struct rds_iw_connection *ic = conn->c_transport_data;
54
55 if (rds_iw_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */
57 ic->i_flowctl = 1;
58 rds_iw_send_add_credits(conn, credits);
59 } else {
60 ic->i_flowctl = 0;
61 }
62}
63
64/*
65 * Connection established.
66 * We get here for both outgoing and incoming connection.
67 */
68void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
69{
70 const struct rds_iw_connect_private *dp = NULL;
71 struct rds_iw_connection *ic = conn->c_transport_data;
72 struct rds_iw_device *rds_iwdev;
73 int err;
74
75 if (event->param.conn.private_data_len) {
76 dp = event->param.conn.private_data;
77
78 rds_iw_set_protocol(conn,
79 RDS_PROTOCOL(dp->dp_protocol_major,
80 dp->dp_protocol_minor));
81 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
82 }
83
84 /* update ib_device with this local ipaddr & conn */
85 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
86 err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
87 if (err)
88 printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
89 err = rds_iw_add_conn(rds_iwdev, conn);
90 if (err)
91 printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err);
92
93 /* If the peer gave us the last packet it saw, process this as if
94 * we had received a regular ACK. */
95 if (dp && dp->dp_ack_seq)
96 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
97
98 printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
99 &conn->c_laddr, &conn->c_faddr,
100 RDS_PROTOCOL_MAJOR(conn->c_version),
101 RDS_PROTOCOL_MINOR(conn->c_version),
102 ic->i_flowctl ? ", flow control" : "");
103
104 rds_connect_complete(conn);
105}
106
107static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
108 struct rdma_conn_param *conn_param,
109 struct rds_iw_connect_private *dp,
110 u32 protocol_version)
111{
112 struct rds_iw_connection *ic = conn->c_transport_data;
113
114 memset(conn_param, 0, sizeof(struct rdma_conn_param));
115 /* XXX tune these? */
116 conn_param->responder_resources = 1;
117 conn_param->initiator_depth = 1;
118
119 if (dp) {
120 memset(dp, 0, sizeof(*dp));
121 dp->dp_saddr = conn->c_laddr;
122 dp->dp_daddr = conn->c_faddr;
123 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
124 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
125 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
126 dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
127
128 /* Advertise flow control */
129 if (ic->i_flowctl) {
130 unsigned int credits;
131
132 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
133 dp->dp_credit = cpu_to_be32(credits);
134 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
135 }
136
137 conn_param->private_data = dp;
138 conn_param->private_data_len = sizeof(*dp);
139 }
140}
141
142static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
143{
144 rdsdebug("event %u data %p\n", event->event, data);
145}
146
147static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
148{
149 struct rds_connection *conn = data;
150 struct rds_iw_connection *ic = conn->c_transport_data;
151
152 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
153
154 switch (event->event) {
155 case IB_EVENT_COMM_EST:
156 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
157 break;
158 case IB_EVENT_QP_REQ_ERR:
159 case IB_EVENT_QP_FATAL:
160 default:
161 rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
162 event->event, &conn->c_laddr,
163 &conn->c_faddr);
164 break;
165 }
166}
167
168/*
169 * Create a QP
170 */
171static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
172 struct rds_iw_device *rds_iwdev,
173 struct rds_iw_work_ring *send_ring,
174 void (*send_cq_handler)(struct ib_cq *, void *),
175 struct rds_iw_work_ring *recv_ring,
176 void (*recv_cq_handler)(struct ib_cq *, void *),
177 void *context)
178{
179 struct ib_device *dev = rds_iwdev->dev;
180 unsigned int send_size, recv_size;
181 int ret;
182
183 /* The offset of 1 is to accommodate the additional ACK WR. */
184 send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
185 recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
186 rds_iw_ring_resize(send_ring, send_size - 1);
187 rds_iw_ring_resize(recv_ring, recv_size - 1);
188
189 memset(attr, 0, sizeof(*attr));
190 attr->event_handler = rds_iw_qp_event_handler;
191 attr->qp_context = context;
192 attr->cap.max_send_wr = send_size;
193 attr->cap.max_recv_wr = recv_size;
194 attr->cap.max_send_sge = rds_iwdev->max_sge;
195 attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
196 attr->sq_sig_type = IB_SIGNAL_REQ_WR;
197 attr->qp_type = IB_QPT_RC;
198
199 attr->send_cq = ib_create_cq(dev, send_cq_handler,
200 rds_iw_cq_event_handler,
201 context, send_size, 0);
202 if (IS_ERR(attr->send_cq)) {
203 ret = PTR_ERR(attr->send_cq);
204 attr->send_cq = NULL;
205 rdsdebug("ib_create_cq send failed: %d\n", ret);
206 goto out;
207 }
208
209 attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
210 rds_iw_cq_event_handler,
211 context, recv_size, 0);
212 if (IS_ERR(attr->recv_cq)) {
213 ret = PTR_ERR(attr->recv_cq);
214 attr->recv_cq = NULL;
215 rdsdebug("ib_create_cq send failed: %d\n", ret);
216 goto out;
217 }
218
219 ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
220 if (ret) {
221 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
222 goto out;
223 }
224
225 ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
226 if (ret) {
227 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
228 goto out;
229 }
230
231out:
232 if (ret) {
233 if (attr->send_cq)
234 ib_destroy_cq(attr->send_cq);
235 if (attr->recv_cq)
236 ib_destroy_cq(attr->recv_cq);
237 }
238 return ret;
239}
240
241/*
242 * This needs to be very careful to not leave IS_ERR pointers around for
243 * cleanup to trip over.
244 */
245static int rds_iw_setup_qp(struct rds_connection *conn)
246{
247 struct rds_iw_connection *ic = conn->c_transport_data;
248 struct ib_device *dev = ic->i_cm_id->device;
249 struct ib_qp_init_attr attr;
250 struct rds_iw_device *rds_iwdev;
251 int ret;
252
253 /* rds_iw_add_one creates a rds_iw_device object per IB device,
254 * and allocates a protection domain, memory range and MR pool
255 * for each. If that fails for any reason, it will not register
256 * the rds_iwdev at all.
257 */
258 rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
259 if (rds_iwdev == NULL) {
260 if (printk_ratelimit())
261 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
262 dev->name);
263 return -EOPNOTSUPP;
264 }
265
266 /* Protection domain and memory range */
267 ic->i_pd = rds_iwdev->pd;
268 ic->i_mr = rds_iwdev->mr;
269
270 ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
271 &ic->i_send_ring, rds_iw_send_cq_comp_handler,
272 &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
273 conn);
274 if (ret < 0)
275 goto out;
276
277 ic->i_send_cq = attr.send_cq;
278 ic->i_recv_cq = attr.recv_cq;
279
280 /*
281 * XXX this can fail if max_*_wr is too large? Are we supposed
282 * to back off until we get a value that the hardware can support?
283 */
284 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
285 if (ret) {
286 rdsdebug("rdma_create_qp failed: %d\n", ret);
287 goto out;
288 }
289
290 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
291 ic->i_send_ring.w_nr *
292 sizeof(struct rds_header),
293 &ic->i_send_hdrs_dma, GFP_KERNEL);
294 if (ic->i_send_hdrs == NULL) {
295 ret = -ENOMEM;
296 rdsdebug("ib_dma_alloc_coherent send failed\n");
297 goto out;
298 }
299
300 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
301 ic->i_recv_ring.w_nr *
302 sizeof(struct rds_header),
303 &ic->i_recv_hdrs_dma, GFP_KERNEL);
304 if (ic->i_recv_hdrs == NULL) {
305 ret = -ENOMEM;
306 rdsdebug("ib_dma_alloc_coherent recv failed\n");
307 goto out;
308 }
309
310 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
311 &ic->i_ack_dma, GFP_KERNEL);
312 if (ic->i_ack == NULL) {
313 ret = -ENOMEM;
314 rdsdebug("ib_dma_alloc_coherent ack failed\n");
315 goto out;
316 }
317
318 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
319 if (ic->i_sends == NULL) {
320 ret = -ENOMEM;
321 rdsdebug("send allocation failed\n");
322 goto out;
323 }
324 rds_iw_send_init_ring(ic);
325
326 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
327 if (ic->i_recvs == NULL) {
328 ret = -ENOMEM;
329 rdsdebug("recv allocation failed\n");
330 goto out;
331 }
332
333 rds_iw_recv_init_ring(ic);
334 rds_iw_recv_init_ack(ic);
335
336 /* Post receive buffers - as a side effect, this will update
337 * the posted credit count. */
338 rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
339
340 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
341 ic->i_send_cq, ic->i_recv_cq);
342
343out:
344 return ret;
345}
346
347static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
348{
349 u16 common;
350 u32 version = 0;
351
352 /* rdma_cm private data is odd - when there is any private data in the
353 * request, we will be given a pretty large buffer without telling us the
354 * original size. The only way to tell the difference is by looking at
355 * the contents, which are initialized to zero.
356 * If the protocol version fields aren't set, this is a connection attempt
357 * from an older version. This could be 3.0 or 2.0 - we can't tell.
358 * We really should have changed this for OFED 1.3 :-( */
359 if (dp->dp_protocol_major == 0)
360 return RDS_PROTOCOL_3_0;
361
362 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
363 if (dp->dp_protocol_major == 3 && common) {
364 version = RDS_PROTOCOL_3_0;
365 while ((common >>= 1) != 0)
366 version++;
367 } else if (printk_ratelimit()) {
368 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
369 "incompatible protocol version %u.%u\n",
370 &dp->dp_saddr,
371 dp->dp_protocol_major,
372 dp->dp_protocol_minor);
373 }
374 return version;
375}
376
377int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
378 struct rdma_cm_event *event)
379{
380 const struct rds_iw_connect_private *dp = event->param.conn.private_data;
381 struct rds_iw_connect_private dp_rep;
382 struct rds_connection *conn = NULL;
383 struct rds_iw_connection *ic = NULL;
384 struct rdma_conn_param conn_param;
385 struct rds_iw_device *rds_iwdev;
386 u32 version;
387 int err, destroy = 1;
388
389 /* Check whether the remote protocol version matches ours. */
390 version = rds_iw_protocol_compatible(dp);
391 if (!version)
392 goto out;
393
394 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
395 &dp->dp_saddr, &dp->dp_daddr,
396 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
397
398 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
399 GFP_KERNEL);
400 if (IS_ERR(conn)) {
401 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
402 conn = NULL;
403 goto out;
404 }
405
406 /*
407 * The connection request may occur while the
408 * previous connection exists, e.g. in case of failover.
409 * But as connections may be initiated simultaneously
410 * by both hosts, we have a random backoff mechanism -
411 * see the comment above rds_queue_reconnect()
412 */
413 mutex_lock(&conn->c_cm_lock);
414 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
415 if (rds_conn_state(conn) == RDS_CONN_UP) {
416 rdsdebug("incoming connect while connecting\n");
417 rds_conn_drop(conn);
418 rds_iw_stats_inc(s_iw_listen_closed_stale);
419 } else
420 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
421 /* Wait and see - our connect may still be succeeding */
422 rds_iw_stats_inc(s_iw_connect_raced);
423 }
424 mutex_unlock(&conn->c_cm_lock);
425 goto out;
426 }
427
428 ic = conn->c_transport_data;
429
430 rds_iw_set_protocol(conn, version);
431 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
432
433 /* If the peer gave us the last packet it saw, process this as if
434 * we had received a regular ACK. */
435 if (dp->dp_ack_seq)
436 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
437
438 BUG_ON(cm_id->context);
439 BUG_ON(ic->i_cm_id);
440
441 ic->i_cm_id = cm_id;
442 cm_id->context = conn;
443
444 rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
445 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
446
447 /* We got halfway through setting up the ib_connection; if we
448 * fail now, we have to take the long route out of this mess. */
449 destroy = 0;
450
451 err = rds_iw_setup_qp(conn);
452 if (err) {
453 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
454 goto out;
455 }
456
457 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
458
459 /* rdma_accept() calls rdma_reject() internally if it fails */
460 err = rdma_accept(cm_id, &conn_param);
461 mutex_unlock(&conn->c_cm_lock);
462 if (err) {
463 rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
464 goto out;
465 }
466
467 return 0;
468
469out:
470 rdma_reject(cm_id, NULL, 0);
471 return destroy;
472}
473
474
475int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
476{
477 struct rds_connection *conn = cm_id->context;
478 struct rds_iw_connection *ic = conn->c_transport_data;
479 struct rdma_conn_param conn_param;
480 struct rds_iw_connect_private dp;
481 int ret;
482
483 /* If the peer doesn't do protocol negotiation, we must
484 * default to RDSv3.0 */
485 rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
486 ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
487
488 ret = rds_iw_setup_qp(conn);
489 if (ret) {
490 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
491 goto out;
492 }
493
494 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
495
496 ret = rdma_connect(cm_id, &conn_param);
497 if (ret)
498 rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
499
500out:
501 /* Beware - returning non-zero tells the rdma_cm to destroy
502 * the cm_id. We should certainly not do it as long as we still
503 * "own" the cm_id. */
504 if (ret) {
505 struct rds_iw_connection *ic = conn->c_transport_data;
506
507 if (ic->i_cm_id == cm_id)
508 ret = 0;
509 }
510 return ret;
511}
512
513int rds_iw_conn_connect(struct rds_connection *conn)
514{
515 struct rds_iw_connection *ic = conn->c_transport_data;
516 struct rds_iw_device *rds_iwdev;
517 struct sockaddr_in src, dest;
518 int ret;
519
520 /* XXX I wonder what effect the port space has */
521 /* delegate cm event handler to rdma_transport */
522 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
523 RDMA_PS_TCP);
524 if (IS_ERR(ic->i_cm_id)) {
525 ret = PTR_ERR(ic->i_cm_id);
526 ic->i_cm_id = NULL;
527 rdsdebug("rdma_create_id() failed: %d\n", ret);
528 goto out;
529 }
530
531 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
532
533 src.sin_family = AF_INET;
534 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
535 src.sin_port = (__force u16)htons(0);
536
537 /* First, bind to the local address and device. */
538 ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
539 if (ret) {
540 rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
541 &conn->c_laddr, ret);
542 rdma_destroy_id(ic->i_cm_id);
543 ic->i_cm_id = NULL;
544 goto out;
545 }
546
547 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
548 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
549
550 dest.sin_family = AF_INET;
551 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
552 dest.sin_port = (__force u16)htons(RDS_PORT);
553
554 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
555 (struct sockaddr *)&dest,
556 RDS_RDMA_RESOLVE_TIMEOUT_MS);
557 if (ret) {
558 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
559 ret);
560 rdma_destroy_id(ic->i_cm_id);
561 ic->i_cm_id = NULL;
562 }
563
564out:
565 return ret;
566}
567
568/*
569 * This is so careful about only cleaning up resources that were built up
570 * so that it can be called at any point during startup. In fact it
571 * can be called multiple times for a given connection.
572 */
573void rds_iw_conn_shutdown(struct rds_connection *conn)
574{
575 struct rds_iw_connection *ic = conn->c_transport_data;
576 int err = 0;
577 struct ib_qp_attr qp_attr;
578
579 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
580 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
581 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
582
583 if (ic->i_cm_id) {
584 struct ib_device *dev = ic->i_cm_id->device;
585
586 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
587 err = rdma_disconnect(ic->i_cm_id);
588 if (err) {
589 /* Actually this may happen quite frequently, when
590 * an outgoing connect raced with an incoming connect.
591 */
592 rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
593 " cm: %p err %d\n", ic->i_cm_id, err);
594 }
595
596 if (ic->i_cm_id->qp) {
597 qp_attr.qp_state = IB_QPS_ERR;
598 ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
599 }
600
601 wait_event(rds_iw_ring_empty_wait,
602 rds_iw_ring_empty(&ic->i_send_ring) &&
603 rds_iw_ring_empty(&ic->i_recv_ring));
604
605 if (ic->i_send_hdrs)
606 ib_dma_free_coherent(dev,
607 ic->i_send_ring.w_nr *
608 sizeof(struct rds_header),
609 ic->i_send_hdrs,
610 ic->i_send_hdrs_dma);
611
612 if (ic->i_recv_hdrs)
613 ib_dma_free_coherent(dev,
614 ic->i_recv_ring.w_nr *
615 sizeof(struct rds_header),
616 ic->i_recv_hdrs,
617 ic->i_recv_hdrs_dma);
618
619 if (ic->i_ack)
620 ib_dma_free_coherent(dev, sizeof(struct rds_header),
621 ic->i_ack, ic->i_ack_dma);
622
623 if (ic->i_sends)
624 rds_iw_send_clear_ring(ic);
625 if (ic->i_recvs)
626 rds_iw_recv_clear_ring(ic);
627
628 if (ic->i_cm_id->qp)
629 rdma_destroy_qp(ic->i_cm_id);
630 if (ic->i_send_cq)
631 ib_destroy_cq(ic->i_send_cq);
632 if (ic->i_recv_cq)
633 ib_destroy_cq(ic->i_recv_cq);
634
635 /*
636 * If associated with an rds_iw_device:
637 * Move connection back to the nodev list.
638 * Remove cm_id from the device cm_id list.
639 */
640 if (ic->rds_iwdev) {
641
642 spin_lock_irq(&ic->rds_iwdev->spinlock);
643 BUG_ON(list_empty(&ic->iw_node));
644 list_del(&ic->iw_node);
645 spin_unlock_irq(&ic->rds_iwdev->spinlock);
646
647 spin_lock_irq(&iw_nodev_conns_lock);
648 list_add_tail(&ic->iw_node, &iw_nodev_conns);
649 spin_unlock_irq(&iw_nodev_conns_lock);
650 rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
651 ic->rds_iwdev = NULL;
652 }
653
654 rdma_destroy_id(ic->i_cm_id);
655
656 ic->i_cm_id = NULL;
657 ic->i_pd = NULL;
658 ic->i_mr = NULL;
659 ic->i_send_cq = NULL;
660 ic->i_recv_cq = NULL;
661 ic->i_send_hdrs = NULL;
662 ic->i_recv_hdrs = NULL;
663 ic->i_ack = NULL;
664 }
665 BUG_ON(ic->rds_iwdev);
666
667 /* Clear pending transmit */
668 if (ic->i_rm) {
669 rds_message_put(ic->i_rm);
670 ic->i_rm = NULL;
671 }
672
673 /* Clear the ACK state */
674 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
675 rds_iw_set_64bit(&ic->i_ack_next, 0);
676 ic->i_ack_recv = 0;
677
678 /* Clear flow control state */
679 ic->i_flowctl = 0;
680 atomic_set(&ic->i_credits, 0);
681
682 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
683 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
684
685 if (ic->i_iwinc) {
686 rds_inc_put(&ic->i_iwinc->ii_inc);
687 ic->i_iwinc = NULL;
688 }
689
690 vfree(ic->i_sends);
691 ic->i_sends = NULL;
692 vfree(ic->i_recvs);
693 ic->i_recvs = NULL;
694 rdsdebug("shutdown complete\n");
695}
696
697int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
698{
699 struct rds_iw_connection *ic;
700 unsigned long flags;
701
702 /* XXX too lazy? */
703 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
704 if (ic == NULL)
705 return -ENOMEM;
706
707 INIT_LIST_HEAD(&ic->iw_node);
708 mutex_init(&ic->i_recv_mutex);
709
710 /*
711 * rds_iw_conn_shutdown() waits for these to be emptied so they
712 * must be initialized before it can be called.
713 */
714 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
715 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
716
717 ic->conn = conn;
718 conn->c_transport_data = ic;
719
720 spin_lock_irqsave(&iw_nodev_conns_lock, flags);
721 list_add_tail(&ic->iw_node, &iw_nodev_conns);
722 spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
723
724
725 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
726 return 0;
727}
728
729void rds_iw_conn_free(void *arg)
730{
731 struct rds_iw_connection *ic = arg;
732 rdsdebug("ic %p\n", ic);
733 list_del(&ic->iw_node);
734 kfree(ic);
735}
736
737/*
738 * An error occurred on the connection
739 */
740void
741__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
742{
743 va_list ap;
744
745 rds_conn_drop(conn);
746
747 va_start(ap, fmt);
748 vprintk(fmt, ap);
749 va_end(ap);
750}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 000000000000..1c02a8f952d0
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,888 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37#include "iw.h"
38
39
40/*
41 * This is stored as mr->r_trans_private.
42 */
43struct rds_iw_mr {
44 struct rds_iw_device *device;
45 struct rds_iw_mr_pool *pool;
46 struct rdma_cm_id *cm_id;
47
48 struct ib_mr *mr;
49 struct ib_fast_reg_page_list *page_list;
50
51 struct rds_iw_mapping mapping;
52 unsigned char remap_count;
53};
54
55/*
56 * Our own little MR pool
57 */
58struct rds_iw_mr_pool {
59 struct rds_iw_device *device; /* back ptr to the device that owns us */
60
61 struct mutex flush_lock; /* serialize fmr invalidate */
62 struct work_struct flush_worker; /* flush worker */
63
64 spinlock_t list_lock; /* protect variables below */
65 atomic_t item_count; /* total # of MRs */
66 atomic_t dirty_count; /* # of dirty MRs */
67 struct list_head dirty_list; /* dirty mappings */
68 struct list_head clean_list; /* unused & unmapped MRs */
69 atomic_t free_pinned; /* memory pinned by free MRs */
70 unsigned long max_message_size; /* in pages */
71 unsigned long max_items;
72 unsigned long max_items_soft;
73 unsigned long max_free_pinned;
74 int max_pages;
75};
76
77static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
78static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
79static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
80static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
81 struct rds_iw_mr *ibmr,
82 struct scatterlist *sg, unsigned int nents);
83static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
84static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
85 struct list_head *unmap_list,
86 struct list_head *kill_list);
87static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
88
89static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
90{
91 struct rds_iw_device *iwdev;
92 struct rds_iw_cm_id *i_cm_id;
93
94 *rds_iwdev = NULL;
95 *cm_id = NULL;
96
97 list_for_each_entry(iwdev, &rds_iw_devices, list) {
98 spin_lock_irq(&iwdev->spinlock);
99 list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
100 struct sockaddr_in *src_addr, *dst_addr;
101
102 src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
103 dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
104
105 rdsdebug("local ipaddr = %x port %d, "
106 "remote ipaddr = %x port %d"
107 "..looking for %x port %d, "
108 "remote ipaddr = %x port %d\n",
109 src_addr->sin_addr.s_addr,
110 src_addr->sin_port,
111 dst_addr->sin_addr.s_addr,
112 dst_addr->sin_port,
113 rs->rs_bound_addr,
114 rs->rs_bound_port,
115 rs->rs_conn_addr,
116 rs->rs_conn_port);
117#ifdef WORKING_TUPLE_DETECTION
118 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
119 src_addr->sin_port == rs->rs_bound_port &&
120 dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
121 dst_addr->sin_port == rs->rs_conn_port) {
122#else
123 /* FIXME - needs to compare the local and remote
124 * ipaddr/port tuple, but the ipaddr is the only
125 * available information in the rds_sock (as the rest are
126 * zeroed). It doesn't appear to be properly populated
127 * during connection setup...
128 */
129 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
130#endif
131 spin_unlock_irq(&iwdev->spinlock);
132 *rds_iwdev = iwdev;
133 *cm_id = i_cm_id->cm_id;
134 return 0;
135 }
136 }
137 spin_unlock_irq(&iwdev->spinlock);
138 }
139
140 return 1;
141}
142
143static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
144{
145 struct rds_iw_cm_id *i_cm_id;
146
147 i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
148 if (!i_cm_id)
149 return -ENOMEM;
150
151 i_cm_id->cm_id = cm_id;
152
153 spin_lock_irq(&rds_iwdev->spinlock);
154 list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
155 spin_unlock_irq(&rds_iwdev->spinlock);
156
157 return 0;
158}
159
160void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
161{
162 struct rds_iw_cm_id *i_cm_id;
163
164 spin_lock_irq(&rds_iwdev->spinlock);
165 list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
166 if (i_cm_id->cm_id == cm_id) {
167 list_del(&i_cm_id->list);
168 kfree(i_cm_id);
169 break;
170 }
171 }
172 spin_unlock_irq(&rds_iwdev->spinlock);
173}
174
175
176int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
177{
178 struct sockaddr_in *src_addr, *dst_addr;
179 struct rds_iw_device *rds_iwdev_old;
180 struct rds_sock rs;
181 struct rdma_cm_id *pcm_id;
182 int rc;
183
184 src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
185 dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
186
187 rs.rs_bound_addr = src_addr->sin_addr.s_addr;
188 rs.rs_bound_port = src_addr->sin_port;
189 rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
190 rs.rs_conn_port = dst_addr->sin_port;
191
192 rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
193 if (rc)
194 rds_iw_remove_cm_id(rds_iwdev, cm_id);
195
196 return rds_iw_add_cm_id(rds_iwdev, cm_id);
197}
198
199int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
200{
201 struct rds_iw_connection *ic = conn->c_transport_data;
202
203 /* conn was previously on the nodev_conns_list */
204 spin_lock_irq(&iw_nodev_conns_lock);
205 BUG_ON(list_empty(&iw_nodev_conns));
206 BUG_ON(list_empty(&ic->iw_node));
207 list_del(&ic->iw_node);
208 spin_unlock_irq(&iw_nodev_conns_lock);
209
210 spin_lock_irq(&rds_iwdev->spinlock);
211 list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
212 spin_unlock_irq(&rds_iwdev->spinlock);
213
214 ic->rds_iwdev = rds_iwdev;
215
216 return 0;
217}
218
219void rds_iw_remove_nodev_conns(void)
220{
221 struct rds_iw_connection *ic, *_ic;
222 LIST_HEAD(tmp_list);
223
224 /* avoid calling conn_destroy with irqs off */
225 spin_lock_irq(&iw_nodev_conns_lock);
226 list_splice(&iw_nodev_conns, &tmp_list);
227 INIT_LIST_HEAD(&iw_nodev_conns);
228 spin_unlock_irq(&iw_nodev_conns_lock);
229
230 list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
231 if (ic->conn->c_passive)
232 rds_conn_destroy(ic->conn->c_passive);
233 rds_conn_destroy(ic->conn);
234 }
235}
236
237void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
238{
239 struct rds_iw_connection *ic, *_ic;
240 LIST_HEAD(tmp_list);
241
242 /* avoid calling conn_destroy with irqs off */
243 spin_lock_irq(&rds_iwdev->spinlock);
244 list_splice(&rds_iwdev->conn_list, &tmp_list);
245 INIT_LIST_HEAD(&rds_iwdev->conn_list);
246 spin_unlock_irq(&rds_iwdev->spinlock);
247
248 list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
249 if (ic->conn->c_passive)
250 rds_conn_destroy(ic->conn->c_passive);
251 rds_conn_destroy(ic->conn);
252 }
253}
254
255static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
256 struct scatterlist *list, unsigned int sg_len)
257{
258 sg->list = list;
259 sg->len = sg_len;
260 sg->dma_len = 0;
261 sg->dma_npages = 0;
262 sg->bytes = 0;
263}
264
265static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
266 struct rds_iw_scatterlist *sg,
267 unsigned int dma_page_shift)
268{
269 struct ib_device *dev = rds_iwdev->dev;
270 u64 *dma_pages = NULL;
271 u64 dma_mask;
272 unsigned int dma_page_size;
273 int i, j, ret;
274
275 dma_page_size = 1 << dma_page_shift;
276 dma_mask = dma_page_size - 1;
277
278 WARN_ON(sg->dma_len);
279
280 sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
281 if (unlikely(!sg->dma_len)) {
282 printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
283 return ERR_PTR(-EBUSY);
284 }
285
286 sg->bytes = 0;
287 sg->dma_npages = 0;
288
289 ret = -EINVAL;
290 for (i = 0; i < sg->dma_len; ++i) {
291 unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
292 u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
293 u64 end_addr;
294
295 sg->bytes += dma_len;
296
297 end_addr = dma_addr + dma_len;
298 if (dma_addr & dma_mask) {
299 if (i > 0)
300 goto out_unmap;
301 dma_addr &= ~dma_mask;
302 }
303 if (end_addr & dma_mask) {
304 if (i < sg->dma_len - 1)
305 goto out_unmap;
306 end_addr = (end_addr + dma_mask) & ~dma_mask;
307 }
308
309 sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
310 }
311
312 /* Now gather the dma addrs into one list */
313 if (sg->dma_npages > fastreg_message_size)
314 goto out_unmap;
315
316 dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
317 if (!dma_pages) {
318 ret = -ENOMEM;
319 goto out_unmap;
320 }
321
322 for (i = j = 0; i < sg->dma_len; ++i) {
323 unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
324 u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
325 u64 end_addr;
326
327 end_addr = dma_addr + dma_len;
328 dma_addr &= ~dma_mask;
329 for (; dma_addr < end_addr; dma_addr += dma_page_size)
330 dma_pages[j++] = dma_addr;
331 BUG_ON(j > sg->dma_npages);
332 }
333
334 return dma_pages;
335
336out_unmap:
337 ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
338 sg->dma_len = 0;
339 kfree(dma_pages);
340 return ERR_PTR(ret);
341}
342
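/*
 * Standalone sketch (not part of this file) of the page-rounding
 * arithmetic in rds_iw_map_scatterlist() above: only the first segment
 * may start unaligned and only the last may end unaligned, and both
 * ends are rounded out to the DMA page size before counting
 * dma_npages. The addresses, length and 4K page size are invented
 * illustration values.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned int dma_page_shift = 12;               /* assume 4K DMA pages */
        uint64_t dma_page_size = 1ULL << dma_page_shift;
        uint64_t dma_mask = dma_page_size - 1;

        /* one segment that starts 0x100 into a page and spans 3 pages + 0x200 */
        uint64_t dma_addr = 0x10000100ULL;
        uint64_t dma_len  = 3 * dma_page_size + 0x200;
        uint64_t end_addr = dma_addr + dma_len;

        dma_addr &= ~dma_mask;                          /* round the start down */
        end_addr = (end_addr + dma_mask) & ~dma_mask;   /* round the end up */

        printf("dma_npages = %llu\n",                   /* prints 4 */
               (unsigned long long)((end_addr - dma_addr) >> dma_page_shift));
        return 0;
}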
343
344struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
345{
346 struct rds_iw_mr_pool *pool;
347
348 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
349 if (!pool) {
350 printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
351 return ERR_PTR(-ENOMEM);
352 }
353
354 pool->device = rds_iwdev;
355 INIT_LIST_HEAD(&pool->dirty_list);
356 INIT_LIST_HEAD(&pool->clean_list);
357 mutex_init(&pool->flush_lock);
358 spin_lock_init(&pool->list_lock);
359 INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
360
361 pool->max_message_size = fastreg_message_size;
362 pool->max_items = fastreg_pool_size;
363 pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
364 pool->max_pages = fastreg_message_size;
365
366 /* We never allow more than max_items MRs to be allocated.
367 * Once we exceed max_items_soft, we start freeing
368 * items more aggressively.
369 * Make sure that max_items > max_items_soft > max_items / 2
370 */
371 pool->max_items_soft = pool->max_items * 3 / 4;
372
373 return pool;
374}
375
376void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
377{
378 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
379
380 iinfo->rdma_mr_max = pool->max_items;
381 iinfo->rdma_mr_size = pool->max_pages;
382}
383
384void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
385{
386 flush_workqueue(rds_wq);
387 rds_iw_flush_mr_pool(pool, 1);
388 BUG_ON(atomic_read(&pool->item_count));
389 BUG_ON(atomic_read(&pool->free_pinned));
390 kfree(pool);
391}
392
393static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
394{
395 struct rds_iw_mr *ibmr = NULL;
396 unsigned long flags;
397
398 spin_lock_irqsave(&pool->list_lock, flags);
399 if (!list_empty(&pool->clean_list)) {
400 ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
401 list_del_init(&ibmr->mapping.m_list);
402 }
403 spin_unlock_irqrestore(&pool->list_lock, flags);
404
405 return ibmr;
406}
407
408static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
409{
410 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
411 struct rds_iw_mr *ibmr = NULL;
412 int err = 0, iter = 0;
413
414 while (1) {
415 ibmr = rds_iw_reuse_fmr(pool);
416 if (ibmr)
417 return ibmr;
418
419 /* No clean MRs - now we have the choice of either
420 * allocating a fresh MR up to the limit imposed by the
421 * driver, or flush any dirty unused MRs.
422 * We try to avoid stalling in the send path if possible,
423 * so we allocate as long as we're allowed to.
424 *
425 * We're fussy with enforcing the FMR limit, though. If the driver
426 * tells us we can't use more than N fmrs, we shouldn't start
427 * arguing with it */
428 if (atomic_inc_return(&pool->item_count) <= pool->max_items)
429 break;
430
431 atomic_dec(&pool->item_count);
432
433 if (++iter > 2) {
434 rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
435 return ERR_PTR(-EAGAIN);
436 }
437
438 /* We do have some empty MRs. Flush them out. */
439 rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
440 rds_iw_flush_mr_pool(pool, 0);
441 }
442
443 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
444 if (!ibmr) {
445 err = -ENOMEM;
446 goto out_no_cigar;
447 }
448
449 spin_lock_init(&ibmr->mapping.m_lock);
450 INIT_LIST_HEAD(&ibmr->mapping.m_list);
451 ibmr->mapping.m_mr = ibmr;
452
453 err = rds_iw_init_fastreg(pool, ibmr);
454 if (err)
455 goto out_no_cigar;
456
457 rds_iw_stats_inc(s_iw_rdma_mr_alloc);
458 return ibmr;
459
460out_no_cigar:
461 if (ibmr) {
462 rds_iw_destroy_fastreg(pool, ibmr);
463 kfree(ibmr);
464 }
465 atomic_dec(&pool->item_count);
466 return ERR_PTR(err);
467}
468
469void rds_iw_sync_mr(void *trans_private, int direction)
470{
471 struct rds_iw_mr *ibmr = trans_private;
472 struct rds_iw_device *rds_iwdev = ibmr->device;
473
474 switch (direction) {
475 case DMA_FROM_DEVICE:
476 ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
477 ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
478 break;
479 case DMA_TO_DEVICE:
480 ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
481 ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
482 break;
483 }
484}
485
486static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
487{
488 unsigned int item_count;
489
490 item_count = atomic_read(&pool->item_count);
491 if (free_all)
492 return item_count;
493
494 return 0;
495}
496
497/*
498 * Flush our pool of MRs.
499 * At a minimum, all currently unused MRs are unmapped.
500 * If the number of MRs allocated exceeds the limit, we also try
501 * to free as many MRs as needed to get back to this limit.
502 */
503static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
504{
505 struct rds_iw_mr *ibmr, *next;
506 LIST_HEAD(unmap_list);
507 LIST_HEAD(kill_list);
508 unsigned long flags;
509 unsigned int nfreed = 0, ncleaned = 0, free_goal;
510 int ret = 0;
511
512 rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
513
514 mutex_lock(&pool->flush_lock);
515
516 spin_lock_irqsave(&pool->list_lock, flags);
517 /* Get the list of all mappings to be destroyed */
518 list_splice_init(&pool->dirty_list, &unmap_list);
519 if (free_all)
520 list_splice_init(&pool->clean_list, &kill_list);
521 spin_unlock_irqrestore(&pool->list_lock, flags);
522
523 free_goal = rds_iw_flush_goal(pool, free_all);
524
525 /* Batched invalidate of dirty MRs.
526 * For FMR based MRs, the mappings on the unmap list are
527 * actually members of an ibmr (ibmr->mapping). They either
528 * migrate to the kill_list, or have been cleaned and should be
529 * moved to the clean_list.
530 * For fastregs, the mappings are dynamically allocated and
531 * will be destroyed by the unmap function.
532 */
533 if (!list_empty(&unmap_list)) {
534 ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
535 /* If we've been asked to destroy all MRs, move those
536 * that were simply cleaned to the kill list */
537 if (free_all)
538 list_splice_init(&unmap_list, &kill_list);
539 }
540
541 /* Destroy any MRs that are past their best before date */
542 list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
543 rds_iw_stats_inc(s_iw_rdma_mr_free);
544 list_del(&ibmr->mapping.m_list);
545 rds_iw_destroy_fastreg(pool, ibmr);
546 kfree(ibmr);
547 nfreed++;
548 }
549
550 /* Any mappings that remain are laundered ibmrs, which we can add
551 * back to the clean list. */
552 if (!list_empty(&unmap_list)) {
553 spin_lock_irqsave(&pool->list_lock, flags);
554 list_splice(&unmap_list, &pool->clean_list);
555 spin_unlock_irqrestore(&pool->list_lock, flags);
556 }
557
558 atomic_sub(ncleaned, &pool->dirty_count);
559 atomic_sub(nfreed, &pool->item_count);
560
561 mutex_unlock(&pool->flush_lock);
562 return ret;
563}
564
565static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
566{
567 struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
568
569 rds_iw_flush_mr_pool(pool, 0);
570}
571
572void rds_iw_free_mr(void *trans_private, int invalidate)
573{
574 struct rds_iw_mr *ibmr = trans_private;
575 struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
576
577 rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
578 if (!pool)
579 return;
580
581 /* Return it to the pool's free list */
582 rds_iw_free_fastreg(pool, ibmr);
583
584 /* If we've pinned too many pages, request a flush */
585 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
586 || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
587 queue_work(rds_wq, &pool->flush_worker);
588
589 if (invalidate) {
590 if (likely(!in_interrupt())) {
591 rds_iw_flush_mr_pool(pool, 0);
592 } else {
593 /* We get here if the user created an MR marked
594 * as use_once and invalidate at the same time. */
595 queue_work(rds_wq, &pool->flush_worker);
596 }
597 }
598}
599
600void rds_iw_flush_mrs(void)
601{
602 struct rds_iw_device *rds_iwdev;
603
604 list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
605 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
606
607 if (pool)
608 rds_iw_flush_mr_pool(pool, 0);
609 }
610}
611
612void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
613 struct rds_sock *rs, u32 *key_ret)
614{
615 struct rds_iw_device *rds_iwdev;
616 struct rds_iw_mr *ibmr = NULL;
617 struct rdma_cm_id *cm_id;
618 int ret;
619
620 ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
621 if (ret || !cm_id) {
622 ret = -ENODEV;
623 goto out;
624 }
625
626 if (!rds_iwdev->mr_pool) {
627 ret = -ENODEV;
628 goto out;
629 }
630
631 ibmr = rds_iw_alloc_mr(rds_iwdev);
632 if (IS_ERR(ibmr))
633 return ibmr;
634
635 ibmr->cm_id = cm_id;
636 ibmr->device = rds_iwdev;
637
638 ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
639 if (ret == 0)
640 *key_ret = ibmr->mr->rkey;
641 else
642 printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
643
644out:
645 if (ret) {
646 if (ibmr)
647 rds_iw_free_mr(ibmr, 0);
648 ibmr = ERR_PTR(ret);
649 }
650 return ibmr;
651}
652
653/*
654 * iWARP fastreg handling
655 *
656 * The life cycle of a fastreg registration is a bit different from
657 * FMRs.
658 * The idea behind fastreg is to have one MR, to which we bind different
659 * mappings over time. To avoid stalling on the expensive map and invalidate
660 * operations, these operations are pipelined on the same send queue on
661 * which we want to send the message containing the r_key.
662 *
663 * This creates a bit of a problem for us, as we do not have the destination
664 * IP in GET_MR, so the connection must be set up prior to the GET_MR call for
665 * RDMA to be correctly set up. If a fastreg request is present, rds_iw_xmit
666 * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
667 * before queuing the SEND. When completions for these arrive, they are
668 * dispatched to the MR, and a bit is set showing that RDMA can be performed.
669 *
670 * There is another interesting aspect that's related to invalidation.
671 * The application can request that a mapping is invalidated in FREE_MR.
672 * The expectation there is that this invalidation step includes ALL
673 * PREVIOUSLY FREED MRs.
674 */
675static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
676 struct rds_iw_mr *ibmr)
677{
678 struct rds_iw_device *rds_iwdev = pool->device;
679 struct ib_fast_reg_page_list *page_list = NULL;
680 struct ib_mr *mr;
681 int err;
682
683 mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
684 if (IS_ERR(mr)) {
685 err = PTR_ERR(mr);
686
687 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
688 return err;
689 }
690
691 /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
692 * is not filled in.
693 */
694 page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
695 if (IS_ERR(page_list)) {
696 err = PTR_ERR(page_list);
697
698 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
699 ib_dereg_mr(mr);
700 return err;
701 }
702
703 ibmr->page_list = page_list;
704 ibmr->mr = mr;
705 return 0;
706}
707
708static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
709{
710 struct rds_iw_mr *ibmr = mapping->m_mr;
711 struct ib_send_wr f_wr, *failed_wr;
712 int ret;
713
714 /*
715 * Perform a WR for the fast_reg_mr. Each individual page
716 * in the sg list is added to the fast reg page list and placed
717 * inside the fast_reg_mr WR. The key used is a rolling 8bit
718 * counter, which should guarantee uniqueness.
719 */
720 ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
721 mapping->m_rkey = ibmr->mr->rkey;
722
723 memset(&f_wr, 0, sizeof(f_wr));
724 f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
725 f_wr.opcode = IB_WR_FAST_REG_MR;
726 f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
727 f_wr.wr.fast_reg.rkey = mapping->m_rkey;
728 f_wr.wr.fast_reg.page_list = ibmr->page_list;
729 f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
730 f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
731 f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
732 IB_ACCESS_REMOTE_READ |
733 IB_ACCESS_REMOTE_WRITE;
734 f_wr.wr.fast_reg.iova_start = 0;
735 f_wr.send_flags = IB_SEND_SIGNALED;
736
737 failed_wr = &f_wr;
738 ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
739 BUG_ON(failed_wr != &f_wr);
740 if (ret && printk_ratelimit())
741 printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
742 __func__, __LINE__, ret);
743 return ret;
744}
745
746static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
747{
748 struct ib_send_wr s_wr, *failed_wr;
749 int ret = 0;
750
751 if (!ibmr->cm_id->qp || !ibmr->mr)
752 goto out;
753
754 memset(&s_wr, 0, sizeof(s_wr));
755 s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
756 s_wr.opcode = IB_WR_LOCAL_INV;
757 s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
758 s_wr.send_flags = IB_SEND_SIGNALED;
759
760 failed_wr = &s_wr;
761 ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
762 if (ret && printk_ratelimit()) {
763 printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
764 __func__, __LINE__, ret);
765 goto out;
766 }
767out:
768 return ret;
769}
770
771static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
772 struct rds_iw_mr *ibmr,
773 struct scatterlist *sg,
774 unsigned int sg_len)
775{
776 struct rds_iw_device *rds_iwdev = pool->device;
777 struct rds_iw_mapping *mapping = &ibmr->mapping;
778 u64 *dma_pages;
779 int i, ret = 0;
780
781 rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
782
783 dma_pages = rds_iw_map_scatterlist(rds_iwdev,
784 &mapping->m_sg,
785 rds_iwdev->page_shift);
786 if (IS_ERR(dma_pages)) {
787 ret = PTR_ERR(dma_pages);
788 dma_pages = NULL;
789 goto out;
790 }
791
792 if (mapping->m_sg.dma_len > pool->max_message_size) {
793 ret = -EMSGSIZE;
794 goto out;
795 }
796
797 for (i = 0; i < mapping->m_sg.dma_npages; ++i)
798 ibmr->page_list->page_list[i] = dma_pages[i];
799
800 ret = rds_iw_rdma_build_fastreg(mapping);
801 if (ret)
802 goto out;
803
804 rds_iw_stats_inc(s_iw_rdma_mr_used);
805
806out:
807 kfree(dma_pages);
808
809 return ret;
810}
811
812/*
813 * "Free" a fastreg MR.
814 */
815static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
816 struct rds_iw_mr *ibmr)
817{
818 unsigned long flags;
819 int ret;
820
821 if (!ibmr->mapping.m_sg.dma_len)
822 return;
823
824 ret = rds_iw_rdma_fastreg_inv(ibmr);
825 if (ret)
826 return;
827
828 /* Try to post the LOCAL_INV WR to the queue. */
829 spin_lock_irqsave(&pool->list_lock, flags);
830
831 list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
832 atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
833 atomic_inc(&pool->dirty_count);
834
835 spin_unlock_irqrestore(&pool->list_lock, flags);
836}
837
838static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
839 struct list_head *unmap_list,
840 struct list_head *kill_list)
841{
842 struct rds_iw_mapping *mapping, *next;
843 unsigned int ncleaned = 0;
844 LIST_HEAD(laundered);
845
846 /* Batched invalidation of fastreg MRs.
847 * Why do we do it this way, even though we could pipeline unmap
848 * and remap? The reason is the application semantics - when the
849 * application requests an invalidation of MRs, it expects all
850 * previously released R_Keys to become invalid.
851 *
852 * If we implement MR reuse naively, we risk memory corruption
853 * (this has actually been observed). So the default behavior
854 * requires that an MR goes through an explicit unmap operation before
855 * we can reuse it.
856 *
857 * We could probably improve on this a little, by allowing immediate
858 * reuse of an MR on the same socket (e.g. you could add a small
859 * cache of unused MRs to struct rds_socket - GET_MR could grab one
860 * of these without requiring an explicit invalidate).
861 */
862 while (!list_empty(unmap_list)) {
863 unsigned long flags;
864
865 spin_lock_irqsave(&pool->list_lock, flags);
866 list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
867 list_move(&mapping->m_list, &laundered);
868 ncleaned++;
869 }
870 spin_unlock_irqrestore(&pool->list_lock, flags);
871 }
872
873 /* Move all laundered mappings back to the unmap list.
874 * We do not kill any WRs right now - it doesn't seem the
875 * fastreg API has a max_remap limit. */
876 list_splice_init(&laundered, unmap_list);
877
878 return ncleaned;
879}
880
881static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
882 struct rds_iw_mr *ibmr)
883{
884 if (ibmr->page_list)
885 ib_free_fast_reg_page_list(ibmr->page_list);
886 if (ibmr->mr)
887 ib_dereg_mr(ibmr->mr);
888}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 000000000000..a1931f0027a2
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,869 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/dma-mapping.h>
36#include <rdma/rdma_cm.h>
37
38#include "rds.h"
39#include "iw.h"
40
41static struct kmem_cache *rds_iw_incoming_slab;
42static struct kmem_cache *rds_iw_frag_slab;
43static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
44
45static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
46{
47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page);
49 frag->f_page = NULL;
50}
51
52static void rds_iw_frag_free(struct rds_page_frag *frag)
53{
54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_iw_frag_slab, frag);
57}
58
59/*
60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get send completion events.
62 * Only the last frag in the page performs the unmapping.
63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags.
67 */
68static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
69 struct rds_iw_recv_work *recv)
70{
71 struct rds_page_frag *frag = recv->r_frag;
72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0;
79}
80
81void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
82{
83 struct rds_iw_recv_work *recv;
84 u32 i;
85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge;
88
89 recv->r_iwinc = NULL;
90 recv->r_frag = NULL;
91
92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IW_RECV_SGE;
96
97 sge = rds_iw_data_sge(ic, recv->r_sge);
98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = 0;
101
102 sge = rds_iw_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header);
105 sge->lkey = 0;
106 }
107}
108
109static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
110 struct rds_iw_recv_work *recv)
111{
112 if (recv->r_iwinc) {
113 rds_inc_put(&recv->r_iwinc->ii_inc);
114 recv->r_iwinc = NULL;
115 }
116 if (recv->r_frag) {
117 rds_iw_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page)
119 rds_iw_frag_drop_page(recv->r_frag);
120 rds_iw_frag_free(recv->r_frag);
121 recv->r_frag = NULL;
122 }
123}
124
125void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
126{
127 u32 i;
128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
131
132 if (ic->i_frag.f_page)
133 rds_iw_frag_drop_page(&ic->i_frag);
134}
135
136static int rds_iw_recv_refill_one(struct rds_connection *conn,
137 struct rds_iw_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp)
139{
140 struct rds_iw_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr;
142 struct ib_sge *sge;
143 int ret = -ENOMEM;
144
145 if (recv->r_iwinc == NULL) {
146 if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
147 rds_iw_stats_inc(s_iw_rx_alloc_limit);
148 goto out;
149 }
150 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
151 kptr_gfp);
152 if (recv->r_iwinc == NULL)
153 goto out;
154 atomic_inc(&rds_iw_allocation);
155 INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
156 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
157 }
158
159 if (recv->r_frag == NULL) {
160 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
161 if (recv->r_frag == NULL)
162 goto out;
163 INIT_LIST_HEAD(&recv->r_frag->f_item);
164 recv->r_frag->f_page = NULL;
165 }
166
167 if (ic->i_frag.f_page == NULL) {
168 ic->i_frag.f_page = alloc_page(page_gfp);
169 if (ic->i_frag.f_page == NULL)
170 goto out;
171 ic->i_frag.f_offset = 0;
172 }
173
174 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
175 ic->i_frag.f_page,
176 ic->i_frag.f_offset,
177 RDS_FRAG_SIZE,
178 DMA_FROM_DEVICE);
179 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
180 goto out;
181
182 /*
183 * Once we get the RDS_PAGE_LAST_OFF frag, rds_iw_recv_unmap_page()
184 * must be called on this recv. This happens as completions hit
185 * in order or on connection shutdown.
186 */
187 recv->r_frag->f_page = ic->i_frag.f_page;
188 recv->r_frag->f_offset = ic->i_frag.f_offset;
189 recv->r_frag->f_mapped = dma_addr;
190
191 sge = rds_iw_data_sge(ic, recv->r_sge);
192 sge->addr = dma_addr;
193 sge->length = RDS_FRAG_SIZE;
194
195 sge = rds_iw_header_sge(ic, recv->r_sge);
196 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
197 sge->length = sizeof(struct rds_header);
198
199 get_page(recv->r_frag->f_page);
200
201 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
202 ic->i_frag.f_offset += RDS_FRAG_SIZE;
203 } else {
204 put_page(ic->i_frag.f_page);
205 ic->i_frag.f_page = NULL;
206 ic->i_frag.f_offset = 0;
207 }
208
209 ret = 0;
210out:
211 return ret;
212}
213
214/*
215 * This tries to allocate and post unused work requests after making sure that
216 * they have all the allocations they need to queue received fragments into
217 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
218 * pairs don't go unmatched.
219 *
220 * -1 is returned if posting fails due to temporary resource exhaustion.
221 */
222int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
223 gfp_t page_gfp, int prefill)
224{
225 struct rds_iw_connection *ic = conn->c_transport_data;
226 struct rds_iw_recv_work *recv;
227 struct ib_recv_wr *failed_wr;
228 unsigned int posted = 0;
229 int ret = 0;
230 u32 pos;
231
232 while ((prefill || rds_conn_up(conn))
233 && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
234 if (pos >= ic->i_recv_ring.w_nr) {
235 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
236 pos);
237 ret = -EINVAL;
238 break;
239 }
240
241 recv = &ic->i_recvs[pos];
242 ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
243 if (ret) {
244 ret = -1;
245 break;
246 }
247
248 /* XXX when can this fail? */
249 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
250 rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
251 recv->r_iwinc, recv->r_frag->f_page,
252 (long) recv->r_frag->f_mapped, ret);
253 if (ret) {
254 rds_iw_conn_error(conn, "recv post on "
255 "%pI4 returned %d, disconnecting and "
256 "reconnecting\n", &conn->c_faddr,
257 ret);
258 ret = -1;
259 break;
260 }
261
262 posted++;
263 }
264
265 /* We're doing flow control - update the window. */
266 if (ic->i_flowctl && posted)
267 rds_iw_advertise_credits(conn, posted);
268
269 if (ret)
270 rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
271 return ret;
272}
273
274void rds_iw_inc_purge(struct rds_incoming *inc)
275{
276 struct rds_iw_incoming *iwinc;
277 struct rds_page_frag *frag;
278 struct rds_page_frag *pos;
279
280 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
281 rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
282
283 list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
284 list_del_init(&frag->f_item);
285 rds_iw_frag_drop_page(frag);
286 rds_iw_frag_free(frag);
287 }
288}
289
290void rds_iw_inc_free(struct rds_incoming *inc)
291{
292 struct rds_iw_incoming *iwinc;
293
294 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
295
296 rds_iw_inc_purge(inc);
297 rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
298 BUG_ON(!list_empty(&iwinc->ii_frags));
299 kmem_cache_free(rds_iw_incoming_slab, iwinc);
300 atomic_dec(&rds_iw_allocation);
301 BUG_ON(atomic_read(&rds_iw_allocation) < 0);
302}
303
304int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
305 size_t size)
306{
307 struct rds_iw_incoming *iwinc;
308 struct rds_page_frag *frag;
309 struct iovec *iov = first_iov;
310 unsigned long to_copy;
311 unsigned long frag_off = 0;
312 unsigned long iov_off = 0;
313 int copied = 0;
314 int ret;
315 u32 len;
316
317 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
318 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
319 len = be32_to_cpu(inc->i_hdr.h_len);
320
321 while (copied < size && copied < len) {
322 if (frag_off == RDS_FRAG_SIZE) {
323 frag = list_entry(frag->f_item.next,
324 struct rds_page_frag, f_item);
325 frag_off = 0;
326 }
327 while (iov_off == iov->iov_len) {
328 iov_off = 0;
329 iov++;
330 }
331
332 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
333 to_copy = min_t(size_t, to_copy, size - copied);
334 to_copy = min_t(unsigned long, to_copy, len - copied);
335
336 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
337 "[%p, %lu] + %lu\n",
338 to_copy, iov->iov_base, iov->iov_len, iov_off,
339 frag->f_page, frag->f_offset, frag_off);
340
341 /* XXX needs + offset for multiple recvs per page */
342 ret = rds_page_copy_to_user(frag->f_page,
343 frag->f_offset + frag_off,
344 iov->iov_base + iov_off,
345 to_copy);
346 if (ret) {
347 copied = ret;
348 break;
349 }
350
351 iov_off += to_copy;
352 frag_off += to_copy;
353 copied += to_copy;
354 }
355
356 return copied;
357}
358
359/* ic starts out kzalloc()ed */
360void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
361{
362 struct ib_send_wr *wr = &ic->i_ack_wr;
363 struct ib_sge *sge = &ic->i_ack_sge;
364
365 sge->addr = ic->i_ack_dma;
366 sge->length = sizeof(struct rds_header);
367 sge->lkey = rds_iw_local_dma_lkey(ic);
368
369 wr->sg_list = sge;
370 wr->num_sge = 1;
371 wr->opcode = IB_WR_SEND;
372 wr->wr_id = RDS_IW_ACK_WR_ID;
373 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
374}
375
376/*
377 * You'd think that with reliable IB connections you wouldn't need to ack
378 * messages that have been received. The problem is that IB hardware generates
379 * an ack message before it has DMAed the message into memory. This creates a
380 * potential message loss if the HCA is disabled for any reason between when it
381 * sends the ack and before the message is DMAed and processed. This is only a
382 * potential issue if another HCA is available for fail-over.
383 *
384 * When the remote host receives our ack they'll free the sent message from
385 * their send queue. To decrease the latency of this we always send an ack
386 * immediately after we've received messages.
387 *
388 * For simplicity, we only have one ack in flight at a time. This puts
389 * pressure on senders to have deep enough send queues to absorb the latency of
390 * a single ack frame being in flight. This might not be good enough.
391 *
392 * This is implemented by having a long-lived send_wr and sge which point to a
393 * statically allocated ack frame. This ack wr does not fall under the ring
394 * accounting that the tx and rx wrs do. The QP attribute specifically makes
395 * room for it beyond the ring size. Send completion notices its special
396 * wr_id and avoids working with the ring in that case.
397 */
398static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
399 int ack_required)
400{
401 rds_iw_set_64bit(&ic->i_ack_next, seq);
402 if (ack_required) {
403 smp_mb__before_clear_bit();
404 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
405 }
406}
407
408static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
409{
410 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
411 smp_mb__after_clear_bit();
412
413 return ic->i_ack_next;
414}
415
416static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
417{
418 struct rds_header *hdr = ic->i_ack;
419 struct ib_send_wr *failed_wr;
420 u64 seq;
421 int ret;
422
423 seq = rds_iw_get_ack(ic);
424
425 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
426 rds_message_populate_header(hdr, 0, 0, 0);
427 hdr->h_ack = cpu_to_be64(seq);
428 hdr->h_credit = adv_credits;
429 rds_message_make_checksum(hdr);
430 ic->i_ack_queued = jiffies;
431
432 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
433 if (unlikely(ret)) {
434 /* Failed to send. Release the WR, and
435 * force another ACK.
436 */
437 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
438 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439
440 rds_iw_stats_inc(s_iw_ack_send_failure);
441 /* Need to finesse this later. */
442 BUG();
443 } else
444 rds_iw_stats_inc(s_iw_ack_sent);
445}
446
447/*
448 * There are 3 ways of getting acknowledgements to the peer:
449 * 1. We call rds_iw_attempt_ack from the recv completion handler
450 * to send an ACK-only frame.
451 * However, there can be only one such frame in the send queue
452 * at any time, so we may have to postpone it.
453 * 2. When another (data) packet is transmitted while there's
454 * an ACK in the queue, we piggyback the ACK sequence number
455 * on the data packet.
456 * 3. If the ACK WR is done sending, we get called from the
457 * send queue completion handler, and check whether there's
458 * another ACK pending (postponed because the WR was on the
459 * queue). If so, we transmit it.
460 *
461 * We maintain 2 variables:
462 * - i_ack_flags, which keeps track of whether the ACK WR
463 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
464 * - i_ack_next, which is the last sequence number we received
465 *
466 * Potentially, send queue and receive queue handlers can run concurrently.
467 *
468 * Reconnecting complicates this picture just slightly. When we
469 * reconnect, we may be seeing duplicate packets. The peer
470 * is retransmitting them, because it hasn't seen an ACK for
471 * them. It is important that we ACK these.
472 *
473 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
474 * this flag set *MUST* be acknowledged immediately.
475 */
476
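/*
 * A toy, single-threaded model (not kernel code) of the two ACK flags
 * described above. It omits credits, sequence numbers and the atomic
 * bit operations, keeping only the "at most one ACK-only frame in
 * flight" rule; the names mirror the kernel flags but are plain booleans.
 */
#include <stdbool.h>
#include <stdio.h>

static bool ack_requested;      /* something still needs to be acked */
static bool ack_in_flight;      /* the single ACK WR is on the send queue */

static void attempt_ack(void)
{
        if (!ack_requested)
                return;
        if (ack_in_flight) {            /* postpone: only one ACK WR at a time */
                printf("ack delayed\n");
                return;
        }
        ack_in_flight = true;
        ack_requested = false;
        printf("ack posted\n");
}

static void ack_send_complete(void)
{
        ack_in_flight = false;          /* the ACK WR completed */
        attempt_ack();                  /* send any ACK postponed meanwhile */
}

int main(void)
{
        ack_requested = true;
        attempt_ack();                  /* "ack posted" */
        ack_requested = true;           /* more data arrived while the WR was out */
        attempt_ack();                  /* "ack delayed" */
        ack_send_complete();            /* "ack posted" again */
        return 0;
}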
477/*
478 * When we get here, we're called from the recv queue handler.
479 * Check whether we ought to transmit an ACK.
480 */
481void rds_iw_attempt_ack(struct rds_iw_connection *ic)
482{
483 unsigned int adv_credits;
484
485 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
486 return;
487
488 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
489 rds_iw_stats_inc(s_iw_ack_send_delayed);
490 return;
491 }
492
493 /* Can we get a send credit? */
494 if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
495 rds_iw_stats_inc(s_iw_tx_throttle);
496 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
497 return;
498 }
499
500 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
501 rds_iw_send_ack(ic, adv_credits);
502}
503
504/*
505 * We get here from the send completion handler, when the
506 * adapter tells us the ACK frame was sent.
507 */
508void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
509{
510 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
511 rds_iw_attempt_ack(ic);
512}
513
514/*
515 * This is called by the regular xmit code when it wants to piggyback
516 * an ACK on an outgoing frame.
517 */
518u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
519{
520 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
521 rds_iw_stats_inc(s_iw_ack_send_piggybacked);
522 return rds_iw_get_ack(ic);
523}
524
525/*
526 * It's kind of lame that we're copying from the posted receive pages into
527 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
528 * them. But receiving new congestion bitmaps should be a *rare* event, so
529 * hopefully we won't need to invest that complexity in making it more
530 * efficient. By copying we can share a simpler core with TCP which has to
531 * copy.
532 */
533static void rds_iw_cong_recv(struct rds_connection *conn,
534 struct rds_iw_incoming *iwinc)
535{
536 struct rds_cong_map *map;
537 unsigned int map_off;
538 unsigned int map_page;
539 struct rds_page_frag *frag;
540 unsigned long frag_off;
541 unsigned long to_copy;
542 unsigned long copied;
543 uint64_t uncongested = 0;
544 void *addr;
545
546 /* catch completely corrupt packets */
547 if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
548 return;
549
550 map = conn->c_fcong;
551 map_page = 0;
552 map_off = 0;
553
554 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
555 frag_off = 0;
556
557 copied = 0;
558
559 while (copied < RDS_CONG_MAP_BYTES) {
560 uint64_t *src, *dst;
561 unsigned int k;
562
563 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
564 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
565
566 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
567
568 src = addr + frag_off;
569 dst = (void *)map->m_page_addrs[map_page] + map_off;
570 for (k = 0; k < to_copy; k += 8) {
571 /* Record ports that became uncongested, ie
572 * bits that changed from 0 to 1. */
573 uncongested |= ~(*src) & *dst;
574 *dst++ = *src++;
575 }
576 kunmap_atomic(addr, KM_SOFTIRQ0);
577
578 copied += to_copy;
579
580 map_off += to_copy;
581 if (map_off == PAGE_SIZE) {
582 map_off = 0;
583 map_page++;
584 }
585
586 frag_off += to_copy;
587 if (frag_off == RDS_FRAG_SIZE) {
588 frag = list_entry(frag->f_item.next,
589 struct rds_page_frag, f_item);
590 frag_off = 0;
591 }
592 }
593
594 /* the congestion map is in little endian order */
595 uncongested = le64_to_cpu(uncongested);
596
597 rds_cong_map_updated(map, uncongested);
598}
599
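/*
 * Minimal standalone sketch (not part of this file) of the bit
 * arithmetic in rds_iw_cong_recv() above: for each 64-bit word,
 * "uncongested" accumulates the bits that are set in the stored map
 * word but clear in the incoming word, before the incoming word
 * overwrites the stored one. The two hex values are arbitrary examples.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t dst = 0xF0F0;          /* word from the stored congestion map */
        uint64_t src = 0x00F8;          /* corresponding word from the incoming map */
        uint64_t uncongested = 0;

        uncongested |= ~src & dst;      /* set before, clear now */
        dst = src;                      /* the incoming word replaces the stored one */

        printf("uncongested = 0x%llx, stored word now 0x%llx\n",
               (unsigned long long)uncongested,         /* prints 0xf000 */
               (unsigned long long)dst);                /* prints 0xf8 */
        return 0;
}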
600/*
601 * Rings are posted with all the allocations they'll need to queue the
602 * incoming message to the receiving socket so this can't fail.
603 * All fragments start with a header, so we can make sure we're not receiving
604 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
605 */
606struct rds_iw_ack_state {
607 u64 ack_next;
608 u64 ack_recv;
609 unsigned int ack_required:1;
610 unsigned int ack_next_valid:1;
611 unsigned int ack_recv_valid:1;
612};
613
614static void rds_iw_process_recv(struct rds_connection *conn,
615 struct rds_iw_recv_work *recv, u32 byte_len,
616 struct rds_iw_ack_state *state)
617{
618 struct rds_iw_connection *ic = conn->c_transport_data;
619 struct rds_iw_incoming *iwinc = ic->i_iwinc;
620 struct rds_header *ihdr, *hdr;
621
622 /* XXX shut down the connection if port 0,0 are seen? */
623
624 rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
625 byte_len);
626
627 if (byte_len < sizeof(struct rds_header)) {
628 rds_iw_conn_error(conn, "incoming message "
629 "from %pI4 didn't inclue a "
630 "header, disconnecting and "
631 "reconnecting\n",
632 &conn->c_faddr);
633 return;
634 }
635 byte_len -= sizeof(struct rds_header);
636
637 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
638
639 /* Validate the checksum. */
640 if (!rds_message_verify_checksum(ihdr)) {
641 rds_iw_conn_error(conn, "incoming message "
642 "from %pI4 has corrupted header - "
643 "forcing a reconnect\n",
644 &conn->c_faddr);
645 rds_stats_inc(s_recv_drop_bad_checksum);
646 return;
647 }
648
649 /* Process the ACK sequence which comes with every packet */
650 state->ack_recv = be64_to_cpu(ihdr->h_ack);
651 state->ack_recv_valid = 1;
652
653 /* Process the credits update if there was one */
654 if (ihdr->h_credit)
655 rds_iw_send_add_credits(conn, ihdr->h_credit);
656
657 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
658 /* This is an ACK-only packet. It gets special
659 * treatment here because, historically, ACKs
660 * were rather special beasts.
661 */
662 rds_iw_stats_inc(s_iw_ack_received);
663
664 /*
665 * Usually the frags make their way on to incs and are then freed as
666 * the inc is freed. We don't go that route, so we have to drop the
667 * page ref ourselves. We can't just leave the page on the recv
668 * because that confuses the dma mapping of pages and each recv's use
669 * of a partial page. We can leave the frag, though; it will be
670 * reused.
671 *
672 * FIXME: Fold this into the code path below.
673 */
674 rds_iw_frag_drop_page(recv->r_frag);
675 return;
676 }
677
678 /*
679 * If we don't already have an inc on the connection then this
680 * fragment has a header and starts a message. Copy its header
681 * into the inc and save the inc so we can hang upcoming fragments
682 * off its list.
683 */
684 if (iwinc == NULL) {
685 iwinc = recv->r_iwinc;
686 recv->r_iwinc = NULL;
687 ic->i_iwinc = iwinc;
688
689 hdr = &iwinc->ii_inc.i_hdr;
690 memcpy(hdr, ihdr, sizeof(*hdr));
691 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
692
693 rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
694 ic->i_recv_data_rem, hdr->h_flags);
695 } else {
696 hdr = &iwinc->ii_inc.i_hdr;
697 /* We can't just use memcmp here; fragments of a
698 * single message may carry different ACKs */
699 if (hdr->h_sequence != ihdr->h_sequence
700 || hdr->h_len != ihdr->h_len
701 || hdr->h_sport != ihdr->h_sport
702 || hdr->h_dport != ihdr->h_dport) {
703 rds_iw_conn_error(conn,
704 "fragment header mismatch; forcing reconnect\n");
705 return;
706 }
707 }
708
709 list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
710 recv->r_frag = NULL;
711
712 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
713 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
714 else {
715 ic->i_recv_data_rem = 0;
716 ic->i_iwinc = NULL;
717
718 if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
719 rds_iw_cong_recv(conn, iwinc);
720 else {
721 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
722 &iwinc->ii_inc, GFP_ATOMIC,
723 KM_SOFTIRQ0);
724 state->ack_next = be64_to_cpu(hdr->h_sequence);
725 state->ack_next_valid = 1;
726 }
727
728 /* Evaluate the ACK_REQUIRED flag *after* we received
729 * the complete frame, and after bumping the next_rx
730 * sequence. */
731 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
732 rds_stats_inc(s_recv_ack_required);
733 state->ack_required = 1;
734 }
735
736 rds_inc_put(&iwinc->ii_inc);
737 }
738}
739
740/*
741 * Plucking the oldest entry from the ring can be done concurrently with
742 * the thread refilling the ring. Each ring operation is protected by
743 * spinlocks and the transient state of refilling doesn't change the
744 * recording of which entry is oldest.
745 *
746 * This relies on IB only calling one cq comp_handler for each cq so that
747 * there will only be one caller of rds_recv_incoming() per RDS connection.
748 */
749void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
750{
751 struct rds_connection *conn = context;
752 struct rds_iw_connection *ic = conn->c_transport_data;
753 struct ib_wc wc;
754 struct rds_iw_ack_state state = { 0, };
755 struct rds_iw_recv_work *recv;
756
757 rdsdebug("conn %p cq %p\n", conn, cq);
758
759 rds_iw_stats_inc(s_iw_rx_cq_call);
760
761 ib_req_notify_cq(cq, IB_CQ_SOLICITED);
762
763 while (ib_poll_cq(cq, 1, &wc) > 0) {
764 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
765 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
766 be32_to_cpu(wc.ex.imm_data));
767 rds_iw_stats_inc(s_iw_rx_cq_event);
768
769 recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
770
771 rds_iw_recv_unmap_page(ic, recv);
772
773 /*
774 * Also process recvs in connecting state because it is possible
775 * to get a recv completion _before_ the rdmacm ESTABLISHED
776 * event is processed.
777 */
778 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
779 /* We expect errors as the qp is drained during shutdown */
780 if (wc.status == IB_WC_SUCCESS) {
781 rds_iw_process_recv(conn, recv, wc.byte_len, &state);
782 } else {
783 rds_iw_conn_error(conn, "recv completion on "
784 "%pI4 had status %u, disconnecting and "
785 "reconnecting\n", &conn->c_faddr,
786 wc.status);
787 }
788 }
789
790 rds_iw_ring_free(&ic->i_recv_ring, 1);
791 }
792
793 if (state.ack_next_valid)
794 rds_iw_set_ack(ic, state.ack_next, state.ack_required);
795 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
796 rds_send_drop_acked(conn, state.ack_recv, NULL);
797 ic->i_ack_recv = state.ack_recv;
798 }
799 if (rds_conn_up(conn))
800 rds_iw_attempt_ack(ic);
801
802 /* If we ever end up with a really empty receive ring, we're
803 * in deep trouble, as the sender will definitely see RNR
804 * timeouts. */
805 if (rds_iw_ring_empty(&ic->i_recv_ring))
806 rds_iw_stats_inc(s_iw_rx_ring_empty);
807
808 /*
809 * If the ring is running low, then schedule the thread to refill.
810 */
811 if (rds_iw_ring_low(&ic->i_recv_ring))
812 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
813}
814
815int rds_iw_recv(struct rds_connection *conn)
816{
817 struct rds_iw_connection *ic = conn->c_transport_data;
818 int ret = 0;
819
820 rdsdebug("conn %p\n", conn);
821
822 /*
823 * If we get a temporary posting failure in this context then
824 * we're really low and we want the caller to back off for a bit.
825 */
826 mutex_lock(&ic->i_recv_mutex);
827 if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
828 ret = -ENOMEM;
829 else
830 rds_iw_stats_inc(s_iw_rx_refill_from_thread);
831 mutex_unlock(&ic->i_recv_mutex);
832
833 if (rds_conn_up(conn))
834 rds_iw_attempt_ack(ic);
835
836 return ret;
837}
838
839int __init rds_iw_recv_init(void)
840{
841 struct sysinfo si;
842 int ret = -ENOMEM;
843
844 /* Default to roughly a third of all available RAM for recv memory */
845 si_meminfo(&si);
846 rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
847
848 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
849 sizeof(struct rds_iw_incoming),
850 0, 0, NULL);
851 if (rds_iw_incoming_slab == NULL)
852 goto out;
853
854 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
855 sizeof(struct rds_page_frag),
856 0, 0, NULL);
857 if (rds_iw_frag_slab == NULL)
858 kmem_cache_destroy(rds_iw_incoming_slab);
859 else
860 ret = 0;
861out:
862 return ret;
863}
864
865void rds_iw_recv_exit(void)
866{
867 kmem_cache_destroy(rds_iw_incoming_slab);
868 kmem_cache_destroy(rds_iw_frag_slab);
869}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 000000000000..d422d4b5deef
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "iw.h"
37
38/*
39 * Locking for IB rings.
40 * We assume that allocation is always protected by a mutex
41 * in the caller (this is a valid assumption for the current
42 * implementation).
43 *
44 * Freeing always happens in an interrupt, and hence only
45 * races with allocations, but not with other free()s.
46 *
47 * The interaction between allocation and freeing is that
48 * the alloc code has to determine the number of free entries.
49 * To this end, we maintain two counters: an allocation counter
50 * and a free counter. Both are allowed to run freely, and wrap
51 * around.
52 * The number of used entries is always (alloc_ctr - free_ctr), which never exceeds NR.
53 *
54 * The current implementation makes free_ctr atomic. When the
55 * caller finds an allocation fails, it should set an "alloc fail"
56 * bit and retry the allocation. The "alloc fail" bit essentially tells
57 * the CQ completion handlers to wake it up after freeing some
58 * more entries.
59 */
60
61/*
62 * This only happens on shutdown.
63 */
64DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
65
66void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
67{
68 memset(ring, 0, sizeof(*ring));
69 ring->w_nr = nr;
70 rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
71}
72
73static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
74{
75 u32 diff;
76
77 /* This assumes that atomic_t has at least as many bits as u32 */
78 diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
79 BUG_ON(diff > ring->w_nr);
80
81 return diff;
82}
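
The free-running counter scheme described in the comment above can be demonstrated in a few lines of standalone userspace C; the struct and names below are invented for the sketch. The point is only that unsigned 32-bit subtraction keeps yielding the in-use count even after both counters wrap, which is what __rds_iw_ring_used() relies on.

#include <assert.h>
#include <stdint.h>

/*
 * Standalone illustration, not kernel code: both counters run freely and
 * wrap, yet the unsigned 32-bit subtraction still gives the number of
 * in-use entries, because the ring never holds anywhere near UINT32_MAX
 * outstanding allocations.
 */
struct toy_ring {
	uint32_t nr;		/* ring size */
	uint32_t alloc_ctr;	/* bumped on every allocation */
	uint32_t free_ctr;	/* bumped on every completion */
};

static uint32_t toy_ring_used(const struct toy_ring *ring)
{
	return ring->alloc_ctr - ring->free_ctr;	/* wraps safely */
}

int main(void)
{
	struct toy_ring ring = { .nr = 8 };

	/* park both counters just below the 32-bit wrap point */
	ring.alloc_ctr = UINT32_MAX - 1;
	ring.free_ctr = UINT32_MAX - 1;

	ring.alloc_ctr += 5;	/* allocate 5 entries; the counter wraps past 0 */
	ring.free_ctr += 2;	/* complete 2 of them */

	assert(toy_ring_used(&ring) == 3);
	assert(toy_ring_used(&ring) <= ring.nr);
	return 0;
}
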
83
84void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
85{
86 /* We only ever get called from the connection setup code,
87 * prior to creating the QP. */
88 BUG_ON(__rds_iw_ring_used(ring));
89 ring->w_nr = nr;
90}
91
92static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
93{
94 return __rds_iw_ring_used(ring) == 0;
95}
96
97u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
98{
99 u32 ret = 0, avail;
100
101 avail = ring->w_nr - __rds_iw_ring_used(ring);
102
103 rdsdebug("ring %p val %u next %u free %u\n", ring, val,
104 ring->w_alloc_ptr, avail);
105
106 if (val && avail) {
107 ret = min(val, avail);
108 *pos = ring->w_alloc_ptr;
109
110 ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
111 ring->w_alloc_ctr += ret;
112 }
113
114 return ret;
115}
116
117void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
118{
119 ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
120 atomic_add(val, &ring->w_free_ctr);
121
122 if (__rds_iw_ring_empty(ring) &&
123 waitqueue_active(&rds_iw_ring_empty_wait))
124 wake_up(&rds_iw_ring_empty_wait);
125}
126
127void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
128{
129 ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
130 ring->w_alloc_ctr -= val;
131}
132
133int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
134{
135 return __rds_iw_ring_empty(ring);
136}
137
138int rds_iw_ring_low(struct rds_iw_work_ring *ring)
139{
140 return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
141}
142
143
144/*
145 * returns the oldest alloced ring entry. This will be the next one
146 * freed. This can't be called if there are none allocated.
147 */
148u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
149{
150 return ring->w_free_ptr;
151}
152
153/*
154 * returns the number of completed work requests.
155 */
156
157u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
158{
159 u32 ret;
160
161 if (oldest <= (unsigned long long)wr_id)
162 ret = (unsigned long long)wr_id - oldest + 1;
163 else
164 ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
165
166 rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
167 wr_id, oldest);
168 return ret;
169}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 000000000000..22dd38ffd608
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,975 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/device.h>
36#include <linux/dmapool.h>
37
38#include "rds.h"
39#include "rdma.h"
40#include "iw.h"
41
42static void rds_iw_send_rdma_complete(struct rds_message *rm,
43 int wc_status)
44{
45 int notify_status;
46
47 switch (wc_status) {
48 case IB_WC_WR_FLUSH_ERR:
49 return;
50
51 case IB_WC_SUCCESS:
52 notify_status = RDS_RDMA_SUCCESS;
53 break;
54
55 case IB_WC_REM_ACCESS_ERR:
56 notify_status = RDS_RDMA_REMOTE_ERROR;
57 break;
58
59 default:
60 notify_status = RDS_RDMA_OTHER_ERROR;
61 break;
62 }
63 rds_rdma_send_complete(rm, notify_status);
64}
65
66static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
67 struct rds_rdma_op *op)
68{
69 if (op->r_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0;
74 }
75}
76
77static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
78 struct rds_iw_send_work *send,
79 int wc_status)
80{
81 struct rds_message *rm = send->s_rm;
82
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84
85 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents,
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we received the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_iw_send_rdma_complete(rm, wc_status);
113
114 if (rm->m_rdma_op->r_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
116 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
118 }
119
120 /* If anyone waited for this message to get flushed out, wake
121 * them up now */
122 rds_message_unmapped(rm);
123
124 rds_message_put(rm);
125 send->s_rm = NULL;
126}
127
128void rds_iw_send_init_ring(struct rds_iw_connection *ic)
129{
130 struct rds_iw_send_work *send;
131 u32 i;
132
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge;
135
136 send->s_rm = NULL;
137 send->s_op = NULL;
138 send->s_mapping = NULL;
139
140 send->s_wr.next = NULL;
141 send->s_wr.wr_id = i;
142 send->s_wr.sg_list = send->s_sge;
143 send->s_wr.num_sge = 1;
144 send->s_wr.opcode = IB_WR_SEND;
145 send->s_wr.send_flags = 0;
146 send->s_wr.ex.imm_data = 0;
147
148 sge = rds_iw_data_sge(ic, send->s_sge);
149 sge->lkey = 0;
150
151 sge = rds_iw_header_sge(ic, send->s_sge);
152 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
153 sge->length = sizeof(struct rds_header);
154 sge->lkey = 0;
155
156 send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
157 if (IS_ERR(send->s_mr)) {
158 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
159 break;
160 }
161
162 send->s_page_list = ib_alloc_fast_reg_page_list(
163 ic->i_cm_id->device, fastreg_message_size);
164 if (IS_ERR(send->s_page_list)) {
165 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
166 break;
167 }
168 }
169}
170
171void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
172{
173 struct rds_iw_send_work *send;
174 u32 i;
175
176 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
177 BUG_ON(!send->s_mr);
178 ib_dereg_mr(send->s_mr);
179 BUG_ON(!send->s_page_list);
180 ib_free_fast_reg_page_list(send->s_page_list);
181 if (send->s_wr.opcode == 0xdead)
182 continue;
183 if (send->s_rm)
184 rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
185 if (send->s_op)
186 rds_iw_send_unmap_rdma(ic, send->s_op);
187 }
188}
189
190/*
191 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
192 * operations performed in the send path. As the sender allocs and potentially
193 * unallocs the next free entry in the ring it doesn't alter which is
194 * the next to be freed, which is what this is concerned with.
195 */
196void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
197{
198 struct rds_connection *conn = context;
199 struct rds_iw_connection *ic = conn->c_transport_data;
200 struct ib_wc wc;
201 struct rds_iw_send_work *send;
202 u32 completed;
203 u32 oldest;
204 u32 i;
205 int ret;
206
207 rdsdebug("cq %p conn %p\n", cq, conn);
208 rds_iw_stats_inc(s_iw_tx_cq_call);
209 ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
210 if (ret)
211 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
212
213 while (ib_poll_cq(cq, 1, &wc) > 0) {
214 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
215 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
216 be32_to_cpu(wc.ex.imm_data));
217 rds_iw_stats_inc(s_iw_tx_cq_event);
218
219 if (wc.status != IB_WC_SUCCESS) {
220 printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
221 break;
222 }
223
224 if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
225 ic->i_fastreg_posted = 0;
226 continue;
227 }
228
229 if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
230 ic->i_fastreg_posted = 1;
231 continue;
232 }
233
234 if (wc.wr_id == RDS_IW_ACK_WR_ID) {
235 if (ic->i_ack_queued + HZ/2 < jiffies)
236 rds_iw_stats_inc(s_iw_tx_stalled);
237 rds_iw_ack_send_complete(ic);
238 continue;
239 }
240
241 oldest = rds_iw_ring_oldest(&ic->i_send_ring);
242
243 completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
244
245 for (i = 0; i < completed; i++) {
246 send = &ic->i_sends[oldest];
247
248 /* In the error case, wc.opcode sometimes contains garbage */
249 switch (send->s_wr.opcode) {
250 case IB_WR_SEND:
251 if (send->s_rm)
252 rds_iw_send_unmap_rm(ic, send, wc.status);
253 break;
254 case IB_WR_FAST_REG_MR:
255 case IB_WR_RDMA_WRITE:
256 case IB_WR_RDMA_READ:
257 case IB_WR_RDMA_READ_WITH_INV:
258 /* Nothing to be done - the SG list will be unmapped
259 * when the SEND completes. */
260 break;
261 default:
262 if (printk_ratelimit())
263 printk(KERN_NOTICE
264 "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
265 __func__, send->s_wr.opcode);
266 break;
267 }
268
269 send->s_wr.opcode = 0xdead;
270 send->s_wr.num_sge = 1;
271 if (send->s_queued + HZ/2 < jiffies)
272 rds_iw_stats_inc(s_iw_tx_stalled);
273
274 /* If an RDMA operation produced an error, signal this right
275 * away. If we don't, the subsequent SEND that goes with this
276 * RDMA will be canceled with ERR_WFLUSH, and the application
277 * will never learn that the RDMA failed. */
278 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
279 struct rds_message *rm;
280
281 rm = rds_send_get_message(conn, send->s_op);
282 if (rm)
283 rds_iw_send_rdma_complete(rm, wc.status);
284 }
285
286 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
287 }
288
289 rds_iw_ring_free(&ic->i_send_ring, completed);
290
291 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
292 || test_bit(0, &conn->c_map_queued))
293 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
294
295 /* We expect errors as the qp is drained during shutdown */
296 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
297 rds_iw_conn_error(conn,
298 "send completion on %pI4 "
299 "had status %u, disconnecting and reconnecting\n",
300 &conn->c_faddr, wc.status);
301 }
302 }
303}
304
305/*
306 * This is the main function for allocating credits when sending
307 * messages.
308 *
309 * Conceptually, we have two counters:
310 * - send credits: this tells us how many WRs we're allowed
311 * to submit without overrunning the receiver's queue. For
312 * each SEND WR we post, we decrement this by one.
313 *
314 * - posted credits: this tells us how many WRs we recently
315 * posted to the receive queue. This value is transferred
316 * to the peer as a "credit update" in a RDS header field.
317 * Every time we transmit credits to the peer, we subtract
318 * the amount of transferred credits from this counter.
319 *
320 * It is essential that we avoid situations where both sides have
321 * exhausted their send credits, and are unable to send new credits
322 * to the peer. We achieve this by requiring that we send at least
323 * one credit update to the peer before exhausting our credits.
324 * When new credits arrive, we subtract one credit that is withheld
325 * until we've posted new buffers and are ready to transmit these
326 * credits (see rds_iw_send_add_credits below).
327 *
328 * The RDS send code is essentially single-threaded; rds_send_xmit
329 * grabs c_send_lock to ensure exclusive access to the send ring.
330 * However, the ACK sending code is independent and can race with
331 * message SENDs.
332 *
333 * In the send path, we need to update the counters for send credits
334 * and the counter of posted buffers atomically - when we use the
335 * last available credit, we cannot allow another thread to race us
336 * and grab the posted credits counter. Hence, we have to use a
337 * spinlock to protect the credit counter, or use atomics.
338 *
339 * Spinlocks shared between the send and the receive path are bad,
340 * because they create unnecessary delays. An early implementation
341 * using a spinlock showed a 5% degradation in throughput at some
342 * loads.
343 *
344 * This implementation avoids spinlocks completely, putting both
345 * counters into a single atomic, and updating that atomic using
346 * atomic_add (in the receive path, when receiving fresh credits),
347 * and using atomic_cmpxchg when updating the two counters.
348 */
349int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
350 u32 wanted, u32 *adv_credits, int need_posted)
351{
352 unsigned int avail, posted, got = 0, advertise;
353 long oldval, newval;
354
355 *adv_credits = 0;
356 if (!ic->i_flowctl)
357 return wanted;
358
359try_again:
360 advertise = 0;
361 oldval = newval = atomic_read(&ic->i_credits);
362 posted = IB_GET_POST_CREDITS(oldval);
363 avail = IB_GET_SEND_CREDITS(oldval);
364
365 rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
366 wanted, avail, posted);
367
368 /* The last credit must be used to send a credit update. */
369 if (avail && !posted)
370 avail--;
371
372 if (avail < wanted) {
373 struct rds_connection *conn = ic->i_cm_id->context;
374
375 /* Oops, there aren't that many credits left! */
376 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
377 got = avail;
378 } else {
379 /* Sometimes you get what you want, lalala. */
380 got = wanted;
381 }
382 newval -= IB_SET_SEND_CREDITS(got);
383
384 /*
385 * If need_posted is non-zero, then the caller wants the
386 * posted credits advertised regardless of whether any send
387 * credits are available.
388 */
389 if (posted && (got || need_posted)) {
390 advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
391 newval -= IB_SET_POST_CREDITS(advertise);
392 }
393
394 /* Finally bill everything */
395 if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
396 goto try_again;
397
398 *adv_credits = advertise;
399 return got;
400}
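
A standalone sketch (userspace C11, not kernel code) of the lock-free credit scheme described above: both counters live in one atomic word and are consumed with a compare-exchange retry loop, mirroring the try_again loop in rds_iw_send_grab_credits(). The 16/16-bit packing below is an assumption made for this sketch and need not match the driver's IB_SET_ and IB_GET_ helpers.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Standalone illustration, not kernel code: two credit counters packed
 * into one atomic word and consumed with a compare-exchange retry loop.
 * The 16/16-bit split is an assumption made for this sketch.
 */
#define TOY_SEND_CREDITS(v)	((uint32_t)(v) & 0xffff)
#define TOY_POST_CREDITS(v)	(((uint32_t)(v) & 0xffff) << 16)
#define TOY_GET_SEND(v)		((v) & 0xffff)
#define TOY_GET_POST(v)		(((v) >> 16) & 0xffff)

static uint32_t toy_grab_credits(_Atomic uint32_t *credits, uint32_t wanted)
{
	uint32_t oldval, newval, avail, got;

	do {
		oldval = atomic_load(credits);
		avail = TOY_GET_SEND(oldval);
		got = wanted < avail ? wanted : avail;
		newval = oldval - TOY_SEND_CREDITS(got);
	} while (!atomic_compare_exchange_weak(credits, &oldval, newval));

	return got;
}

int main(void)
{
	_Atomic uint32_t credits = TOY_SEND_CREDITS(10) | TOY_POST_CREDITS(3);
	uint32_t got = toy_grab_credits(&credits, 4);

	/* prints: got 4, send credits left 6, posted 3 */
	printf("got %u, send credits left %u, posted %u\n",
	       (unsigned)got,
	       (unsigned)TOY_GET_SEND(atomic_load(&credits)),
	       (unsigned)TOY_GET_POST(atomic_load(&credits)));
	return 0;
}
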
401
402void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
403{
404 struct rds_iw_connection *ic = conn->c_transport_data;
405
406 if (credits == 0)
407 return;
408
409 rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
410 credits,
411 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
412 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
413
414 atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
415 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
416 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
417
418 WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
419
420 rds_iw_stats_inc(s_iw_rx_credit_updates);
421}
422
423void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
424{
425 struct rds_iw_connection *ic = conn->c_transport_data;
426
427 if (posted == 0)
428 return;
429
430 atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
431
432 /* Decide whether to send an update to the peer now.
433 * If we would send a credit update for every single buffer we
434 * post, we would end up with an ACK storm (ACK arrives,
435 * consumes buffer, we refill the ring, send ACK to remote
436 * advertising the newly posted buffer... ad inf)
437 *
438 * Performance pretty much depends on how often we send
439 * credit updates - too frequent updates mean lots of ACKs.
440 * Too infrequent updates, and the peer will run out of
441 * credits and will have to throttle.
442 * For the time being, 16 seems to be a good compromise.
443 */
444 if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
445 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
446}
447
448static inline void
449rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
450 struct rds_iw_send_work *send, unsigned int pos,
451 unsigned long buffer, unsigned int length,
452 int send_flags)
453{
454 struct ib_sge *sge;
455
456 WARN_ON(pos != send - ic->i_sends);
457
458 send->s_wr.send_flags = send_flags;
459 send->s_wr.opcode = IB_WR_SEND;
460 send->s_wr.num_sge = 2;
461 send->s_wr.next = NULL;
462 send->s_queued = jiffies;
463 send->s_op = NULL;
464
465 if (length != 0) {
466 sge = rds_iw_data_sge(ic, send->s_sge);
467 sge->addr = buffer;
468 sge->length = length;
469 sge->lkey = rds_iw_local_dma_lkey(ic);
470
471 sge = rds_iw_header_sge(ic, send->s_sge);
472 } else {
473 /* We're sending a packet with no payload. There is only
474 * one SGE */
475 send->s_wr.num_sge = 1;
476 sge = &send->s_sge[0];
477 }
478
479 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
480 sge->length = sizeof(struct rds_header);
481 sge->lkey = rds_iw_local_dma_lkey(ic);
482}
483
484/*
485 * This can be called multiple times for a given message. The first time
486 * we see a message we map its scatterlist into the IB device so that
487 * we can provide that mapped address to the IB scatter gather entries
488 * in the IB work requests. We translate the scatterlist into a series
489 * of work requests that fragment the message. These work requests complete
490 * in order so we pass ownership of the message to the completion handler
491 * once we send the final fragment.
492 *
493 * The RDS core uses the c_send_lock to only enter this function once
494 * per connection. This makes sure that the tx ring alloc/unalloc pairs
495 * don't get out of sync and confuse the ring.
496 */
497int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
498 unsigned int hdr_off, unsigned int sg, unsigned int off)
499{
500 struct rds_iw_connection *ic = conn->c_transport_data;
501 struct ib_device *dev = ic->i_cm_id->device;
502 struct rds_iw_send_work *send = NULL;
503 struct rds_iw_send_work *first;
504 struct rds_iw_send_work *prev;
505 struct ib_send_wr *failed_wr;
506 struct scatterlist *scat;
507 u32 pos;
508 u32 i;
509 u32 work_alloc;
510 u32 credit_alloc;
511 u32 posted;
512 u32 adv_credits = 0;
513 int send_flags = 0;
514 int sent;
515 int ret;
516 int flow_controlled = 0;
517
518 BUG_ON(off % RDS_FRAG_SIZE);
519 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
520
521 /* Fastreg support */
522 if (rds_rdma_cookie_key(rm->m_rdma_cookie)
523 && !ic->i_fastreg_posted) {
524 ret = -EAGAIN;
525 goto out;
526 }
527
528 /* FIXME we may overallocate here */
529 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
530 i = 1;
531 else
532 i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
533
534 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
535 if (work_alloc == 0) {
536 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
537 rds_iw_stats_inc(s_iw_tx_ring_full);
538 ret = -ENOMEM;
539 goto out;
540 }
541
542 credit_alloc = work_alloc;
543 if (ic->i_flowctl) {
544 credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
545 adv_credits += posted;
546 if (credit_alloc < work_alloc) {
547 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
548 work_alloc = credit_alloc;
549 flow_controlled++;
550 }
551 if (work_alloc == 0) {
552 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
553 rds_iw_stats_inc(s_iw_tx_throttle);
554 ret = -ENOMEM;
555 goto out;
556 }
557 }
558
559 /* map the message the first time we see it */
560 if (ic->i_rm == NULL) {
561 /*
562 printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
563 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
564 rm->m_inc.i_hdr.h_flags,
565 be32_to_cpu(rm->m_inc.i_hdr.h_len));
566 */
567 if (rm->m_nents) {
568 rm->m_count = ib_dma_map_sg(dev,
569 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
570 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
571 if (rm->m_count == 0) {
572 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
573 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
574 ret = -ENOMEM; /* XXX ? */
575 goto out;
576 }
577 } else {
578 rm->m_count = 0;
579 }
580
581 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
582 ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
583 rds_message_addref(rm);
584 ic->i_rm = rm;
585
586 /* Finalize the header */
587 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
588 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
589 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
590 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
591
592 /* If it has an RDMA op, tell the peer we did it. This is
593 * used by the peer to release use-once RDMA MRs. */
594 if (rm->m_rdma_op) {
595 struct rds_ext_header_rdma ext_hdr;
596
597 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
598 rds_message_add_extension(&rm->m_inc.i_hdr,
599 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
600 }
601 if (rm->m_rdma_cookie) {
602 rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
603 rds_rdma_cookie_key(rm->m_rdma_cookie),
604 rds_rdma_cookie_offset(rm->m_rdma_cookie));
605 }
606
607 /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
608 * we should not do this unless we have a chance of at least
609 * sticking the header into the send ring. Which is why we
610 * should call rds_iw_ring_alloc first. */
611 rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
612 rds_message_make_checksum(&rm->m_inc.i_hdr);
613
614 /*
615 * Update adv_credits since we reset the ACK_REQUIRED bit.
616 */
617 rds_iw_send_grab_credits(ic, 0, &posted, 1);
618 adv_credits += posted;
619 BUG_ON(adv_credits > 255);
620 } else if (ic->i_rm != rm)
621 BUG();
622
623 send = &ic->i_sends[pos];
624 first = send;
625 prev = NULL;
626 scat = &rm->m_sg[sg];
627 sent = 0;
628 i = 0;
629
630 /* Sometimes you want to put a fence between an RDMA
631 * READ and the following SEND.
632 * We could either do this all the time
633 * or when requested by the user. Right now, we let
634 * the application choose.
635 */
636 if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
637 send_flags = IB_SEND_FENCE;
638
639 /*
640 * We could be copying the header into the unused tail of the page.
641 * That would need to be changed in the future when those pages might
642 * be mapped userspace pages or page cache pages. So instead we always
643 * use a second sge and our long-lived ring of mapped headers. We send
644 * the header after the data so that the data payload can be aligned on
645 * the receiver.
646 */
647
648 /* handle a 0-len message */
649 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
650 rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
651 goto add_header;
652 }
653
654 /* if there's data reference it with a chain of work reqs */
655 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
656 unsigned int len;
657
658 send = &ic->i_sends[pos];
659
660 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
661 rds_iw_xmit_populate_wr(ic, send, pos,
662 ib_sg_dma_address(dev, scat) + off, len,
663 send_flags);
664
665 /*
666 * We want to delay signaling completions just enough to get
667 * the batching benefits but not so much that we create dead time
668 * on the wire.
669 */
670 if (ic->i_unsignaled_wrs-- == 0) {
671 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
672 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
673 }
674
675 ic->i_unsignaled_bytes -= len;
676 if (ic->i_unsignaled_bytes <= 0) {
677 ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
678 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
679 }
680
681 /*
682 * Always signal the last one if we're stopping due to flow control.
683 */
684 if (flow_controlled && i == (work_alloc-1))
685 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
686
687 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
688 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
689
690 sent += len;
691 off += len;
692 if (off == ib_sg_dma_len(dev, scat)) {
693 scat++;
694 off = 0;
695 }
696
697add_header:
698 /* Tack on the header after the data. The header SGE should already
699 * have been set up to point to the right header buffer. */
700 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
701
702 if (0) {
703 struct rds_header *hdr = &ic->i_send_hdrs[pos];
704
705 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
706 be16_to_cpu(hdr->h_dport),
707 hdr->h_flags,
708 be32_to_cpu(hdr->h_len));
709 }
710 if (adv_credits) {
711 struct rds_header *hdr = &ic->i_send_hdrs[pos];
712
713 /* add credit and redo the header checksum */
714 hdr->h_credit = adv_credits;
715 rds_message_make_checksum(hdr);
716 adv_credits = 0;
717 rds_iw_stats_inc(s_iw_tx_credit_updates);
718 }
719
720 if (prev)
721 prev->s_wr.next = &send->s_wr;
722 prev = send;
723
724 pos = (pos + 1) % ic->i_send_ring.w_nr;
725 }
726
727 /* Account the RDS header in the number of bytes we sent, but just once.
728 * The caller has no concept of fragmentation. */
729 if (hdr_off == 0)
730 sent += sizeof(struct rds_header);
731
732 /* if we finished the message then send completion owns it */
733 if (scat == &rm->m_sg[rm->m_count]) {
734 prev->s_rm = ic->i_rm;
735 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
736 ic->i_rm = NULL;
737 }
738
739 if (i < work_alloc) {
740 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
741 work_alloc = i;
742 }
743 if (ic->i_flowctl && i < credit_alloc)
744 rds_iw_send_add_credits(conn, credit_alloc - i);
745
746 /* XXX need to worry about failed_wr and partial sends. */
747 failed_wr = &first->s_wr;
748 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
749 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
750 first, &first->s_wr, ret, failed_wr);
751 BUG_ON(failed_wr != &first->s_wr);
752 if (ret) {
753 printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
754 "returned %d\n", &conn->c_faddr, ret);
755 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
756 if (prev->s_rm) {
757 ic->i_rm = prev->s_rm;
758 prev->s_rm = NULL;
759 }
760 goto out;
761 }
762
763 ret = sent;
764out:
765 BUG_ON(adv_credits);
766 return ret;
767}
768
769static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
770{
771 BUG_ON(nent > send->s_page_list->max_page_list_len);
772 /*
773 * Perform a WR for the fast_reg_mr. Each individual page
774 * in the sg list is added to the fast reg page list and placed
775 * inside the fast_reg_mr WR.
776 */
777 send->s_wr.opcode = IB_WR_FAST_REG_MR;
778 send->s_wr.wr.fast_reg.length = len;
779 send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
780 send->s_wr.wr.fast_reg.page_list = send->s_page_list;
781 send->s_wr.wr.fast_reg.page_list_len = nent;
782 send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
783 send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
784 send->s_wr.wr.fast_reg.iova_start = sg_addr;
785
786 ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
787}
788
789int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
790{
791 struct rds_iw_connection *ic = conn->c_transport_data;
792 struct rds_iw_send_work *send = NULL;
793 struct rds_iw_send_work *first;
794 struct rds_iw_send_work *prev;
795 struct ib_send_wr *failed_wr;
796 struct rds_iw_device *rds_iwdev;
797 struct scatterlist *scat;
798 unsigned long len;
799 u64 remote_addr = op->r_remote_addr;
800 u32 pos, fr_pos;
801 u32 work_alloc;
802 u32 i;
803 u32 j;
804 int sent;
805 int ret;
806 int num_sge;
807
808 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
809
810 /* map the message the first time we see it */
811 if (!op->r_mapped) {
812 op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
813 op->r_sg, op->r_nents, (op->r_write) ?
814 DMA_TO_DEVICE : DMA_FROM_DEVICE);
815 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
816 if (op->r_count == 0) {
817 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
818 ret = -ENOMEM; /* XXX ? */
819 goto out;
820 }
821
822 op->r_mapped = 1;
823 }
824
825 if (!op->r_write) {
826 /* Alloc space on the send queue for the fastreg */
827 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
828 if (work_alloc != 1) {
829 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
830 rds_iw_stats_inc(s_iw_tx_ring_full);
831 ret = -ENOMEM;
832 goto out;
833 }
834 }
835
836 /*
837 * Instead of knowing how to return a partial rdma read/write we insist that there
838 * be enough work requests to send the entire message.
839 */
840 i = ceil(op->r_count, rds_iwdev->max_sge);
841
842 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
843 if (work_alloc != i) {
844 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
845 rds_iw_stats_inc(s_iw_tx_ring_full);
846 ret = -ENOMEM;
847 goto out;
848 }
849
850 send = &ic->i_sends[pos];
851 if (!op->r_write) {
852 first = prev = &ic->i_sends[fr_pos];
853 } else {
854 first = send;
855 prev = NULL;
856 }
857 scat = &op->r_sg[0];
858 sent = 0;
859 num_sge = op->r_count;
860
861 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
862 send->s_wr.send_flags = 0;
863 send->s_queued = jiffies;
864
865 /*
866 * We want to delay signaling completions just enough to get
867 * the batching benefits but not so much that we create dead time on the wire.
868 */
869 if (ic->i_unsignaled_wrs-- == 0) {
870 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
871 send->s_wr.send_flags = IB_SEND_SIGNALED;
872 }
873
874 /* To avoid needing plumbing to invalidate the fastreg_mr used for local
875 * access after RDS is finished with it, we use IB_WR_RDMA_READ_WITH_INV,
876 * which invalidates it after the read has completed.
877 */
878 if (op->r_write)
879 send->s_wr.opcode = IB_WR_RDMA_WRITE;
880 else
881 send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
882
883 send->s_wr.wr.rdma.remote_addr = remote_addr;
884 send->s_wr.wr.rdma.rkey = op->r_key;
885 send->s_op = op;
886
887 if (num_sge > rds_iwdev->max_sge) {
888 send->s_wr.num_sge = rds_iwdev->max_sge;
889 num_sge -= rds_iwdev->max_sge;
890 } else
891 send->s_wr.num_sge = num_sge;
892
893 send->s_wr.next = NULL;
894
895 if (prev)
896 prev->s_wr.next = &send->s_wr;
897
898 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
899 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
900
901 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
902 send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
903 else {
904 send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
905 send->s_sge[j].length = len;
906 send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
907 }
908
909 sent += len;
910 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
911 remote_addr += len;
912
913 scat++;
914 }
915
916 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
917 send->s_wr.num_sge = 1;
918 send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
919 send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
920 send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
921 }
922
923 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
924 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
925
926 prev = send;
927 if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
928 send = ic->i_sends;
929 }
930
931 /* if we finished the message then send completion owns it */
932 if (scat == &op->r_sg[op->r_count])
933 first->s_wr.send_flags = IB_SEND_SIGNALED;
934
935 if (i < work_alloc) {
936 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
937 work_alloc = i;
938 }
939
940 /* On iWARP, local memory access by a remote system (i.e., an RDMA Read) is
941 * not recommended. Putting the lkey on the wire is a security hole, as it
942 * can allow access to all of the remote system's memory. Some adapters do
943 * not allow using the lkey for this at all. To avoid this, use a
944 * fastreg_mr (or possibly a dma_mr).
945 */
946 if (!op->r_write) {
947 rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
948 op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
949 work_alloc++;
950 }
951
952 failed_wr = &first->s_wr;
953 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
954 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
955 first, &first->s_wr, ret, failed_wr);
956 BUG_ON(failed_wr != &first->s_wr);
957 if (ret) {
958 printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
959 "returned %d\n", &conn->c_faddr, ret);
960 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
961 goto out;
962 }
963
964out:
965 return ret;
966}
967
968void rds_iw_xmit_complete(struct rds_connection *conn)
969{
970 struct rds_iw_connection *ic = conn->c_transport_data;
971
972 /* We may have a pending ACK or window update we were unable
973 * to send previously (due to flow control). Try again. */
974 rds_iw_attempt_ack(ic);
975}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 000000000000..ccc7e8f0bf0e
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38#include "iw.h"
39
40DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
41
42static char *rds_iw_stat_names[] = {
43 "iw_connect_raced",
44 "iw_listen_closed_stale",
45 "iw_tx_cq_call",
46 "iw_tx_cq_event",
47 "iw_tx_ring_full",
48 "iw_tx_throttle",
49 "iw_tx_sg_mapping_failure",
50 "iw_tx_stalled",
51 "iw_tx_credit_updates",
52 "iw_rx_cq_call",
53 "iw_rx_cq_event",
54 "iw_rx_ring_empty",
55 "iw_rx_refill_from_cq",
56 "iw_rx_refill_from_thread",
57 "iw_rx_alloc_limit",
58 "iw_rx_credit_updates",
59 "iw_ack_sent",
60 "iw_ack_send_failure",
61 "iw_ack_send_delayed",
62 "iw_ack_send_piggybacked",
63 "iw_ack_received",
64 "iw_rdma_mr_alloc",
65 "iw_rdma_mr_free",
66 "iw_rdma_mr_used",
67 "iw_rdma_mr_pool_flush",
68 "iw_rdma_mr_pool_wait",
69 "iw_rdma_mr_pool_depleted",
70};
71
72unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
73 unsigned int avail)
74{
75 struct rds_iw_statistics stats = {0, };
76 uint64_t *src;
77 uint64_t *sum;
78 size_t i;
79 int cpu;
80
81 if (avail < ARRAY_SIZE(rds_iw_stat_names))
82 goto out;
83
84 for_each_online_cpu(cpu) {
85 src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
86 sum = (uint64_t *)&stats;
87 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
88 *(sum++) += *(src++);
89 }
90
91 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
92 ARRAY_SIZE(rds_iw_stat_names));
93out:
94 return ARRAY_SIZE(rds_iw_stat_names);
95}
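
The per-CPU summation idiom used in rds_iw_stats_info_copy(), treating a struct made up entirely of u64 counters as a flat array and folding each CPU's copy into a running sum, looks like this in standalone userspace C. The struct, field names, and CPU count below are invented for the sketch.

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone illustration, not kernel code: a counters struct composed
 * purely of 64-bit fields is summed across "per-CPU" copies by walking
 * it as a flat uint64_t array.
 */
struct toy_stats {
	uint64_t tx_frames;
	uint64_t rx_frames;
	uint64_t tx_errors;
};

#define TOY_NCPUS 4

int main(void)
{
	struct toy_stats percpu[TOY_NCPUS] = {
		{ 1, 2, 0 }, { 3, 4, 1 }, { 5, 6, 0 }, { 7, 8, 2 },
	};
	struct toy_stats sum = { 0, 0, 0 };
	size_t nfields = sizeof(sum) / sizeof(uint64_t);
	size_t i;
	int cpu;

	for (cpu = 0; cpu < TOY_NCPUS; cpu++) {
		uint64_t *src = (uint64_t *)&percpu[cpu];
		uint64_t *dst = (uint64_t *)&sum;

		for (i = 0; i < nfields; i++)
			dst[i] += src[i];
	}

	/* prints: tx 16 rx 20 err 3 */
	printf("tx %llu rx %llu err %llu\n",
	       (unsigned long long)sum.tx_frames,
	       (unsigned long long)sum.rx_frames,
	       (unsigned long long)sum.tx_errors);
	return 0;
}
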
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 000000000000..9590678cd616
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "iw.h"
38
39static struct ctl_table_header *rds_iw_sysctl_hdr;
40
41unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
42unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
43unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
44static unsigned long rds_iw_sysctl_max_wr_min = 1;
45/* hardware will fail CQ creation long before this */
46static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
47
48unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
51
52unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
55
56unsigned int rds_iw_sysctl_flow_control = 1;
57
58ctl_table rds_iw_sysctl_table[] = {
59 {
60 .ctl_name = CTL_UNNUMBERED,
61 .procname = "max_send_wr",
62 .data = &rds_iw_sysctl_max_send_wr,
63 .maxlen = sizeof(unsigned long),
64 .mode = 0644,
65 .proc_handler = &proc_doulongvec_minmax,
66 .extra1 = &rds_iw_sysctl_max_wr_min,
67 .extra2 = &rds_iw_sysctl_max_wr_max,
68 },
69 {
70 .ctl_name = CTL_UNNUMBERED,
71 .procname = "max_recv_wr",
72 .data = &rds_iw_sysctl_max_recv_wr,
73 .maxlen = sizeof(unsigned long),
74 .mode = 0644,
75 .proc_handler = &proc_doulongvec_minmax,
76 .extra1 = &rds_iw_sysctl_max_wr_min,
77 .extra2 = &rds_iw_sysctl_max_wr_max,
78 },
79 {
80 .ctl_name = CTL_UNNUMBERED,
81 .procname = "max_unsignaled_wr",
82 .data = &rds_iw_sysctl_max_unsig_wrs,
83 .maxlen = sizeof(unsigned long),
84 .mode = 0644,
85 .proc_handler = &proc_doulongvec_minmax,
86 .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
87 .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "max_unsignaled_bytes",
92 .data = &rds_iw_sysctl_max_unsig_bytes,
93 .maxlen = sizeof(unsigned long),
94 .mode = 0644,
95 .proc_handler = &proc_doulongvec_minmax,
96 .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
97 .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
98 },
99 {
100 .ctl_name = CTL_UNNUMBERED,
101 .procname = "max_recv_allocation",
102 .data = &rds_iw_sysctl_max_recv_allocation,
103 .maxlen = sizeof(unsigned long),
104 .mode = 0644,
105 .proc_handler = &proc_doulongvec_minmax,
106 },
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "flow_control",
110 .data = &rds_iw_sysctl_flow_control,
111 .maxlen = sizeof(rds_iw_sysctl_flow_control),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec,
114 },
115 { .ctl_name = 0}
116};
117
118static struct ctl_path rds_iw_sysctl_path[] = {
119 { .procname = "net", .ctl_name = CTL_NET, },
120 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
121 { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
122 { }
123};
124
125void rds_iw_sysctl_exit(void)
126{
127 if (rds_iw_sysctl_hdr)
128 unregister_sysctl_table(rds_iw_sysctl_hdr);
129}
130
131int __init rds_iw_sysctl_init(void)
132{
133 rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
134 if (rds_iw_sysctl_hdr == NULL)
135 return -ENOMEM;
136 return 0;
137}
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 000000000000..4a61997f554d
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,188 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35
36#include "rds.h"
37#include "loop.h"
38
39static DEFINE_SPINLOCK(loop_conns_lock);
40static LIST_HEAD(loop_conns);
41
42/*
43 * This 'loopback' transport is a special case for flows that originate
44 * and terminate on the same machine.
45 *
46 * Connection build-up notices if the destination address is thought of
47 * as a local address by a transport. At that time it decides to use the
48 * loopback transport instead of the bound transport of the sending socket.
49 *
50 * The loopback transport's sending path just hands the sent rds_message
51 * straight to the receiving path via an embedded rds_incoming.
52 */
53
54/*
55 * Usually a message transits both the sender and receiver's conns as it
56 * flows to the receiver. In the loopback case, though, the receive path
57 * is handed the sending conn so the sense of the addresses is reversed.
58 */
59static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
60 unsigned int hdr_off, unsigned int sg,
61 unsigned int off)
62{
63 BUG_ON(hdr_off || sg || off);
64
65 rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
66 rds_message_addref(rm); /* for the inc */
67
68 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
69 GFP_KERNEL, KM_USER0);
70
71 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
72 NULL);
73
74 rds_inc_put(&rm->m_inc);
75
76 return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
77}
78
79static int rds_loop_xmit_cong_map(struct rds_connection *conn,
80 struct rds_cong_map *map,
81 unsigned long offset)
82{
83 unsigned long i;
84
85 BUG_ON(offset);
86 BUG_ON(map != conn->c_lcong);
87
88 for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
89 memcpy((void *)conn->c_fcong->m_page_addrs[i],
90 (void *)map->m_page_addrs[i], PAGE_SIZE);
91 }
92
93 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
94
95 return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
96}
97
98/* we need to at least give the thread something to succeed */
99static int rds_loop_recv(struct rds_connection *conn)
100{
101 return 0;
102}
103
104struct rds_loop_connection {
105 struct list_head loop_node;
106 struct rds_connection *conn;
107};
108
109/*
110 * Even the loopback transport needs to keep track of its connections,
111 * so it can call rds_conn_destroy() on them on exit. N.B. there are
112 * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
113 * multiple loopback conns allocated, although rather useless.
114 */
115static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
116{
117 struct rds_loop_connection *lc;
118 unsigned long flags;
119
120 lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
121 if (lc == NULL)
122 return -ENOMEM;
123
124 INIT_LIST_HEAD(&lc->loop_node);
125 lc->conn = conn;
126 conn->c_transport_data = lc;
127
128 spin_lock_irqsave(&loop_conns_lock, flags);
129 list_add_tail(&lc->loop_node, &loop_conns);
130 spin_unlock_irqrestore(&loop_conns_lock, flags);
131
132 return 0;
133}
134
135static void rds_loop_conn_free(void *arg)
136{
137 struct rds_loop_connection *lc = arg;
138 rdsdebug("lc %p\n", lc);
139 list_del(&lc->loop_node);
140 kfree(lc);
141}
142
143static int rds_loop_conn_connect(struct rds_connection *conn)
144{
145 rds_connect_complete(conn);
146 return 0;
147}
148
149static void rds_loop_conn_shutdown(struct rds_connection *conn)
150{
151}
152
153void rds_loop_exit(void)
154{
155 struct rds_loop_connection *lc, *_lc;
156 LIST_HEAD(tmp_list);
157
158 /* avoid calling conn_destroy with irqs off */
159 spin_lock_irq(&loop_conns_lock);
160 list_splice(&loop_conns, &tmp_list);
161 INIT_LIST_HEAD(&loop_conns);
162 spin_unlock_irq(&loop_conns_lock);
163
164 list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
165 WARN_ON(lc->conn->c_passive);
166 rds_conn_destroy(lc->conn);
167 }
168}
169
170/*
171 * This is missing .xmit_* because loop doesn't go through generic
172 * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
173 * .laddr_check are missing because transport.c doesn't iterate over
174 * rds_loop_transport.
175 */
176struct rds_transport rds_loop_transport = {
177 .xmit = rds_loop_xmit,
178 .xmit_cong_map = rds_loop_xmit_cong_map,
179 .recv = rds_loop_recv,
180 .conn_alloc = rds_loop_conn_alloc,
181 .conn_free = rds_loop_conn_free,
182 .conn_connect = rds_loop_conn_connect,
183 .conn_shutdown = rds_loop_conn_shutdown,
184 .inc_copy_to_user = rds_message_inc_copy_to_user,
185 .inc_purge = rds_message_inc_purge,
186 .inc_free = rds_message_inc_free,
187 .t_name = "loopback",
188};
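
rds_loop_exit() above uses a common teardown shape: detach the whole connection list while the lock is held, then call the destructor on each entry with the lock dropped, so rds_conn_destroy() never runs with irqs off. A minimal userspace sketch of that shape, assuming nothing about RDS itself; fake_conn and the pthread mutex (link with -pthread) are illustrative stand-ins for loop_conns and its irq-disabling spinlock:

/* not kernel code: a pthread mutex stands in for the irq-disabling spinlock */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_conn {
    int id;
    struct fake_conn *next;
};

static pthread_mutex_t conns_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fake_conn *conns;     /* global list, only touched under the lock */

static void fake_conn_destroy(struct fake_conn *c)
{
    /* may sleep or take other locks, so never called with conns_lock held */
    printf("destroying conn %d\n", c->id);
    free(c);
}

static void fake_exit(void)
{
    struct fake_conn *detached, *c;

    /* detach the whole list while holding the lock... */
    pthread_mutex_lock(&conns_lock);
    detached = conns;
    conns = NULL;
    pthread_mutex_unlock(&conns_lock);

    /* ...then destroy each entry with the lock dropped */
    while ((c = detached) != NULL) {
        detached = c->next;
        fake_conn_destroy(c);
    }
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct fake_conn *c = malloc(sizeof(*c));

        if (!c)
            break;
        c->id = i;
        pthread_mutex_lock(&conns_lock);
        c->next = conns;
        conns = c;
        pthread_mutex_unlock(&conns_lock);
    }
    fake_exit();
    return 0;
}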
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 000000000000..f32b0939a04d
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
1#ifndef _RDS_LOOP_H
2#define _RDS_LOOP_H
3
4/* loop.c */
5extern struct rds_transport rds_loop_transport;
6
7void rds_loop_exit(void);
8
9#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 000000000000..5a15dc8d0cd7
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,402 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37
38static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
39
40static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
41[RDS_EXTHDR_NONE] = 0,
42[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
43[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
44[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
45};
46
47
48void rds_message_addref(struct rds_message *rm)
49{
50 rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
51 atomic_inc(&rm->m_refcount);
52}
53
54/*
55 * This relies on dma_map_sg() not touching sg[].page during merging.
56 */
57static void rds_message_purge(struct rds_message *rm)
58{
59 unsigned long i;
60
61 if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
62 return;
63
64 for (i = 0; i < rm->m_nents; i++) {
65 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
66 /* XXX will have to put_page for page refs */
67 __free_page(sg_page(&rm->m_sg[i]));
68 }
69 rm->m_nents = 0;
70
71 if (rm->m_rdma_op)
72 rds_rdma_free_op(rm->m_rdma_op);
73 if (rm->m_rdma_mr)
74 rds_mr_put(rm->m_rdma_mr);
75}
76
77void rds_message_inc_purge(struct rds_incoming *inc)
78{
79 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
80 rds_message_purge(rm);
81}
82
83void rds_message_put(struct rds_message *rm)
84{
85 rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
86
87 if (atomic_dec_and_test(&rm->m_refcount)) {
88 BUG_ON(!list_empty(&rm->m_sock_item));
89 BUG_ON(!list_empty(&rm->m_conn_item));
90 rds_message_purge(rm);
91
92 kfree(rm);
93 }
94}
95
96void rds_message_inc_free(struct rds_incoming *inc)
97{
98 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
99 rds_message_put(rm);
100}
101
102void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
103 __be16 dport, u64 seq)
104{
105 hdr->h_flags = 0;
106 hdr->h_sport = sport;
107 hdr->h_dport = dport;
108 hdr->h_sequence = cpu_to_be64(seq);
109 hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
110}
111
112int rds_message_add_extension(struct rds_header *hdr,
113 unsigned int type, const void *data, unsigned int len)
114{
115 unsigned int ext_len = sizeof(u8) + len;
116 unsigned char *dst;
117
118 /* For now, refuse to add more than one extension header */
119 if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
120 return 0;
121
122 if (type >= __RDS_EXTHDR_MAX
123 || len != rds_exthdr_size[type])
124 return 0;
125
126 if (ext_len >= RDS_HEADER_EXT_SPACE)
127 return 0;
128 dst = hdr->h_exthdr;
129
130 *dst++ = type;
131 memcpy(dst, data, len);
132
133 dst[len] = RDS_EXTHDR_NONE;
134 return 1;
135}
136
137/*
138 * If a message has extension headers, retrieve them here.
139 * Call like this:
140 *
141 * unsigned int pos = 0;
142 *
143 * while (1) {
144 * buflen = sizeof(buffer);
145 * type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
146 * if (type == RDS_EXTHDR_NONE)
147 * break;
148 * ...
149 * }
150 */
151int rds_message_next_extension(struct rds_header *hdr,
152 unsigned int *pos, void *buf, unsigned int *buflen)
153{
154 unsigned int offset, ext_type, ext_len;
155 u8 *src = hdr->h_exthdr;
156
157 offset = *pos;
158 if (offset >= RDS_HEADER_EXT_SPACE)
159 goto none;
160
161 /* Get the extension type and length. For now, the
162 * length is implied by the extension type. */
163 ext_type = src[offset++];
164
165 if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
166 goto none;
167 ext_len = rds_exthdr_size[ext_type];
168 if (offset + ext_len > RDS_HEADER_EXT_SPACE)
169 goto none;
170
171 *pos = offset + ext_len;
172 if (ext_len < *buflen)
173 *buflen = ext_len;
174 memcpy(buf, src + offset, *buflen);
175 return ext_type;
176
177none:
178 *pos = RDS_HEADER_EXT_SPACE;
179 *buflen = 0;
180 return RDS_EXTHDR_NONE;
181}
182
183int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
184{
185 struct rds_ext_header_version ext_hdr;
186
187 ext_hdr.h_version = cpu_to_be32(version);
188 return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
189}
190
191int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
192{
193 struct rds_ext_header_version ext_hdr;
194 unsigned int pos = 0, len = sizeof(ext_hdr);
195
196 /* We assume the version extension is the only one present */
197 if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
198 return 0;
199 *version = be32_to_cpu(ext_hdr.h_version);
200 return 1;
201}
202
203int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
204{
205 struct rds_ext_header_rdma_dest ext_hdr;
206
207 ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
208 ext_hdr.h_rdma_offset = cpu_to_be32(offset);
209 return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
210}
211
212struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
213{
214 struct rds_message *rm;
215
216 rm = kzalloc(sizeof(struct rds_message) +
217 (nents * sizeof(struct scatterlist)), gfp);
218 if (!rm)
219 goto out;
220
221 if (nents)
222 sg_init_table(rm->m_sg, nents);
223 atomic_set(&rm->m_refcount, 1);
224 INIT_LIST_HEAD(&rm->m_sock_item);
225 INIT_LIST_HEAD(&rm->m_conn_item);
226 spin_lock_init(&rm->m_rs_lock);
227
228out:
229 return rm;
230}
231
232struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
233{
234 struct rds_message *rm;
235 unsigned int i;
236
237 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
238 if (rm == NULL)
239 return ERR_PTR(-ENOMEM);
240
241 set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
242 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
243 rm->m_nents = ceil(total_len, PAGE_SIZE);
244
245 for (i = 0; i < rm->m_nents; ++i) {
246 sg_set_page(&rm->m_sg[i],
247 virt_to_page(page_addrs[i]),
248 PAGE_SIZE, 0);
249 }
250
251 return rm;
252}
253
254struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
255 size_t total_len)
256{
257 unsigned long to_copy;
258 unsigned long iov_off;
259 unsigned long sg_off;
260 struct rds_message *rm;
261 struct iovec *iov;
262 struct scatterlist *sg;
263 int ret;
264
265 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
266 if (rm == NULL) {
267 ret = -ENOMEM;
268 goto out;
269 }
270
271 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
272
273 /*
274 * now allocate and copy in the data payload.
275 */
276 sg = rm->m_sg;
277 iov = first_iov;
278 iov_off = 0;
279 sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
280
281 while (total_len) {
282 if (sg_page(sg) == NULL) {
283 ret = rds_page_remainder_alloc(sg, total_len,
284 GFP_HIGHUSER);
285 if (ret)
286 goto out;
287 rm->m_nents++;
288 sg_off = 0;
289 }
290
291 while (iov_off == iov->iov_len) {
292 iov_off = 0;
293 iov++;
294 }
295
296 to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
297 to_copy = min_t(size_t, to_copy, total_len);
298
299 rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
300 "sg [%p, %u, %u] + %lu\n",
301 to_copy, iov->iov_base, iov->iov_len, iov_off,
302 (void *)sg_page(sg), sg->offset, sg->length, sg_off);
303
304 ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
305 iov->iov_base + iov_off,
306 to_copy);
307 if (ret)
308 goto out;
309
310 iov_off += to_copy;
311 total_len -= to_copy;
312 sg_off += to_copy;
313
314 if (sg_off == sg->length)
315 sg++;
316 }
317
318 ret = 0;
319out:
320 if (ret) {
321 if (rm)
322 rds_message_put(rm);
323 rm = ERR_PTR(ret);
324 }
325 return rm;
326}
327
328int rds_message_inc_copy_to_user(struct rds_incoming *inc,
329 struct iovec *first_iov, size_t size)
330{
331 struct rds_message *rm;
332 struct iovec *iov;
333 struct scatterlist *sg;
334 unsigned long to_copy;
335 unsigned long iov_off;
336 unsigned long vec_off;
337 int copied;
338 int ret;
339 u32 len;
340
341 rm = container_of(inc, struct rds_message, m_inc);
342 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
343
344 iov = first_iov;
345 iov_off = 0;
346 sg = rm->m_sg;
347 vec_off = 0;
348 copied = 0;
349
350 while (copied < size && copied < len) {
351 while (iov_off == iov->iov_len) {
352 iov_off = 0;
353 iov++;
354 }
355
356 to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
357 to_copy = min_t(size_t, to_copy, size - copied);
358 to_copy = min_t(unsigned long, to_copy, len - copied);
359
360 rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
361 "sg [%p, %u, %u] + %lu\n",
362 to_copy, iov->iov_base, iov->iov_len, iov_off,
363 sg_page(sg), sg->offset, sg->length, vec_off);
364
365 ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
366 iov->iov_base + iov_off,
367 to_copy);
368 if (ret) {
369 copied = ret;
370 break;
371 }
372
373 iov_off += to_copy;
374 vec_off += to_copy;
375 copied += to_copy;
376
377 if (vec_off == sg->length) {
378 vec_off = 0;
379 sg++;
380 }
381 }
382
383 return copied;
384}
385
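
Both copy routines above walk an iovec and the message's scatterlist with two cursors, each step moving the minimum of what is left in the current iovec entry and in the current fragment. A userspace sketch of that two-cursor walk over plain buffers (fragment sizes and contents are made up; there is no scatterlist or user/kernel boundary here):

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#define NFRAGS    3
#define FRAG_SIZE 8

int main(void)
{
    /* 24 bytes of payload split over three 8-byte fragments */
    char frags[NFRAGS][FRAG_SIZE];
    char out1[10], out2[14];
    struct iovec iov[2] = {
        { .iov_base = out1, .iov_len = sizeof(out1) },
        { .iov_base = out2, .iov_len = sizeof(out2) },
    };
    struct iovec *v = iov;
    size_t frag = 0, frag_off = 0, iov_off = 0, copied = 0;
    size_t total = NFRAGS * FRAG_SIZE;

    memcpy(frags, "abcdefghijklmnopqrstuvwx", total);

    while (copied < total) {
        size_t left_in_iov = v->iov_len - iov_off;
        size_t left_in_frag = FRAG_SIZE - frag_off;
        size_t to_copy = left_in_iov < left_in_frag ? left_in_iov : left_in_frag;

        memcpy((char *)v->iov_base + iov_off, &frags[frag][frag_off], to_copy);

        iov_off += to_copy;
        frag_off += to_copy;
        copied += to_copy;

        if (iov_off == v->iov_len) {    /* advance to the next iovec entry */
            v++;
            iov_off = 0;
        }
        if (frag_off == FRAG_SIZE) {    /* advance to the next fragment */
            frag++;
            frag_off = 0;
        }
    }

    printf("%.10s|%.14s\n", out1, out2);
    return 0;
}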
386/*
387 * If the message is still on the send queue, wait until the transport
388 * is done with it. This is particularly important for RDMA operations.
389 */
390void rds_message_wait(struct rds_message *rm)
391{
392 wait_event(rds_message_flush_waitq,
393 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
394}
395
396void rds_message_unmapped(struct rds_message *rm)
397{
398 clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
399 if (waitqueue_active(&rds_message_flush_waitq))
400 wake_up(&rds_message_flush_waitq);
401}
402
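The extension-header helpers above keep a simple layout inside the 16-byte h_exthdr area: a one-byte type, the fixed payload size implied by that type, then a terminating RDS_EXTHDR_NONE byte. A simplified userspace sketch of that layout (host byte order here, whereas the kernel stores the payload big-endian; the EXT_* names and the loop are local to the demo, not the kernel helpers):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define EXT_SPACE   16
#define EXT_NONE    0
#define EXT_VERSION 1

int main(void)
{
    uint8_t exthdr[EXT_SPACE] = { EXT_NONE };
    uint32_t version = 0x0301;          /* e.g. protocol 3.1, host order here */
    unsigned int pos = 0;

    /* add: type byte, fixed-size payload, terminating NONE byte */
    exthdr[0] = EXT_VERSION;
    memcpy(&exthdr[1], &version, sizeof(version));
    exthdr[1 + sizeof(version)] = EXT_NONE;

    /* walk: read the type, then the payload length it implies */
    while (pos < EXT_SPACE) {
        uint8_t type = exthdr[pos++];

        if (type == EXT_NONE)
            break;
        if (type == EXT_VERSION) {
            uint32_t v;

            memcpy(&v, &exthdr[pos], sizeof(v));
            pos += sizeof(v);
            printf("version extension: 0x%04x\n", v);
        }
    }
    return 0;
}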
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 000000000000..c460743a89ad
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,221 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/highmem.h>
34
35#include "rds.h"
36
37struct rds_page_remainder {
38 struct page *r_page;
39 unsigned long r_offset;
40};
41
42DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
43
44/*
45 * returns 0 on success or -errno on failure.
46 *
47 * We don't have to worry about flush_dcache_page() as this only works
48 * with private pages. If, say, we were to do directed receive to pinned
49 * user pages we'd have to worry more about cache coherence. (Though
50 * the flush_dcache_page() in get_user_pages() would probably be enough).
51 */
52int rds_page_copy_user(struct page *page, unsigned long offset,
53 void __user *ptr, unsigned long bytes,
54 int to_user)
55{
56 unsigned long ret;
57 void *addr;
58
59 if (to_user)
60 rds_stats_add(s_copy_to_user, bytes);
61 else
62 rds_stats_add(s_copy_from_user, bytes);
63
64 addr = kmap_atomic(page, KM_USER0);
65 if (to_user)
66 ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
67 else
68 ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
69 kunmap_atomic(addr, KM_USER0);
70
71 if (ret) {
72 addr = kmap(page);
73 if (to_user)
74 ret = copy_to_user(ptr, addr + offset, bytes);
75 else
76 ret = copy_from_user(addr + offset, ptr, bytes);
77 kunmap(page);
78 if (ret)
79 return -EFAULT;
80 }
81
82 return 0;
83}
84
85/*
86 * Message allocation uses this to build up regions of a message.
87 *
88 * @bytes - the number of bytes needed.
89 * @gfp - the waiting behaviour of the allocation
90 *
91 * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to
92 * kmap the pages, etc.
93 *
94 * If @bytes is at least a full page then this just returns a page from
95 * alloc_page().
96 *
97 * If @bytes is a partial page then this stores the unused region of the
98 * page in a per-cpu structure. Future partial-page allocations may be
99 * satisfied from that cached region. This lets us waste less memory on
100 * small allocations with minimal complexity. It works because the transmit
101 * path passes read-only page regions down to devices. They hold a page
102 * reference until they are done with the region.
103 */
104int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
105 gfp_t gfp)
106{
107 struct rds_page_remainder *rem;
108 unsigned long flags;
109 struct page *page;
110 int ret;
111
112 gfp |= __GFP_HIGHMEM;
113
114 /* jump straight to allocation if we need a whole page or more */
115 if (bytes >= PAGE_SIZE) {
116 page = alloc_page(gfp);
117 if (page == NULL) {
118 ret = -ENOMEM;
119 } else {
120 sg_set_page(scat, page, PAGE_SIZE, 0);
121 ret = 0;
122 }
123 goto out;
124 }
125
126 rem = &per_cpu(rds_page_remainders, get_cpu());
127 local_irq_save(flags);
128
129 while (1) {
130 /* avoid a tiny region getting stuck by tossing it */
131 if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
132 rds_stats_inc(s_page_remainder_miss);
133 __free_page(rem->r_page);
134 rem->r_page = NULL;
135 }
136
137 /* hand out a fragment from the cached page */
138 if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
139 sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
140 get_page(sg_page(scat));
141
142 if (rem->r_offset != 0)
143 rds_stats_inc(s_page_remainder_hit);
144
145 rem->r_offset += bytes;
146 if (rem->r_offset == PAGE_SIZE) {
147 __free_page(rem->r_page);
148 rem->r_page = NULL;
149 }
150 ret = 0;
151 break;
152 }
153
154 /* alloc if there is nothing for us to use */
155 local_irq_restore(flags);
156 put_cpu();
157
158 page = alloc_page(gfp);
159
160 rem = &per_cpu(rds_page_remainders, get_cpu());
161 local_irq_save(flags);
162
163 if (page == NULL) {
164 ret = -ENOMEM;
165 break;
166 }
167
168 /* did someone race to fill the remainder before us? */
169 if (rem->r_page) {
170 __free_page(page);
171 continue;
172 }
173
174 /* otherwise install our page and loop around to alloc */
175 rem->r_page = page;
176 rem->r_offset = 0;
177 }
178
179 local_irq_restore(flags);
180 put_cpu();
181out:
182 rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
183 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
184 ret ? 0 : scat->length);
185 return ret;
186}
187
188static int rds_page_remainder_cpu_notify(struct notifier_block *self,
189 unsigned long action, void *hcpu)
190{
191 struct rds_page_remainder *rem;
192 long cpu = (long)hcpu;
193
194 rem = &per_cpu(rds_page_remainders, cpu);
195
196 rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
197
198 switch (action) {
199 case CPU_DEAD:
200 if (rem->r_page)
201 __free_page(rem->r_page);
202 rem->r_page = NULL;
203 break;
204 }
205
206 return 0;
207}
208
209static struct notifier_block rds_page_remainder_nb = {
210 .notifier_call = rds_page_remainder_cpu_notify,
211};
212
213void rds_page_exit(void)
214{
215 int i;
216
217 for_each_possible_cpu(i)
218 rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
219 (unsigned long)CPU_DEAD,
220 (void *)(long)i);
221}
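
The remainder allocator described above boils down to: whole-page requests go straight to the allocator, sub-page requests are carved out of one cached page until a request no longer fits, at which point the cached page is dropped and a fresh one is started. A single-threaded userspace sketch of just that logic, leaving out the per-cpu handling, page refcounting and irq protection of rds_page_remainder_alloc(); FAKE_PAGE_SIZE and frag_alloc are demo names:

#include <stdio.h>
#include <stdlib.h>

#define FAKE_PAGE_SIZE 4096UL

static char *r_page;            /* cached, partially used "page" */
static unsigned long r_offset;  /* first unused byte in r_page */

static char *frag_alloc(unsigned long bytes)
{
    char *p;

    if (bytes >= FAKE_PAGE_SIZE)        /* whole-page requests bypass the cache */
        return malloc(FAKE_PAGE_SIZE);

    /* a remainder too small for this request is abandoned; the kernel drops
     * its page ref here (outstanding fragments hold their own refs), while
     * this sketch simply leaks it */
    if (r_page && bytes > FAKE_PAGE_SIZE - r_offset)
        r_page = NULL;

    if (!r_page) {                      /* start a fresh page */
        r_page = malloc(FAKE_PAGE_SIZE);
        if (!r_page)
            return NULL;
        r_offset = 0;
    }

    p = r_page + r_offset;              /* hand out the next fragment */
    r_offset += bytes;
    if (r_offset == FAKE_PAGE_SIZE)     /* fully consumed, stop caching it */
        r_page = NULL;
    return p;
}

int main(void)
{
    printf("%p\n", (void *)frag_alloc(1000));   /* new page, offset 0 */
    printf("%p\n", (void *)frag_alloc(1000));   /* same page, offset 1000 */
    printf("%p\n", (void *)frag_alloc(3000));   /* 3000 > 2096 left: new page */
    return 0;
}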
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 000000000000..eaeeb91e1119
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,679 @@
1/*
2 * Copyright (c) 2007 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/pagemap.h>
34#include <linux/rbtree.h>
35#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
36
37#include "rdma.h"
38
39/*
40 * XXX
41 * - build with sparse
42 * - should we limit the size of a mr region? let transport return failure?
43 * - should we detect duplicate keys on a socket? hmm.
44 * - an rdma is an mlock, apply rlimit?
45 */
46
47/*
48 * get the number of pages by looking at the page indices that the start and
49 * end addresses fall in.
50 *
51 * Returns 0 if the vec is invalid. It is invalid if the number of bytes
52 * causes the address to wrap or overflows an unsigned int. The latter
53 * limit comes from the 'length' member of 'struct scatterlist'.
54 */
55static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
56{
57 if ((vec->addr + vec->bytes <= vec->addr) ||
58 (vec->bytes > (u64)UINT_MAX))
59 return 0;
60
61 return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
62 (vec->addr >> PAGE_SHIFT);
63}
64
65static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
66 struct rds_mr *insert)
67{
68 struct rb_node **p = &root->rb_node;
69 struct rb_node *parent = NULL;
70 struct rds_mr *mr;
71
72 while (*p) {
73 parent = *p;
74 mr = rb_entry(parent, struct rds_mr, r_rb_node);
75
76 if (key < mr->r_key)
77 p = &(*p)->rb_left;
78 else if (key > mr->r_key)
79 p = &(*p)->rb_right;
80 else
81 return mr;
82 }
83
84 if (insert) {
85 rb_link_node(&insert->r_rb_node, parent, p);
86 rb_insert_color(&insert->r_rb_node, root);
87 atomic_inc(&insert->r_refcount);
88 }
89 return NULL;
90}
91
92/*
93 * Destroy the transport-specific part of a MR.
94 */
95static void rds_destroy_mr(struct rds_mr *mr)
96{
97 struct rds_sock *rs = mr->r_sock;
98 void *trans_private = NULL;
99 unsigned long flags;
100
101 rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
102 mr->r_key, atomic_read(&mr->r_refcount));
103
104 if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
105 return;
106
107 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
108 if (!RB_EMPTY_NODE(&mr->r_rb_node))
109 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
110 trans_private = mr->r_trans_private;
111 mr->r_trans_private = NULL;
112 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
113
114 if (trans_private)
115 mr->r_trans->free_mr(trans_private, mr->r_invalidate);
116}
117
118void __rds_put_mr_final(struct rds_mr *mr)
119{
120 rds_destroy_mr(mr);
121 kfree(mr);
122}
123
124/*
125 * By the time this is called we can't have any more ioctls called on
126 * the socket so we don't need to worry about racing with others.
127 */
128void rds_rdma_drop_keys(struct rds_sock *rs)
129{
130 struct rds_mr *mr;
131 struct rb_node *node;
132
133 /* Release any MRs associated with this socket */
134 while ((node = rb_first(&rs->rs_rdma_keys))) {
135 mr = container_of(node, struct rds_mr, r_rb_node);
136 if (mr->r_trans == rs->rs_transport)
137 mr->r_invalidate = 0;
138 rds_mr_put(mr);
139 }
140
141 if (rs->rs_transport && rs->rs_transport->flush_mrs)
142 rs->rs_transport->flush_mrs();
143}
144
145/*
146 * Helper function to pin user pages.
147 */
148static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
149 struct page **pages, int write)
150{
151 int ret;
152
153 down_read(&current->mm->mmap_sem);
154 ret = get_user_pages(current, current->mm, user_addr,
155 nr_pages, write, 0, pages, NULL);
156 up_read(&current->mm->mmap_sem);
157
158 if (0 <= ret && (unsigned) ret < nr_pages) {
159 while (ret--)
160 put_page(pages[ret]);
161 ret = -EFAULT;
162 }
163
164 return ret;
165}
166
167static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
168 u64 *cookie_ret, struct rds_mr **mr_ret)
169{
170 struct rds_mr *mr = NULL, *found;
171 unsigned int nr_pages;
172 struct page **pages = NULL;
173 struct scatterlist *sg;
174 void *trans_private;
175 unsigned long flags;
176 rds_rdma_cookie_t cookie;
177 unsigned int nents;
178 long i;
179 int ret;
180
181 if (rs->rs_bound_addr == 0) {
182 ret = -ENOTCONN; /* XXX not a great errno */
183 goto out;
184 }
185
186 if (rs->rs_transport->get_mr == NULL) {
187 ret = -EOPNOTSUPP;
188 goto out;
189 }
190
191 nr_pages = rds_pages_in_vec(&args->vec);
192 if (nr_pages == 0) {
193 ret = -EINVAL;
194 goto out;
195 }
196
197 rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
198 args->vec.addr, args->vec.bytes, nr_pages);
199
200 /* XXX clamp nr_pages to limit the size of this alloc? */
201 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
202 if (pages == NULL) {
203 ret = -ENOMEM;
204 goto out;
205 }
206
207 mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
208 if (mr == NULL) {
209 ret = -ENOMEM;
210 goto out;
211 }
212
213 atomic_set(&mr->r_refcount, 1);
214 RB_CLEAR_NODE(&mr->r_rb_node);
215 mr->r_trans = rs->rs_transport;
216 mr->r_sock = rs;
217
218 if (args->flags & RDS_RDMA_USE_ONCE)
219 mr->r_use_once = 1;
220 if (args->flags & RDS_RDMA_INVALIDATE)
221 mr->r_invalidate = 1;
222 if (args->flags & RDS_RDMA_READWRITE)
223 mr->r_write = 1;
224
225 /*
226 * Pin the pages that make up the user buffer and transfer the page
227 * pointers to the mr's sg array. We check to see if we've mapped
228 * the whole region after transferring the partial page references
229 * to the sg array so that we can have one page ref cleanup path.
230 *
231 * For now we have no flag that tells us whether the mapping is
232 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
233 * the zero page.
234 */
235 ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
236 if (ret < 0)
237 goto out;
238
239 nents = ret;
240 sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
241 if (sg == NULL) {
242 ret = -ENOMEM;
243 goto out;
244 }
245 WARN_ON(!nents);
246 sg_init_table(sg, nents);
247
248 /* Stick all pages into the scatterlist */
249 for (i = 0 ; i < nents; i++)
250 sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
251
252 rdsdebug("RDS: trans_private nents is %u\n", nents);
253
254 /* Obtain a transport specific MR. If this succeeds, the
255 * s/g list is now owned by the MR.
256 * Note that dma_map() implies that pending writes are
257 * flushed to RAM, so no dma_sync is needed here. */
258 trans_private = rs->rs_transport->get_mr(sg, nents, rs,
259 &mr->r_key);
260
261 if (IS_ERR(trans_private)) {
262 for (i = 0 ; i < nents; i++)
263 put_page(sg_page(&sg[i]));
264 kfree(sg);
265 ret = PTR_ERR(trans_private);
266 goto out;
267 }
268
269 mr->r_trans_private = trans_private;
270
271 rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
272 mr->r_key, (void *)(unsigned long) args->cookie_addr);
273
274 /* The user may pass us an unaligned address, but we can only
275 * map page aligned regions. So we keep the offset, and build
276 * a 64bit cookie containing <R_Key, offset> and pass that
277 * around. */
278 cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
279 if (cookie_ret)
280 *cookie_ret = cookie;
281
282 if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
283 ret = -EFAULT;
284 goto out;
285 }
286
287 /* Inserting the new MR into the rbtree bumps its
288 * reference count. */
289 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
290 found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
291 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
292
293 BUG_ON(found && found != mr);
294
295 rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
296 if (mr_ret) {
297 atomic_inc(&mr->r_refcount);
298 *mr_ret = mr;
299 }
300
301 ret = 0;
302out:
303 kfree(pages);
304 if (mr)
305 rds_mr_put(mr);
306 return ret;
307}
308
309int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
310{
311 struct rds_get_mr_args args;
312
313 if (optlen != sizeof(struct rds_get_mr_args))
314 return -EINVAL;
315
316 if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
317 sizeof(struct rds_get_mr_args)))
318 return -EFAULT;
319
320 return __rds_rdma_map(rs, &args, NULL, NULL);
321}
322
323/*
324 * Free the MR indicated by the given R_Key
325 */
326int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
327{
328 struct rds_free_mr_args args;
329 struct rds_mr *mr;
330 unsigned long flags;
331
332 if (optlen != sizeof(struct rds_free_mr_args))
333 return -EINVAL;
334
335 if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
336 sizeof(struct rds_free_mr_args)))
337 return -EFAULT;
338
339 /* Special case - a null cookie means flush all unused MRs */
340 if (args.cookie == 0) {
341 if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
342 return -EINVAL;
343 rs->rs_transport->flush_mrs();
344 return 0;
345 }
346
347 /* Look up the MR given its R_key and remove it from the rbtree
348 * so nobody else finds it.
349 * This should also prevent races with rds_rdma_unuse.
350 */
351 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
352 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
353 if (mr) {
354 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
355 RB_CLEAR_NODE(&mr->r_rb_node);
356 if (args.flags & RDS_RDMA_INVALIDATE)
357 mr->r_invalidate = 1;
358 }
359 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
360
361 if (!mr)
362 return -EINVAL;
363
364 /*
365 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
366 * we return. If we let rds_mr_put() do it, it might not happen until
367 * someone else drops their ref.
368 */
369 rds_destroy_mr(mr);
370 rds_mr_put(mr);
371 return 0;
372}
373
374/*
375 * This is called when we receive an extension header that
376 * tells us this MR was used. It allows us to implement
377 * use_once semantics
378 */
379void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
380{
381 struct rds_mr *mr;
382 unsigned long flags;
383 int zot_me = 0;
384
385 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
386 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
387 if (mr && (mr->r_use_once || force)) {
388 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
389 RB_CLEAR_NODE(&mr->r_rb_node);
390 zot_me = 1;
391 } else if (mr)
392 atomic_inc(&mr->r_refcount);
393 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
394
395 /* May have to issue a dma_sync on this memory region.
396 * Note we could avoid this if the operation was a RDMA READ,
397 * but at this point we can't tell. */
398 if (mr != NULL) {
399 if (mr->r_trans->sync_mr)
400 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
401
402 /* If the MR was marked as invalidate, this will
403 * trigger an async flush. */
404 if (zot_me)
405 rds_destroy_mr(mr);
406 rds_mr_put(mr);
407 }
408}
409
410void rds_rdma_free_op(struct rds_rdma_op *ro)
411{
412 unsigned int i;
413
414 for (i = 0; i < ro->r_nents; i++) {
415 struct page *page = sg_page(&ro->r_sg[i]);
416
417 /* Mark page dirty if it was possibly modified, which
418 * is the case for a RDMA_READ which copies from remote
419 * to local memory */
420 if (!ro->r_write)
421 set_page_dirty(page);
422 put_page(page);
423 }
424
425 kfree(ro->r_notifier);
426 kfree(ro);
427}
428
429/*
430 * args is a pointer to an in-kernel copy in the sendmsg cmsg.
431 */
432static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
433 struct rds_rdma_args *args)
434{
435 struct rds_iovec vec;
436 struct rds_rdma_op *op = NULL;
437 unsigned int nr_pages;
438 unsigned int max_pages;
439 unsigned int nr_bytes;
440 struct page **pages = NULL;
441 struct rds_iovec __user *local_vec;
442 struct scatterlist *sg;
443 unsigned int nr;
444 unsigned int i, j;
445 int ret;
446
447
448 if (rs->rs_bound_addr == 0) {
449 ret = -ENOTCONN; /* XXX not a great errno */
450 goto out;
451 }
452
453 if (args->nr_local > (u64)UINT_MAX) {
454 ret = -EMSGSIZE;
455 goto out;
456 }
457
458 nr_pages = 0;
459 max_pages = 0;
460
461 local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
462
463 /* figure out the number of pages in the vector */
464 for (i = 0; i < args->nr_local; i++) {
465 if (copy_from_user(&vec, &local_vec[i],
466 sizeof(struct rds_iovec))) {
467 ret = -EFAULT;
468 goto out;
469 }
470
471 nr = rds_pages_in_vec(&vec);
472 if (nr == 0) {
473 ret = -EINVAL;
474 goto out;
475 }
476
477 max_pages = max(nr, max_pages);
478 nr_pages += nr;
479 }
480
481 pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
482 if (pages == NULL) {
483 ret = -ENOMEM;
484 goto out;
485 }
486
487 op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
488 if (op == NULL) {
489 ret = -ENOMEM;
490 goto out;
491 }
492
493 op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
494 op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
495 op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
496 op->r_recverr = rs->rs_recverr;
497 WARN_ON(!nr_pages);
498 sg_init_table(op->r_sg, nr_pages);
499
500 if (op->r_notify || op->r_recverr) {
501 /* We allocate an uninitialized notifier here, because
502 * we don't want to do that in the completion handler. We
503 * would have to use GFP_ATOMIC there, and don't want to deal
504 * with failed allocations.
505 */
506 op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
507 if (!op->r_notifier) {
508 ret = -ENOMEM;
509 goto out;
510 }
511 op->r_notifier->n_user_token = args->user_token;
512 op->r_notifier->n_status = RDS_RDMA_SUCCESS;
513 }
514
515 /* The cookie contains the R_Key of the remote memory region, and
516 * optionally an offset into it. This is how we implement RDMA into
517 * unaligned memory.
518 * When setting up the RDMA, we need to add that offset to the
519 * destination address (which is really an offset into the MR)
520 * FIXME: We may want to move this into ib_rdma.c
521 */
522 op->r_key = rds_rdma_cookie_key(args->cookie);
523 op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
524
525 nr_bytes = 0;
526
527 rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
528 (unsigned long long)args->nr_local,
529 (unsigned long long)args->remote_vec.addr,
530 op->r_key);
531
532 for (i = 0; i < args->nr_local; i++) {
533 if (copy_from_user(&vec, &local_vec[i],
534 sizeof(struct rds_iovec))) {
535 ret = -EFAULT;
536 goto out;
537 }
538
539 nr = rds_pages_in_vec(&vec);
540 if (nr == 0) {
541 ret = -EINVAL;
542 goto out;
543 }
544
545 rs->rs_user_addr = vec.addr;
546 rs->rs_user_bytes = vec.bytes;
547
548 /* did the user change the vec under us? */
549 if (nr > max_pages || op->r_nents + nr > nr_pages) {
550 ret = -EINVAL;
551 goto out;
552 }
553 /* An RDMA WRITE reads from local memory, so pin the pages for reading.
554 * An RDMA READ writes into local memory, so pin the pages for writing.
555 */
556 ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
557 if (ret < 0)
558 goto out;
559
560 rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
561 nr_bytes, nr, vec.bytes, vec.addr);
562
563 nr_bytes += vec.bytes;
564
565 for (j = 0; j < nr; j++) {
566 unsigned int offset = vec.addr & ~PAGE_MASK;
567
568 sg = &op->r_sg[op->r_nents + j];
569 sg_set_page(sg, pages[j],
570 min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
571 offset);
572
573 rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
574 sg->offset, sg->length, vec.addr, vec.bytes);
575
576 vec.addr += sg->length;
577 vec.bytes -= sg->length;
578 }
579
580 op->r_nents += nr;
581 }
582
583
584 if (nr_bytes > args->remote_vec.bytes) {
585 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
586 nr_bytes,
587 (unsigned int) args->remote_vec.bytes);
588 ret = -EINVAL;
589 goto out;
590 }
591 op->r_bytes = nr_bytes;
592
593 ret = 0;
594out:
595 kfree(pages);
596 if (ret) {
597 if (op)
598 rds_rdma_free_op(op);
599 op = ERR_PTR(ret);
600 }
601 return op;
602}
603
604/*
605 * The application asks for a RDMA transfer.
606 * Extract all arguments and set up the rdma_op
607 */
608int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
609 struct cmsghdr *cmsg)
610{
611 struct rds_rdma_op *op;
612
613 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
614 || rm->m_rdma_op != NULL)
615 return -EINVAL;
616
617 op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
618 if (IS_ERR(op))
619 return PTR_ERR(op);
620 rds_stats_inc(s_send_rdma);
621 rm->m_rdma_op = op;
622 return 0;
623}
624
625/*
626 * The application wants us to pass an RDMA destination (aka MR)
627 * to the remote
628 */
629int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
630 struct cmsghdr *cmsg)
631{
632 unsigned long flags;
633 struct rds_mr *mr;
634 u32 r_key;
635 int err = 0;
636
637 if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
638 || rm->m_rdma_cookie != 0)
639 return -EINVAL;
640
641 memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
642
643 /* We are reusing a previously mapped MR here. Most likely, the
644 * application has written to the buffer, so we need to explicitly
645 * flush those writes to RAM. Otherwise the HCA may not see them
646 * when doing a DMA from that buffer.
647 */
648 r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
649
650 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
651 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
652 if (mr == NULL)
653 err = -EINVAL; /* invalid r_key */
654 else
655 atomic_inc(&mr->r_refcount);
656 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
657
658 if (mr) {
659 mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
660 rm->m_rdma_mr = mr;
661 }
662 return err;
663}
664
665/*
666 * The application passes us an address range it wants to enable RDMA
667 * to/from. We map the area, and save the <R_Key,offset> pair
668 * in rm->m_rdma_cookie. This causes it to be sent along to the peer
669 * in an extension header.
670 */
671int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
672 struct cmsghdr *cmsg)
673{
674 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
675 || rm->m_rdma_cookie != 0)
676 return -EINVAL;
677
678 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
679}
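
The page count computed by rds_pages_in_vec() is the index of the page just past the last byte minus the index of the first page. A quick userspace check of that arithmetic, assuming 4K pages; PAGE_SHIFT_4K and the example addresses are made up for the demo:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT_4K 12
#define PAGE_SIZE_4K  (1UL << PAGE_SHIFT_4K)

static unsigned int pages_in_vec(uint64_t addr, uint64_t bytes)
{
    if (addr + bytes <= addr || bytes > UINT32_MAX)     /* wrap / too large */
        return 0;

    return ((addr + bytes + PAGE_SIZE_4K - 1) >> PAGE_SHIFT_4K) -
            (addr >> PAGE_SHIFT_4K);
}

int main(void)
{
    /* 100 bytes inside one page */
    printf("%u\n", pages_in_vec(0x1000, 100));          /* -> 1 */
    /* 100 bytes straddling a page boundary */
    printf("%u\n", pages_in_vec(0x1fe0, 100));          /* -> 2 */
    /* exactly one page, page aligned */
    printf("%u\n", pages_in_vec(0x2000, 4096));         /* -> 1 */
    return 0;
}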
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
new file mode 100644
index 000000000000..425512098b0b
--- /dev/null
+++ b/net/rds/rdma.h
@@ -0,0 +1,84 @@
1#ifndef _RDS_RDMA_H
2#define _RDS_RDMA_H
3
4#include <linux/rbtree.h>
5#include <linux/spinlock.h>
6#include <linux/scatterlist.h>
7
8#include "rds.h"
9
10struct rds_mr {
11 struct rb_node r_rb_node;
12 atomic_t r_refcount;
13 u32 r_key;
14
15 /* A copy of the creation flags */
16 unsigned int r_use_once:1;
17 unsigned int r_invalidate:1;
18 unsigned int r_write:1;
19
20 /* This is for RDS_MR_DEAD.
21 * It would be nice & consistent to make this part of the above
22 * bit field here, but we need to use test_and_set_bit.
23 */
24 unsigned long r_state;
25 struct rds_sock *r_sock; /* back pointer to the socket that owns us */
26 struct rds_transport *r_trans;
27 void *r_trans_private;
28};
29
30/* Flags for mr->r_state */
31#define RDS_MR_DEAD 0
32
33struct rds_rdma_op {
34 u32 r_key;
35 u64 r_remote_addr;
36 unsigned int r_write:1;
37 unsigned int r_fence:1;
38 unsigned int r_notify:1;
39 unsigned int r_recverr:1;
40 unsigned int r_mapped:1;
41 struct rds_notifier *r_notifier;
42 unsigned int r_bytes;
43 unsigned int r_nents;
44 unsigned int r_count;
45 struct scatterlist r_sg[0];
46};
47
48static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
49{
50 return r_key | (((u64) offset) << 32);
51}
52
53static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
54{
55 return cookie;
56}
57
58static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
59{
60 return cookie >> 32;
61}
62
63int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
64int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
65void rds_rdma_drop_keys(struct rds_sock *rs);
66int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
67 struct cmsghdr *cmsg);
68int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
69 struct cmsghdr *cmsg);
70int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
71 struct cmsghdr *cmsg);
72int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
73 struct cmsghdr *cmsg);
74void rds_rdma_free_op(struct rds_rdma_op *ro);
75void rds_rdma_send_complete(struct rds_message *rm, int);
76
77extern void __rds_put_mr_final(struct rds_mr *mr);
78static inline void rds_mr_put(struct rds_mr *mr)
79{
80 if (atomic_dec_and_test(&mr->r_refcount))
81 __rds_put_mr_final(mr);
82}
83
84#endif
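
The cookie helpers above pack the R_Key into the low 32 bits and the byte offset into the mapping into the high 32 bits of a u64. A userspace sketch of the same packing; demo_rdma_cookie_t and the example values are illustrative, not RDS symbols:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t demo_rdma_cookie_t;

static demo_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
{
    return r_key | ((uint64_t)offset << 32);
}

static uint32_t cookie_key(demo_rdma_cookie_t c)    { return (uint32_t)c; }
static uint32_t cookie_offset(demo_rdma_cookie_t c) { return (uint32_t)(c >> 32); }

int main(void)
{
    /* e.g. a user buffer whose offset within its page is 0x1b0 */
    demo_rdma_cookie_t c = make_cookie(0xdeadbeef, 0x1b0);

    printf("cookie %#llx key %#x offset %#x\n",
           (unsigned long long)c, cookie_key(c), cookie_offset(c));
    return 0;
}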
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 000000000000..7b19024f9706
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (c) 2009 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <rdma/rdma_cm.h>
34
35#include "rdma_transport.h"
36
37static struct rdma_cm_id *rds_iw_listen_id;
38
39int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
40 struct rdma_cm_event *event)
41{
42 /* this can be null in the listening path */
43 struct rds_connection *conn = cm_id->context;
44 struct rds_transport *trans;
45 int ret = 0;
46
47 rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
48 event->event);
49
50 if (cm_id->device->node_type == RDMA_NODE_RNIC)
51 trans = &rds_iw_transport;
52 else
53 trans = &rds_ib_transport;
54
55 /* Prevent shutdown from tearing down the connection
56 * while we're executing. */
57 if (conn) {
58 mutex_lock(&conn->c_cm_lock);
59
60 /* If the connection is being shut down, bail out
61 * right away. We return 0 so cm_id doesn't get
62 * destroyed prematurely */
63 if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
64 /* Reject incoming connections while we're tearing
65 * down an existing one. */
66 if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
67 ret = 1;
68 goto out;
69 }
70 }
71
72 switch (event->event) {
73 case RDMA_CM_EVENT_CONNECT_REQUEST:
74 ret = trans->cm_handle_connect(cm_id, event);
75 break;
76
77 case RDMA_CM_EVENT_ADDR_RESOLVED:
78 /* XXX do we need to clean up if this fails? */
79 ret = rdma_resolve_route(cm_id,
80 RDS_RDMA_RESOLVE_TIMEOUT_MS);
81 break;
82
83 case RDMA_CM_EVENT_ROUTE_RESOLVED:
84 /* XXX worry about racing with listen acceptance */
85 ret = trans->cm_initiate_connect(cm_id);
86 break;
87
88 case RDMA_CM_EVENT_ESTABLISHED:
89 trans->cm_connect_complete(conn, event);
90 break;
91
92 case RDMA_CM_EVENT_ADDR_ERROR:
93 case RDMA_CM_EVENT_ROUTE_ERROR:
94 case RDMA_CM_EVENT_CONNECT_ERROR:
95 case RDMA_CM_EVENT_UNREACHABLE:
96 case RDMA_CM_EVENT_REJECTED:
97 case RDMA_CM_EVENT_DEVICE_REMOVAL:
98 case RDMA_CM_EVENT_ADDR_CHANGE:
99 if (conn)
100 rds_conn_drop(conn);
101 break;
102
103 case RDMA_CM_EVENT_DISCONNECTED:
104 printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
105 "%pI4->%pI4\n", &conn->c_laddr,
106 &conn->c_faddr);
107 rds_conn_drop(conn);
108 break;
109
110 default:
111 /* things like device disconnect? */
112 printk(KERN_ERR "unknown event %u\n", event->event);
113 BUG();
114 break;
115 }
116
117out:
118 if (conn)
119 mutex_unlock(&conn->c_cm_lock);
120
121 rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
122
123 return ret;
124}
125
126static int __init rds_rdma_listen_init(void)
127{
128 struct sockaddr_in sin;
129 struct rdma_cm_id *cm_id;
130 int ret;
131
132 cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
133 if (IS_ERR(cm_id)) {
134 ret = PTR_ERR(cm_id);
135 printk(KERN_ERR "RDS/IW: failed to setup listener, "
136 "rdma_create_id() returned %d\n", ret);
137 goto out;
138 }
139
140 sin.sin_family = PF_INET;
141 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
142 sin.sin_port = (__force u16)htons(RDS_PORT);
143
144 /*
145 * XXX I bet this binds the cm_id to a device. If we want to support
146 * fail-over we'll have to take this into consideration.
147 */
148 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
149 if (ret) {
150 printk(KERN_ERR "RDS/IW: failed to setup listener, "
151 "rdma_bind_addr() returned %d\n", ret);
152 goto out;
153 }
154
155 ret = rdma_listen(cm_id, 128);
156 if (ret) {
157 printk(KERN_ERR "RDS/IW: failed to setup listener, "
158 "rdma_listen() returned %d\n", ret);
159 goto out;
160 }
161
162 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
163
164 rds_iw_listen_id = cm_id;
165 cm_id = NULL;
166out:
167 if (cm_id)
168 rdma_destroy_id(cm_id);
169 return ret;
170}
171
172static void rds_rdma_listen_stop(void)
173{
174 if (rds_iw_listen_id) {
175 rdsdebug("cm %p\n", rds_iw_listen_id);
176 rdma_destroy_id(rds_iw_listen_id);
177 rds_iw_listen_id = NULL;
178 }
179}
180
181int __init rds_rdma_init(void)
182{
183 int ret;
184
185 ret = rds_rdma_listen_init();
186 if (ret)
187 goto out;
188
189 ret = rds_iw_init();
190 if (ret)
191 goto err_iw_init;
192
193 ret = rds_ib_init();
194 if (ret)
195 goto err_ib_init;
196
197 goto out;
198
199err_ib_init:
200 rds_iw_exit();
201err_iw_init:
202 rds_rdma_listen_stop();
203out:
204 return ret;
205}
206
207void rds_rdma_exit(void)
208{
209 /* stop listening first to ensure no new connections are attempted */
210 rds_rdma_listen_stop();
211 rds_ib_exit();
212 rds_iw_exit();
213}
214
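rds_rdma_init() above uses the usual goto-based unwind: each step that fails jumps to a label that undoes only the steps that already succeeded, in reverse order. A userspace sketch of that shape with dummy step functions; all names here are placeholders, not RDS symbols:

#include <stdio.h>

static int  listen_init(void) { printf("listener up\n");    return 0; }
static void listen_stop(void) { printf("listener down\n"); }
static int  iw_init(void)     { printf("iw up\n");          return 0; }
static void iw_exit(void)     { printf("iw down\n"); }
static int  ib_init(void)     { printf("ib init fails\n");  return -1; }

static int demo_init(void)
{
    int ret;

    ret = listen_init();
    if (ret)
        goto out;

    ret = iw_init();
    if (ret)
        goto err_iw_init;

    ret = ib_init();
    if (ret)
        goto err_ib_init;

    goto out;               /* full success */

err_ib_init:
    iw_exit();              /* ib failed: undo iw ... */
err_iw_init:
    listen_stop();          /* ... and the listener */
out:
    return ret;
}

int main(void)
{
    return demo_init() ? 1 : 0;
}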
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 000000000000..2f2c7d976c21
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,28 @@
1#ifndef _RDMA_TRANSPORT_H
2#define _RDMA_TRANSPORT_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7
8#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
9
10int rds_rdma_conn_connect(struct rds_connection *conn);
11int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
12 struct rdma_cm_event *event);
13
14/* from rdma_transport.c */
15int rds_rdma_init(void);
16void rds_rdma_exit(void);
17
18/* from ib.c */
19extern struct rds_transport rds_ib_transport;
20int rds_ib_init(void);
21void rds_ib_exit(void);
22
23/* from iw.c */
24extern struct rds_transport rds_iw_transport;
25int rds_iw_init(void);
26void rds_iw_exit(void);
27
28#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 000000000000..060400704979
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,686 @@
1#ifndef _RDS_RDS_H
2#define _RDS_RDS_H
3
4#include <net/sock.h>
5#include <linux/scatterlist.h>
6#include <linux/highmem.h>
7#include <rdma/rdma_cm.h>
8#include <linux/mutex.h>
9#include <linux/rds.h>
10
11#include "info.h"
12
13/*
14 * RDS Network protocol version
15 */
16#define RDS_PROTOCOL_3_0 0x0300
17#define RDS_PROTOCOL_3_1 0x0301
18#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
19#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
20#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
21#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
22
23/*
24 * XXX randomly chosen, but at least seems to be unused:
25 * # 18464-18768 Unassigned
26 * We should do better. We want a reserved port to discourage unpriv'ed
27 * userspace from listening.
28 */
29#define RDS_PORT 18634
30
31#ifdef DEBUG
32#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
33#else
34/* sigh, pr_debug() causes unused variable warnings */
35static inline void __attribute__ ((format (printf, 1, 2)))
36rdsdebug(char *fmt, ...)
37{
38}
39#endif
40
41/* XXX is there one of these somewhere? */
42#define ceil(x, y) \
43 ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
44
45#define RDS_FRAG_SHIFT 12
46#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
47
48#define RDS_CONG_MAP_BYTES (65536 / 8)
49#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
50#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
51#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
52
53struct rds_cong_map {
54 struct rb_node m_rb_node;
55 __be32 m_addr;
56 wait_queue_head_t m_waitq;
57 struct list_head m_conn_list;
58 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
59};
60
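The sizing above gives the congestion map one bit per 16-bit RDS port: 65536 bits is 8192 bytes, which is two pages on a 4K-page machine. A quick check of that arithmetic, with the 4K page size assumed for the demo:

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096;
    unsigned long map_bytes = 65536 / 8;                    /* one bit per port */
    unsigned long map_longs = map_bytes / sizeof(unsigned long);
    unsigned long map_pages = (map_bytes + page_size - 1) / page_size;

    printf("bytes %lu longs %lu pages %lu\n", map_bytes, map_longs, map_pages);
    return 0;
}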
61
62/*
63 * This is how we will track the connection state:
64 * A connection is always in one of the following
65 * states. Updates to the state are atomic and imply
66 * a memory barrier.
67 */
68enum {
69 RDS_CONN_DOWN = 0,
70 RDS_CONN_CONNECTING,
71 RDS_CONN_DISCONNECTING,
72 RDS_CONN_UP,
73 RDS_CONN_ERROR,
74};
75
76/* Bits for c_flags */
77#define RDS_LL_SEND_FULL 0
78#define RDS_RECONNECT_PENDING 1
79
80struct rds_connection {
81 struct hlist_node c_hash_node;
82 __be32 c_laddr;
83 __be32 c_faddr;
84 unsigned int c_loopback:1;
85 struct rds_connection *c_passive;
86
87 struct rds_cong_map *c_lcong;
88 struct rds_cong_map *c_fcong;
89
90 struct mutex c_send_lock; /* protect send ring */
91 struct rds_message *c_xmit_rm;
92 unsigned long c_xmit_sg;
93 unsigned int c_xmit_hdr_off;
94 unsigned int c_xmit_data_off;
95 unsigned int c_xmit_rdma_sent;
96
97 spinlock_t c_lock; /* protect msg queues */
98 u64 c_next_tx_seq;
99 struct list_head c_send_queue;
100 struct list_head c_retrans;
101
102 u64 c_next_rx_seq;
103
104 struct rds_transport *c_trans;
105 void *c_transport_data;
106
107 atomic_t c_state;
108 unsigned long c_flags;
109 unsigned long c_reconnect_jiffies;
110 struct delayed_work c_send_w;
111 struct delayed_work c_recv_w;
112 struct delayed_work c_conn_w;
113 struct work_struct c_down_w;
114 struct mutex c_cm_lock; /* protect conn state & cm */
115
116 struct list_head c_map_item;
117 unsigned long c_map_queued;
118 unsigned long c_map_offset;
119 unsigned long c_map_bytes;
120
121 unsigned int c_unacked_packets;
122 unsigned int c_unacked_bytes;
123
124 /* Protocol version */
125 unsigned int c_version;
126};
127
128#define RDS_FLAG_CONG_BITMAP 0x01
129#define RDS_FLAG_ACK_REQUIRED 0x02
130#define RDS_FLAG_RETRANSMITTED 0x04
131#define RDS_MAX_ADV_CREDIT 127
132
133/*
134 * Maximum space available for extension headers.
135 */
136#define RDS_HEADER_EXT_SPACE 16
137
138struct rds_header {
139 __be64 h_sequence;
140 __be64 h_ack;
141 __be32 h_len;
142 __be16 h_sport;
143 __be16 h_dport;
144 u8 h_flags;
145 u8 h_credit;
146 u8 h_padding[4];
147 __sum16 h_csum;
148
149 u8 h_exthdr[RDS_HEADER_EXT_SPACE];
150};
151
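With natural alignment every field of struct rds_header above lands on an aligned offset, so there is no hidden padding and the wire header is 48 bytes, which is the sizeof(struct rds_header) the transports add to the payload length. A userspace mirror of the layout using fixed-width types, purely to make that concrete; demo_rds_header is a demo type, not the kernel's:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define DEMO_HEADER_EXT_SPACE 16

struct demo_rds_header {
    uint64_t h_sequence;        /* big-endian on the wire */
    uint64_t h_ack;
    uint32_t h_len;
    uint16_t h_sport;
    uint16_t h_dport;
    uint8_t  h_flags;
    uint8_t  h_credit;
    uint8_t  h_padding[4];
    uint16_t h_csum;
    uint8_t  h_exthdr[DEMO_HEADER_EXT_SPACE];
};

_Static_assert(sizeof(struct demo_rds_header) == 48, "48-byte wire header");
_Static_assert(offsetof(struct demo_rds_header, h_exthdr) == 32, "exthdr at 32");

int main(void)
{
    printf("header is %zu bytes\n", sizeof(struct demo_rds_header));
    return 0;
}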
152/*
153 * Reserved - indicates end of extensions
154 */
155#define RDS_EXTHDR_NONE 0
156
157/*
158 * This extension header is included in the very
159 * first message that is sent on a new connection,
160 * and identifies the protocol level. This will help
161 * rolling updates if a future change requires breaking
162 * the protocol.
163 * NB: This is no longer true for IB, where we do a version
164 * negotiation during the connection setup phase (protocol
165 * version information is included in the RDMA CM private data).
166 */
167#define RDS_EXTHDR_VERSION 1
168struct rds_ext_header_version {
169 __be32 h_version;
170};
171
172/*
173 * This extension header is included in the RDS message
174 * chasing an RDMA operation.
175 */
176#define RDS_EXTHDR_RDMA 2
177struct rds_ext_header_rdma {
178 __be32 h_rdma_rkey;
179};
180
181/*
182 * This extension header tells the peer about the
183 * destination <R_Key,offset> of the requested RDMA
184 * operation.
185 */
186#define RDS_EXTHDR_RDMA_DEST 3
187struct rds_ext_header_rdma_dest {
188 __be32 h_rdma_rkey;
189 __be32 h_rdma_offset;
190};
191
192#define __RDS_EXTHDR_MAX 16 /* for now */
193
194struct rds_incoming {
195 atomic_t i_refcount;
196 struct list_head i_item;
197 struct rds_connection *i_conn;
198 struct rds_header i_hdr;
199 unsigned long i_rx_jiffies;
200 __be32 i_saddr;
201
202 rds_rdma_cookie_t i_rdma_cookie;
203};
204
205/*
206 * m_sock_item and m_conn_item are on lists that are serialized under
207 * conn->c_lock. m_sock_item has additional meaning in that once it is empty
208 * the message will not be put back on the retransmit list after being sent.
209 * messages that are canceled while being sent rely on this.
210 *
211 * m_inc is used by loopback so that it can pass an incoming message straight
212 * back up into the rx path. It embeds a wire header which is also used by
213 * the send path, which is kind of awkward.
214 *
215 * m_sock_item indicates the message's presence on a socket's send or receive
216 * queue. m_rs will point to that socket.
217 *
218 * m_daddr is used by cancellation to prune messages to a given destination.
219 *
220 * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
221 * nesting. As paths iterate over messages on a sock, or conn, they must
222 * also lock the conn, or sock, to remove the message from those lists too.
223 * Testing the flag to determine if the message is still on the lists lets
224 * us avoid testing the list_head directly. That means each path can use
225 * the message's list_head to keep it on a local list while juggling locks
226 * without confusing the other path.
227 *
228 * m_ack_seq is an optional field set by transports that need a different
229 * sequence number range to invalidate. They can use this in a callback
230 * that they pass to rds_send_drop_acked() to see if each message has been
231 * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
232 * had ack_seq set yet.
233 */
234#define RDS_MSG_ON_SOCK 1
235#define RDS_MSG_ON_CONN 2
236#define RDS_MSG_HAS_ACK_SEQ 3
237#define RDS_MSG_ACK_REQUIRED 4
238#define RDS_MSG_RETRANSMITTED 5
239#define RDS_MSG_MAPPED 6
240#define RDS_MSG_PAGEVEC 7
241
242struct rds_message {
243 atomic_t m_refcount;
244 struct list_head m_sock_item;
245 struct list_head m_conn_item;
246 struct rds_incoming m_inc;
247 u64 m_ack_seq;
248 __be32 m_daddr;
249 unsigned long m_flags;
250
251 /* Never access m_rs without holding m_rs_lock.
252 * Lock nesting is
253 * rm->m_rs_lock
254 * -> rs->rs_lock
255 */
256 spinlock_t m_rs_lock;
257 struct rds_sock *m_rs;
258 struct rds_rdma_op *m_rdma_op;
259 rds_rdma_cookie_t m_rdma_cookie;
260 struct rds_mr *m_rdma_mr;
261 unsigned int m_nents;
262 unsigned int m_count;
263 struct scatterlist m_sg[0];
264};
265
266/*
267 * The RDS notifier is used (optionally) to tell the application about
268 * completed RDMA operations. Rather than keeping the whole rds message
269 * around on the queue, we allocate a small notifier that is put on the
270 * socket's notifier_list. Notifications are delivered to the application
271 * through control messages.
272 */
273struct rds_notifier {
274 struct list_head n_list;
275 uint64_t n_user_token;
276 int n_status;
277};
278
279/**
280 * struct rds_transport - transport specific behavioural hooks
281 *
282 * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
283 * part of a message. The caller serializes on the send_sem so this
284 * doesn't need to be reentrant for a given conn. The header must be
285 * sent before the data payload. .xmit must be prepared to send a
286 * message with no data payload. .xmit should return the number of
287 * bytes that were sent down the connection, including header bytes.
288 * Returning 0 tells the caller that it doesn't need to perform any
289 * additional work now. This is usually the case when the transport has
290 * filled the sending queue for its connection and will handle
291 * triggering the rds thread to continue the send when space becomes
292 * available. Returning -EAGAIN tells the caller to retry the send
293 * immediately. Returning -ENOMEM tells the caller to retry the send at
294 * some point in the future.
295 *
296 * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
297 * it returns the connection can not call rds_recv_incoming().
298 * This will only be called once after conn_connect returns
299 * non-zero success. The caller serializes this with
300 * the send and connecting paths (xmit_* and conn_*). The
301 * transport is responsible for other serialization, including
302 * rds_recv_incoming(). This is called in process context but
303 * should try hard not to block.
304 *
305 * @xmit_cong_map: This asks the transport to send the local bitmap down the
306 * given connection. XXX get a better story about the bitmap
307 * flag and header.
308 */
309
310struct rds_transport {
311 char t_name[TRANSNAMSIZ];
312 struct list_head t_item;
313 struct module *t_owner;
314 unsigned int t_prefer_loopback:1;
315
316 int (*laddr_check)(__be32 addr);
317 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
318 void (*conn_free)(void *data);
319 int (*conn_connect)(struct rds_connection *conn);
320 void (*conn_shutdown)(struct rds_connection *conn);
321 void (*xmit_prepare)(struct rds_connection *conn);
322 void (*xmit_complete)(struct rds_connection *conn);
323 int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
324 unsigned int hdr_off, unsigned int sg, unsigned int off);
325 int (*xmit_cong_map)(struct rds_connection *conn,
326 struct rds_cong_map *map, unsigned long offset);
327 int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
328 int (*recv)(struct rds_connection *conn);
329 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
330 size_t size);
331 void (*inc_purge)(struct rds_incoming *inc);
332 void (*inc_free)(struct rds_incoming *inc);
333
334 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
335 struct rdma_cm_event *event);
336 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
337 void (*cm_connect_complete)(struct rds_connection *conn,
338 struct rdma_cm_event *event);
339
340 unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
341 unsigned int avail);
342 void (*exit)(void);
343 void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
344 struct rds_sock *rs, u32 *key_ret);
345 void (*sync_mr)(void *trans_private, int direction);
346 void (*free_mr)(void *trans_private, int invalidate);
347 void (*flush_mrs)(void);
348};
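As an illustration of how this ops table is meant to be used (not something this patch adds), a transport module would fill in the hooks and hand the struct to rds_trans_register() from its init function, honouring the .xmit return contract documented above: the number of bytes consumed (header bytes first), 0 when its queue is full and it will reschedule the send worker itself, or -ENOMEM to ask the core to retry later.

/* Hypothetical skeleton -- "example" is not a transport in this patch. */
static int rds_example_xmit(struct rds_connection *conn, struct rds_message *rm,
			    unsigned int hdr_off, unsigned int sg,
			    unsigned int off)
{
	/* A real transport would queue header bytes starting at hdr_off and
	 * then payload starting at sg/off, returning how many bytes it
	 * accepted.  This stub just asks the core to retry later. */
	return -ENOMEM;
}

static struct rds_transport rds_example_transport = {
	.t_name		= "example",
	.t_owner	= THIS_MODULE,
	.xmit		= rds_example_xmit,
	/* conn_alloc, conn_connect, recv, inc_copy_to_user, etc. omitted */
};

The module's init function would then call rds_trans_register(&rds_example_transport) and undo it with rds_trans_unregister() on exit.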
349
350struct rds_sock {
351 struct sock rs_sk;
352
353 u64 rs_user_addr;
354 u64 rs_user_bytes;
355
356 /*
357 * bound_addr used for both incoming and outgoing, no INADDR_ANY
358 * support.
359 */
360 struct rb_node rs_bound_node;
361 __be32 rs_bound_addr;
362 __be32 rs_conn_addr;
363 __be16 rs_bound_port;
364 __be16 rs_conn_port;
365
366 /*
367 * This is only used to communicate the transport between bind and
368 * initiating connections. All other transport use is referenced through
369 * the connection.
370 */
371 struct rds_transport *rs_transport;
372
373 /*
374 * rds_sendmsg caches the conn it used the last time around.
375 * This helps avoid costly lookups.
376 */
377 struct rds_connection *rs_conn;
378
379 /* flag indicating we were congested or not */
380 int rs_congested;
381
382 /* rs_lock protects all these adjacent members before the newline */
383 spinlock_t rs_lock;
384 struct list_head rs_send_queue;
385 u32 rs_snd_bytes;
386 int rs_rcv_bytes;
387 struct list_head rs_notify_queue; /* currently used for failed RDMAs */
388
389 /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
390 * to decide whether the application should be woken up.
391 * If not set, we use rs_cong_track to find out whether a cong map
392 * update arrived.
393 */
394 uint64_t rs_cong_mask;
395 uint64_t rs_cong_notify;
396 struct list_head rs_cong_list;
397 unsigned long rs_cong_track;
398
399 /*
400 * rs_recv_lock protects the receive queue, and is
401 * used to serialize with rds_release.
402 */
403 rwlock_t rs_recv_lock;
404 struct list_head rs_recv_queue;
405
406 /* just for stats reporting */
407 struct list_head rs_item;
408
409 /* these have their own lock */
410 spinlock_t rs_rdma_lock;
411 struct rb_root rs_rdma_keys;
412
413 /* Socket options - in case there will be more */
414 unsigned char rs_recverr,
415 rs_cong_monitor;
416};
417
418static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
419{
420 return container_of(sk, struct rds_sock, rs_sk);
421}
422static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
423{
424 return &rs->rs_sk;
425}
426
427/*
428 * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
429 * to account for overhead. We don't account for overhead; we simply count
430 * payload bytes against the specified value.
431 */
432static inline int rds_sk_sndbuf(struct rds_sock *rs)
433{
434 return rds_rs_to_sk(rs)->sk_sndbuf / 2;
435}
436static inline int rds_sk_rcvbuf(struct rds_sock *rs)
437{
438 return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
439}
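A quick worked example of the halving above, assuming the usual sock_setsockopt() behaviour of doubling the requested buffer size (fd is assumed to be an already-created RDS socket):

	int val = 128 * 1024;	/* ask for 128 KiB of send buffer */

	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
	/* The core stack stores sk_sndbuf = 2 * 128 KiB = 256 KiB to cover
	 * its overhead; rds_sk_sndbuf() halves that again, so
	 * rds_send_queue_rm() in send.c queues at most 128 KiB of payload. */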
440
441struct rds_statistics {
442 uint64_t s_conn_reset;
443 uint64_t s_recv_drop_bad_checksum;
444 uint64_t s_recv_drop_old_seq;
445 uint64_t s_recv_drop_no_sock;
446 uint64_t s_recv_drop_dead_sock;
447 uint64_t s_recv_deliver_raced;
448 uint64_t s_recv_delivered;
449 uint64_t s_recv_queued;
450 uint64_t s_recv_immediate_retry;
451 uint64_t s_recv_delayed_retry;
452 uint64_t s_recv_ack_required;
453 uint64_t s_recv_rdma_bytes;
454 uint64_t s_recv_ping;
455 uint64_t s_send_queue_empty;
456 uint64_t s_send_queue_full;
457 uint64_t s_send_sem_contention;
458 uint64_t s_send_sem_queue_raced;
459 uint64_t s_send_immediate_retry;
460 uint64_t s_send_delayed_retry;
461 uint64_t s_send_drop_acked;
462 uint64_t s_send_ack_required;
463 uint64_t s_send_queued;
464 uint64_t s_send_rdma;
465 uint64_t s_send_rdma_bytes;
466 uint64_t s_send_pong;
467 uint64_t s_page_remainder_hit;
468 uint64_t s_page_remainder_miss;
469 uint64_t s_copy_to_user;
470 uint64_t s_copy_from_user;
471 uint64_t s_cong_update_queued;
472 uint64_t s_cong_update_received;
473 uint64_t s_cong_send_error;
474 uint64_t s_cong_send_blocked;
475};
476
477/* af_rds.c */
478void rds_sock_addref(struct rds_sock *rs);
479void rds_sock_put(struct rds_sock *rs);
480void rds_wake_sk_sleep(struct rds_sock *rs);
481static inline void __rds_wake_sk_sleep(struct sock *sk)
482{
483 wait_queue_head_t *waitq = sk->sk_sleep;
484
485 if (!sock_flag(sk, SOCK_DEAD) && waitq)
486 wake_up(waitq);
487}
488extern wait_queue_head_t rds_poll_waitq;
489
490
491/* bind.c */
492int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
493void rds_remove_bound(struct rds_sock *rs);
494struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
495
496/* cong.c */
497int rds_cong_get_maps(struct rds_connection *conn);
498void rds_cong_add_conn(struct rds_connection *conn);
499void rds_cong_remove_conn(struct rds_connection *conn);
500void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
501void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
502int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
503void rds_cong_queue_updates(struct rds_cong_map *map);
504void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
505int rds_cong_updated_since(unsigned long *recent);
506void rds_cong_add_socket(struct rds_sock *);
507void rds_cong_remove_socket(struct rds_sock *);
508void rds_cong_exit(void);
509struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
510
511/* conn.c */
512int __init rds_conn_init(void);
513void rds_conn_exit(void);
514struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
515 struct rds_transport *trans, gfp_t gfp);
516struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
517 struct rds_transport *trans, gfp_t gfp);
518void rds_conn_destroy(struct rds_connection *conn);
519void rds_conn_reset(struct rds_connection *conn);
520void rds_conn_drop(struct rds_connection *conn);
521void rds_for_each_conn_info(struct socket *sock, unsigned int len,
522 struct rds_info_iterator *iter,
523 struct rds_info_lengths *lens,
524 int (*visitor)(struct rds_connection *, void *),
525 size_t item_len);
526void __rds_conn_error(struct rds_connection *conn, const char *, ...)
527 __attribute__ ((format (printf, 2, 3)));
528#define rds_conn_error(conn, fmt...) \
529 __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
530
531static inline int
532rds_conn_transition(struct rds_connection *conn, int old, int new)
533{
534 return atomic_cmpxchg(&conn->c_state, old, new) == old;
535}
536
537static inline int
538rds_conn_state(struct rds_connection *conn)
539{
540 return atomic_read(&conn->c_state);
541}
542
543static inline int
544rds_conn_up(struct rds_connection *conn)
545{
546 return atomic_read(&conn->c_state) == RDS_CONN_UP;
547}
548
549static inline int
550rds_conn_connecting(struct rds_connection *conn)
551{
552 return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
553}
554
555/* message.c */
556struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
557struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
558 size_t total_len);
559struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
560void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
561 __be16 dport, u64 seq);
562int rds_message_add_extension(struct rds_header *hdr,
563 unsigned int type, const void *data, unsigned int len);
564int rds_message_next_extension(struct rds_header *hdr,
565 unsigned int *pos, void *buf, unsigned int *buflen);
566int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
567int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
568int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
569int rds_message_inc_copy_to_user(struct rds_incoming *inc,
570 struct iovec *first_iov, size_t size);
571void rds_message_inc_purge(struct rds_incoming *inc);
572void rds_message_inc_free(struct rds_incoming *inc);
573void rds_message_addref(struct rds_message *rm);
574void rds_message_put(struct rds_message *rm);
575void rds_message_wait(struct rds_message *rm);
576void rds_message_unmapped(struct rds_message *rm);
577
578static inline void rds_message_make_checksum(struct rds_header *hdr)
579{
580 hdr->h_csum = 0;
581 hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
582}
583
584static inline int rds_message_verify_checksum(const struct rds_header *hdr)
585{
586 return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
587}
588
589
590/* page.c */
591int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
592 gfp_t gfp);
593int rds_page_copy_user(struct page *page, unsigned long offset,
594 void __user *ptr, unsigned long bytes,
595 int to_user);
596#define rds_page_copy_to_user(page, offset, ptr, bytes) \
597 rds_page_copy_user(page, offset, ptr, bytes, 1)
598#define rds_page_copy_from_user(page, offset, ptr, bytes) \
599 rds_page_copy_user(page, offset, ptr, bytes, 0)
600void rds_page_exit(void);
601
602/* recv.c */
603void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
604 __be32 saddr);
605void rds_inc_addref(struct rds_incoming *inc);
606void rds_inc_put(struct rds_incoming *inc);
607void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
608 struct rds_incoming *inc, gfp_t gfp, enum km_type km);
609int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
610 size_t size, int msg_flags);
611void rds_clear_recv_queue(struct rds_sock *rs);
612int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
613void rds_inc_info_copy(struct rds_incoming *inc,
614 struct rds_info_iterator *iter,
615 __be32 saddr, __be32 daddr, int flip);
616
617/* send.c */
618int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
619 size_t payload_len);
620void rds_send_reset(struct rds_connection *conn);
621int rds_send_xmit(struct rds_connection *conn);
622struct sockaddr_in;
623void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
624typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
625void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
626 is_acked_func is_acked);
627int rds_send_acked_before(struct rds_connection *conn, u64 seq);
628void rds_send_remove_from_sock(struct list_head *messages, int status);
629int rds_send_pong(struct rds_connection *conn, __be16 dport);
630struct rds_message *rds_send_get_message(struct rds_connection *,
631 struct rds_rdma_op *);
632
633/* rdma.c */
634void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
635
636/* stats.c */
637DECLARE_PER_CPU(struct rds_statistics, rds_stats);
638#define rds_stats_inc_which(which, member) do { \
639 per_cpu(which, get_cpu()).member++; \
640 put_cpu(); \
641} while (0)
642#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
643#define rds_stats_add_which(which, member, count) do { \
644 per_cpu(which, get_cpu()).member += count; \
645 put_cpu(); \
646} while (0)
647#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
648int __init rds_stats_init(void);
649void rds_stats_exit(void);
650void rds_stats_info_copy(struct rds_info_iterator *iter,
651 uint64_t *values, char **names, size_t nr);
652
653/* sysctl.c */
654int __init rds_sysctl_init(void);
655void rds_sysctl_exit(void);
656extern unsigned long rds_sysctl_sndbuf_min;
657extern unsigned long rds_sysctl_sndbuf_default;
658extern unsigned long rds_sysctl_sndbuf_max;
659extern unsigned long rds_sysctl_reconnect_min_jiffies;
660extern unsigned long rds_sysctl_reconnect_max_jiffies;
661extern unsigned int rds_sysctl_max_unacked_packets;
662extern unsigned int rds_sysctl_max_unacked_bytes;
663extern unsigned int rds_sysctl_ping_enable;
664extern unsigned long rds_sysctl_trace_flags;
665extern unsigned int rds_sysctl_trace_level;
666
667/* threads.c */
668int __init rds_threads_init(void);
669void rds_threads_exit(void);
670extern struct workqueue_struct *rds_wq;
671void rds_connect_worker(struct work_struct *);
672void rds_shutdown_worker(struct work_struct *);
673void rds_send_worker(struct work_struct *);
674void rds_recv_worker(struct work_struct *);
675void rds_connect_complete(struct rds_connection *conn);
676
677/* transport.c */
678int rds_trans_register(struct rds_transport *trans);
679void rds_trans_unregister(struct rds_transport *trans);
680struct rds_transport *rds_trans_get_preferred(__be32 addr);
681unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
682 unsigned int avail);
683int __init rds_trans_init(void);
684void rds_trans_exit(void);
685
686#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 000000000000..f2118c51cfa3
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,542 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36
37#include "rds.h"
38#include "rdma.h"
39
40void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
41 __be32 saddr)
42{
43 atomic_set(&inc->i_refcount, 1);
44 INIT_LIST_HEAD(&inc->i_item);
45 inc->i_conn = conn;
46 inc->i_saddr = saddr;
47 inc->i_rdma_cookie = 0;
48}
49
50void rds_inc_addref(struct rds_incoming *inc)
51{
52 rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
53 atomic_inc(&inc->i_refcount);
54}
55
56void rds_inc_put(struct rds_incoming *inc)
57{
58 rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
59 if (atomic_dec_and_test(&inc->i_refcount)) {
60 BUG_ON(!list_empty(&inc->i_item));
61
62 inc->i_conn->c_trans->inc_free(inc);
63 }
64}
65
66static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
67 struct rds_cong_map *map,
68 int delta, __be16 port)
69{
70 int now_congested;
71
72 if (delta == 0)
73 return;
74
75 rs->rs_rcv_bytes += delta;
76 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
77
78 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
79 "now_cong %d delta %d\n",
80 rs, &rs->rs_bound_addr,
81 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
82 rds_sk_rcvbuf(rs), now_congested, delta);
83
84 /* wasn't -> am congested */
85 if (!rs->rs_congested && now_congested) {
86 rs->rs_congested = 1;
87 rds_cong_set_bit(map, port);
88 rds_cong_queue_updates(map);
89 }
90 /* was -> aren't congested */
91 /* Require more free space before reporting uncongested to prevent
92 bouncing cong/uncong state too often */
93 else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
94 rs->rs_congested = 0;
95 rds_cong_clear_bit(map, port);
96 rds_cong_queue_updates(map);
97 }
98
99 /* do nothing if no change in cong state */
100}
101
102/*
103 * Process all extension headers that come with this message.
104 */
105static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
106{
107 struct rds_header *hdr = &inc->i_hdr;
108 unsigned int pos = 0, type, len;
109 union {
110 struct rds_ext_header_version version;
111 struct rds_ext_header_rdma rdma;
112 struct rds_ext_header_rdma_dest rdma_dest;
113 } buffer;
114
115 while (1) {
116 len = sizeof(buffer);
117 type = rds_message_next_extension(hdr, &pos, &buffer, &len);
118 if (type == RDS_EXTHDR_NONE)
119 break;
120 /* Process extension header here */
121 switch (type) {
122 case RDS_EXTHDR_RDMA:
123 rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
124 break;
125
126 case RDS_EXTHDR_RDMA_DEST:
127 /* We ignore the size for now. We could stash it
128 * somewhere and use it for error checking. */
129 inc->i_rdma_cookie = rds_rdma_make_cookie(
130 be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
131 be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
132
133 break;
134 }
135 }
136}
137
138/*
139 * The transport must make sure that this is serialized against other
140 * rx and conn reset on this specific conn.
141 *
142 * We currently assert that only one fragmented message will be sent
143 * down a connection at a time. This lets us reassemble in the conn
144 * instead of per-flow which means that we don't have to go digging through
145 * flows to tear down partial reassembly progress on conn failure and
146 * we save flow lookup and locking for each frag arrival. It does mean
147 * that small messages will wait behind large ones. Fragmenting at all
148 * is only to reduce the memory consumption of pre-posted buffers.
149 *
150 * The caller passes in saddr and daddr instead of us getting it from the
151 * conn. This lets loopback, who only has one conn for both directions,
152 * tell us which roles the addrs in the conn are playing for this message.
153 */
154void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
155 struct rds_incoming *inc, gfp_t gfp, enum km_type km)
156{
157 struct rds_sock *rs = NULL;
158 struct sock *sk;
159 unsigned long flags;
160
161 inc->i_conn = conn;
162 inc->i_rx_jiffies = jiffies;
163
164 rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
165 "flags 0x%x rx_jiffies %lu\n", conn,
166 (unsigned long long)conn->c_next_rx_seq,
167 inc,
168 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
169 be32_to_cpu(inc->i_hdr.h_len),
170 be16_to_cpu(inc->i_hdr.h_sport),
171 be16_to_cpu(inc->i_hdr.h_dport),
172 inc->i_hdr.h_flags,
173 inc->i_rx_jiffies);
174
175 /*
176 * Sequence numbers should only increase. Messages get their
177 * sequence number as they're queued in a sending conn. They
178 * can be dropped, though, if the sending socket is closed before
179 * they hit the wire. So sequence numbers can skip forward
180 * under normal operation. They can also drop back in the conn
181 * failover case as previously sent messages are resent down the
182 * new instance of a conn. We drop those, otherwise we have
183 * to assume that the next valid seq does not come after a
184 * hole in the fragment stream.
185 *
186 * The headers don't give us a way to realize if fragments of
187 * a message have been dropped. We assume that frags that arrive
188 * to a flow are part of the current message on the flow that is
189 * being reassembled. This means that senders can't drop messages
190 * from the sending conn until all their frags are sent.
191 *
192 * XXX we could spend more on the wire to get more robust failure
193 * detection, arguably worth it to avoid data corruption.
194 */
195 if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
196 && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
197 rds_stats_inc(s_recv_drop_old_seq);
198 goto out;
199 }
200 conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
201
202 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
203 rds_stats_inc(s_recv_ping);
204 rds_send_pong(conn, inc->i_hdr.h_sport);
205 goto out;
206 }
207
208 rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
209 if (rs == NULL) {
210 rds_stats_inc(s_recv_drop_no_sock);
211 goto out;
212 }
213
214 /* Process extension headers */
215 rds_recv_incoming_exthdrs(inc, rs);
216
217 /* We can be racing with rds_release() which marks the socket dead. */
218 sk = rds_rs_to_sk(rs);
219
220 /* serialize with rds_release -> sock_orphan */
221 write_lock_irqsave(&rs->rs_recv_lock, flags);
222 if (!sock_flag(sk, SOCK_DEAD)) {
223 rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
224 rds_stats_inc(s_recv_queued);
225 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
226 be32_to_cpu(inc->i_hdr.h_len),
227 inc->i_hdr.h_dport);
228 rds_inc_addref(inc);
229 list_add_tail(&inc->i_item, &rs->rs_recv_queue);
230 __rds_wake_sk_sleep(sk);
231 } else {
232 rds_stats_inc(s_recv_drop_dead_sock);
233 }
234 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
235
236out:
237 if (rs)
238 rds_sock_put(rs);
239}
240
241/*
242 * be very careful here. This is being called as the condition in
243 * wait_event_*() needs to cope with being called many times.
244 */
245static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
246{
247 unsigned long flags;
248
249 if (*inc == NULL) {
250 read_lock_irqsave(&rs->rs_recv_lock, flags);
251 if (!list_empty(&rs->rs_recv_queue)) {
252 *inc = list_entry(rs->rs_recv_queue.next,
253 struct rds_incoming,
254 i_item);
255 rds_inc_addref(*inc);
256 }
257 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
258 }
259
260 return *inc != NULL;
261}
262
263static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
264 int drop)
265{
266 struct sock *sk = rds_rs_to_sk(rs);
267 int ret = 0;
268 unsigned long flags;
269
270 write_lock_irqsave(&rs->rs_recv_lock, flags);
271 if (!list_empty(&inc->i_item)) {
272 ret = 1;
273 if (drop) {
274 /* XXX make sure this i_conn is reliable */
275 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
276 -be32_to_cpu(inc->i_hdr.h_len),
277 inc->i_hdr.h_dport);
278 list_del_init(&inc->i_item);
279 rds_inc_put(inc);
280 }
281 }
282 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
283
284 rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
285 return ret;
286}
287
288/*
289 * Pull errors off the error queue.
290 * If msghdr is NULL, we will just purge the error queue.
291 */
292int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
293{
294 struct rds_notifier *notifier;
295 struct rds_rdma_notify cmsg;
296 unsigned int count = 0, max_messages = ~0U;
297 unsigned long flags;
298 LIST_HEAD(copy);
299 int err = 0;
300
301
302 /* put_cmsg copies to user space and thus may sleep. We can't do this
303 * with rs_lock held, so first grab as many notifications as we can stuff
304 * in the user provided cmsg buffer. We don't try to copy more, to avoid
305 * losing notifications - except when the buffer is so small that it wouldn't
306 * even hold a single notification. Then we give the caller as much of this single
307 * msg as we can squeeze in, and set MSG_CTRUNC.
308 */
309 if (msghdr) {
310 max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
311 if (!max_messages)
312 max_messages = 1;
313 }
314
315 spin_lock_irqsave(&rs->rs_lock, flags);
316 while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
317 notifier = list_entry(rs->rs_notify_queue.next,
318 struct rds_notifier, n_list);
319 list_move(&notifier->n_list, &copy);
320 count++;
321 }
322 spin_unlock_irqrestore(&rs->rs_lock, flags);
323
324 if (!count)
325 return 0;
326
327 while (!list_empty(&copy)) {
328 notifier = list_entry(copy.next, struct rds_notifier, n_list);
329
330 if (msghdr) {
331 cmsg.user_token = notifier->n_user_token;
332 cmsg.status = notifier->n_status;
333
334 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
335 sizeof(cmsg), &cmsg);
336 if (err)
337 break;
338 }
339
340 list_del_init(&notifier->n_list);
341 kfree(notifier);
342 }
343
344 /* If we bailed out because of an error in put_cmsg,
345 * we may be left with one or more notifications that we
346 * didn't process. Return them to the head of the list. */
347 if (!list_empty(&copy)) {
348 spin_lock_irqsave(&rs->rs_lock, flags);
349 list_splice(&copy, &rs->rs_notify_queue);
350 spin_unlock_irqrestore(&rs->rs_lock, flags);
351 }
352
353 return err;
354}
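For reference, this is roughly what the consumer of these notifications could look like in userspace: the entries queued above come back as SOL_RDS / RDS_CMSG_RDMA_STATUS control messages on the next recvmsg(). The sketch assumes the userspace definitions shipped alongside this patch set (SOL_RDS, RDS_CMSG_RDMA_STATUS and struct rds_rdma_notify with its user_token/status fields) are available through <linux/rds.h>.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rds.h>		/* assumed: userspace header from this patch set */

static void drain_rdma_notifications(int fd)
{
	/* Room for up to 8 notifications; anything beyond that stays queued
	 * for the next call (or is truncated with MSG_CTRUNC if even one
	 * doesn't fit, as described in rds_notify_queue_get() above). */
	char ctl[CMSG_SPACE(sizeof(struct rds_rdma_notify)) * 8];
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = ctl;
	msg.msg_controllen = sizeof(ctl);

	if (recvmsg(fd, &msg, MSG_DONTWAIT) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		struct rds_rdma_notify notify;

		if (cmsg->cmsg_level != SOL_RDS ||
		    cmsg->cmsg_type != RDS_CMSG_RDMA_STATUS)
			continue;
		memcpy(&notify, CMSG_DATA(cmsg), sizeof(notify));
		printf("RDMA op %llu completed, status %d\n",
		       (unsigned long long)notify.user_token,
		       (int)notify.status);
	}
}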
355
356/*
357 * Queue a congestion notification
358 */
359static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
360{
361 uint64_t notify = rs->rs_cong_notify;
362 unsigned long flags;
363 int err;
364
365 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
366 sizeof(notify), &notify);
367 if (err)
368 return err;
369
370 spin_lock_irqsave(&rs->rs_lock, flags);
371 rs->rs_cong_notify &= ~notify;
372 spin_unlock_irqrestore(&rs->rs_lock, flags);
373
374 return 0;
375}
376
377/*
378 * Receive any control messages.
379 */
380static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
381{
382 int ret = 0;
383
384 if (inc->i_rdma_cookie) {
385 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
386 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
387 if (ret)
388 return ret;
389 }
390
391 return 0;
392}
393
394int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
395 size_t size, int msg_flags)
396{
397 struct sock *sk = sock->sk;
398 struct rds_sock *rs = rds_sk_to_rs(sk);
399 long timeo;
400 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
401 struct sockaddr_in *sin;
402 struct rds_incoming *inc = NULL;
403
404 /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
405 timeo = sock_rcvtimeo(sk, nonblock);
406
407 rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
408
409 if (msg_flags & MSG_OOB)
410 goto out;
411
412 /* If there are pending notifications, do those - and nothing else */
413 if (!list_empty(&rs->rs_notify_queue)) {
414 ret = rds_notify_queue_get(rs, msg);
415 goto out;
416 }
417
418 if (rs->rs_cong_notify) {
419 ret = rds_notify_cong(rs, msg);
420 goto out;
421 }
422
423 while (1) {
424 if (!rds_next_incoming(rs, &inc)) {
425 if (nonblock) {
426 ret = -EAGAIN;
427 break;
428 }
429
430 timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
431 rds_next_incoming(rs, &inc),
432 timeo);
433 rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
434 timeo);
435 if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
436 continue;
437
438 ret = timeo;
439 if (ret == 0)
440 ret = -ETIMEDOUT;
441 break;
442 }
443
444 rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
445 &inc->i_conn->c_faddr,
446 ntohs(inc->i_hdr.h_sport));
447 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
448 size);
449 if (ret < 0)
450 break;
451
452 /*
453 * if the message we just copied isn't at the head of the
454 * recv queue then someone else raced us to return it, try
455 * to get the next message.
456 */
457 if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
458 rds_inc_put(inc);
459 inc = NULL;
460 rds_stats_inc(s_recv_deliver_raced);
461 continue;
462 }
463
464 if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
465 if (msg_flags & MSG_TRUNC)
466 ret = be32_to_cpu(inc->i_hdr.h_len);
467 msg->msg_flags |= MSG_TRUNC;
468 }
469
470 if (rds_cmsg_recv(inc, msg)) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 rds_stats_inc(s_recv_delivered);
476
477 sin = (struct sockaddr_in *)msg->msg_name;
478 if (sin) {
479 sin->sin_family = AF_INET;
480 sin->sin_port = inc->i_hdr.h_sport;
481 sin->sin_addr.s_addr = inc->i_saddr;
482 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
483 }
484 break;
485 }
486
487 if (inc)
488 rds_inc_put(inc);
489
490out:
491 return ret;
492}
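Seen from userspace, the receive path above is driven through the ordinary socket calls: RDS sockets are created as SOCK_SEQPACKET, addressed with sockaddr_in, and the source address filled in at the end of rds_recvmsg() lands in msg_name. A minimal sketch follows; AF_RDS is assumed to come from the headers added alongside this patch set (with 21 as a fallback guess), and the bound address and port are placeholders for a local interface that one of the registered transports accepts.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef AF_RDS
#define AF_RDS 21		/* assumption: value used by this patch set */
#endif

int main(void)
{
	struct sockaddr_in sin, from;
	char payload[4096];
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	struct msghdr msg;
	ssize_t len;
	int fd;

	fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
	if (fd < 0)
		return 1;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(18634);			/* arbitrary example port */
	sin.sin_addr.s_addr = inet_addr("192.0.2.10");	/* placeholder local addr */
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		return 1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &from;
	msg.msg_namelen = sizeof(from);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	len = recvmsg(fd, &msg, 0);		/* blocks until a datagram arrives */
	if (len >= 0)
		printf("%zd bytes from %s:%u\n", len,
		       inet_ntoa(from.sin_addr), ntohs(from.sin_port));
	return 0;
}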
493
494/*
495 * The socket is being shut down and we're asked to drop messages that were
496 * queued for recvmsg. The caller has unbound the socket so the receive path
497 * won't queue any more incoming fragments or messages on the socket.
498 */
499void rds_clear_recv_queue(struct rds_sock *rs)
500{
501 struct sock *sk = rds_rs_to_sk(rs);
502 struct rds_incoming *inc, *tmp;
503 unsigned long flags;
504
505 write_lock_irqsave(&rs->rs_recv_lock, flags);
506 list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
507 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
508 -be32_to_cpu(inc->i_hdr.h_len),
509 inc->i_hdr.h_dport);
510 list_del_init(&inc->i_item);
511 rds_inc_put(inc);
512 }
513 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
514}
515
516/*
517 * inc->i_saddr isn't used here because it is only set in the receive
518 * path.
519 */
520void rds_inc_info_copy(struct rds_incoming *inc,
521 struct rds_info_iterator *iter,
522 __be32 saddr, __be32 daddr, int flip)
523{
524 struct rds_info_message minfo;
525
526 minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
527 minfo.len = be32_to_cpu(inc->i_hdr.h_len);
528
529 if (flip) {
530 minfo.laddr = daddr;
531 minfo.faddr = saddr;
532 minfo.lport = inc->i_hdr.h_dport;
533 minfo.fport = inc->i_hdr.h_sport;
534 } else {
535 minfo.laddr = saddr;
536 minfo.faddr = daddr;
537 minfo.lport = inc->i_hdr.h_sport;
538 minfo.fport = inc->i_hdr.h_dport;
539 }
540
541 rds_info_copy(iter, &minfo, sizeof(minfo));
542}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 000000000000..1b37364656f0
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1003 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36#include <linux/list.h>
37
38#include "rds.h"
39#include "rdma.h"
40
41/* When transmitting messages in rds_send_xmit, we need to emerge from
42 * time to time and briefly release the CPU. Otherwise the softlockup watchdog
43 * will kick our shin.
44 * Also, it seems fairer to not let one busy connection stall all the
45 * others.
46 *
47 * send_batch_count is the number of times we'll loop in send_xmit. Setting
48 * it to 0 will restore the old behavior (where we looped until we had
49 * drained the queue).
50 */
51static int send_batch_count = 64;
52module_param(send_batch_count, int, 0444);
53MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
54
55/*
56 * Reset the send state. Caller must hold c_send_lock when calling here.
57 */
58void rds_send_reset(struct rds_connection *conn)
59{
60 struct rds_message *rm, *tmp;
61 unsigned long flags;
62
63 if (conn->c_xmit_rm) {
64 /* Tell the user the RDMA op is no longer mapped by the
65 * transport. This isn't entirely true (it's flushed out
66 * independently) but as the connection is down, there's
67 * no ongoing RDMA to/from that memory */
68 rds_message_unmapped(conn->c_xmit_rm);
69 rds_message_put(conn->c_xmit_rm);
70 conn->c_xmit_rm = NULL;
71 }
72 conn->c_xmit_sg = 0;
73 conn->c_xmit_hdr_off = 0;
74 conn->c_xmit_data_off = 0;
75 conn->c_xmit_rdma_sent = 0;
76
77 conn->c_map_queued = 0;
78
79 conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
80 conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
81
82 /* Mark messages as retransmissions, and move them to the send q */
83 spin_lock_irqsave(&conn->c_lock, flags);
84 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
85 set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
86 set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
87 }
88 list_splice_init(&conn->c_retrans, &conn->c_send_queue);
89 spin_unlock_irqrestore(&conn->c_lock, flags);
90}
91
92/*
93 * We're making the conscious trade-off here to only send one message
94 * down the connection at a time.
95 * Pro:
96 * - tx queueing is a simple fifo list
97 * - reassembly is optional and easily done by transports per conn
98 * - no per flow rx lookup at all, straight to the socket
99 * - less per-frag memory and wire overhead
100 * Con:
101 * - queued acks can be delayed behind large messages
102 * Depends:
103 * - small message latency is higher behind queued large messages
104 * - large message latency isn't starved by intervening small sends
105 */
106int rds_send_xmit(struct rds_connection *conn)
107{
108 struct rds_message *rm;
109 unsigned long flags;
110 unsigned int tmp;
111 unsigned int send_quota = send_batch_count;
112 struct scatterlist *sg;
113 int ret = 0;
114 int was_empty = 0;
115 LIST_HEAD(to_be_dropped);
116
117 /*
118 * sendmsg calls here after having queued its message on the send
119 * queue. We only have one task feeding the connection at a time. If
120 * another thread is already feeding the queue then we back off. This
121 * avoids blocking the caller and trading per-connection data between
122 * caches per message.
123 *
124 * The sem holder will issue a retry if they notice that someone queued
125 * a message after they stopped walking the send queue but before they
126 * dropped the sem.
127 */
128 if (!mutex_trylock(&conn->c_send_lock)) {
129 rds_stats_inc(s_send_sem_contention);
130 ret = -ENOMEM;
131 goto out;
132 }
133
134 if (conn->c_trans->xmit_prepare)
135 conn->c_trans->xmit_prepare(conn);
136
137 /*
138 * spin trying to push headers and data down the connection until
139 * the connection doesn't make forward progress.
140 */
141 while (--send_quota) {
142 /*
143 * See if need to send a congestion map update if we're
144 * between sending messages. The send_sem protects our sole
145 * use of c_map_offset and _bytes.
146 * Note this is used only by transports that define a special
147 * xmit_cong_map function. For all others, we allocate
148 * a cong_map message and treat it just like any other send.
149 */
150 if (conn->c_map_bytes) {
151 ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
152 conn->c_map_offset);
153 if (ret <= 0)
154 break;
155
156 conn->c_map_offset += ret;
157 conn->c_map_bytes -= ret;
158 if (conn->c_map_bytes)
159 continue;
160 }
161
162 /* If we're done sending the current message, clear the
163 * offset and S/G temporaries.
164 */
165 rm = conn->c_xmit_rm;
166 if (rm != NULL &&
167 conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
168 conn->c_xmit_sg == rm->m_nents) {
169 conn->c_xmit_rm = NULL;
170 conn->c_xmit_sg = 0;
171 conn->c_xmit_hdr_off = 0;
172 conn->c_xmit_data_off = 0;
173 conn->c_xmit_rdma_sent = 0;
174
175 /* Release the reference to the previous message. */
176 rds_message_put(rm);
177 rm = NULL;
178 }
179
180 /* If we're asked to send a cong map update, do so.
181 */
182 if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
183 if (conn->c_trans->xmit_cong_map != NULL) {
184 conn->c_map_offset = 0;
185 conn->c_map_bytes = sizeof(struct rds_header) +
186 RDS_CONG_MAP_BYTES;
187 continue;
188 }
189
190 rm = rds_cong_update_alloc(conn);
191 if (IS_ERR(rm)) {
192 ret = PTR_ERR(rm);
193 break;
194 }
195
196 conn->c_xmit_rm = rm;
197 }
198
199 /*
200 * Grab the next message from the send queue, if there is one.
201 *
202 * c_xmit_rm holds a ref while we're sending this message down
203 * the connection. We can use this ref while holding the
204 * send_sem.. rds_send_reset() is serialized with it.
205 */
206 if (rm == NULL) {
207 unsigned int len;
208
209 spin_lock_irqsave(&conn->c_lock, flags);
210
211 if (!list_empty(&conn->c_send_queue)) {
212 rm = list_entry(conn->c_send_queue.next,
213 struct rds_message,
214 m_conn_item);
215 rds_message_addref(rm);
216
217 /*
218 * Move the message from the send queue to the retransmit
219 * list right away.
220 */
221 list_move_tail(&rm->m_conn_item, &conn->c_retrans);
222 }
223
224 spin_unlock_irqrestore(&conn->c_lock, flags);
225
226 if (rm == NULL) {
227 was_empty = 1;
228 break;
229 }
230
231 /* Unfortunately, the way Infiniband deals with
232 * RDMA to a bad MR key is by moving the entire
233 * queue pair to error state. We could possibly
234 * recover from that, but right now we drop the
235 * connection.
236 * Therefore, we never retransmit messages with RDMA ops.
237 */
238 if (rm->m_rdma_op
239 && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
240 spin_lock_irqsave(&conn->c_lock, flags);
241 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
242 list_move(&rm->m_conn_item, &to_be_dropped);
243 spin_unlock_irqrestore(&conn->c_lock, flags);
244 rds_message_put(rm);
245 continue;
246 }
247
248 /* Require an ACK every once in a while */
249 len = ntohl(rm->m_inc.i_hdr.h_len);
250 if (conn->c_unacked_packets == 0
251 || conn->c_unacked_bytes < len) {
252 __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
253
254 conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
255 conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
256 rds_stats_inc(s_send_ack_required);
257 } else {
258 conn->c_unacked_bytes -= len;
259 conn->c_unacked_packets--;
260 }
261
262 conn->c_xmit_rm = rm;
263 }
264
265 /*
266 * Try and send an rdma message. Let's see if we can
267 * keep this simple and require that the transport either
268 * send the whole rdma or none of it.
269 */
270 if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
271 ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
272 if (ret)
273 break;
274 conn->c_xmit_rdma_sent = 1;
275 /* The transport owns the mapped memory for now.
276 * You can't unmap it while it's on the send queue */
277 set_bit(RDS_MSG_MAPPED, &rm->m_flags);
278 }
279
280 if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
281 conn->c_xmit_sg < rm->m_nents) {
282 ret = conn->c_trans->xmit(conn, rm,
283 conn->c_xmit_hdr_off,
284 conn->c_xmit_sg,
285 conn->c_xmit_data_off);
286 if (ret <= 0)
287 break;
288
289 if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
290 tmp = min_t(int, ret,
291 sizeof(struct rds_header) -
292 conn->c_xmit_hdr_off);
293 conn->c_xmit_hdr_off += tmp;
294 ret -= tmp;
295 }
296
297 sg = &rm->m_sg[conn->c_xmit_sg];
298 while (ret) {
299 tmp = min_t(int, ret, sg->length -
300 conn->c_xmit_data_off);
301 conn->c_xmit_data_off += tmp;
302 ret -= tmp;
303 if (conn->c_xmit_data_off == sg->length) {
304 conn->c_xmit_data_off = 0;
305 sg++;
306 conn->c_xmit_sg++;
307 BUG_ON(ret != 0 &&
308 conn->c_xmit_sg == rm->m_nents);
309 }
310 }
311 }
312 }
313
314 /* Nuke any messages we decided not to retransmit. */
315 if (!list_empty(&to_be_dropped))
316 rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
317
318 if (conn->c_trans->xmit_complete)
319 conn->c_trans->xmit_complete(conn);
320
321 /*
322 * We might be racing with another sender who queued a message but
323 * backed off on noticing that we held the c_send_lock. If we check
324 * for queued messages after dropping the sem then either we'll
325 * see the queued message or the queuer will get the sem. If we
326 * notice the queued message then we trigger an immediate retry.
327 *
328 * We need to be careful only to do this when we stopped processing
329 * the send queue because it was empty. It's the only way we
330 * stop processing the loop when the transport hasn't taken
331 * responsibility for forward progress.
332 */
333 mutex_unlock(&conn->c_send_lock);
334
335 if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
336 /* We exhausted the send quota, but there's work left to
337 * do. Return and (re-)schedule the send worker.
338 */
339 ret = -EAGAIN;
340 }
341
342 if (ret == 0 && was_empty) {
343 /* A simple bit test would be way faster than taking the
344 * spin lock */
345 spin_lock_irqsave(&conn->c_lock, flags);
346 if (!list_empty(&conn->c_send_queue)) {
347 rds_stats_inc(s_send_sem_queue_raced);
348 ret = -EAGAIN;
349 }
350 spin_unlock_irqrestore(&conn->c_lock, flags);
351 }
352out:
353 return ret;
354}
355
356static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
357{
358 u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
359
360 assert_spin_locked(&rs->rs_lock);
361
362 BUG_ON(rs->rs_snd_bytes < len);
363 rs->rs_snd_bytes -= len;
364
365 if (rs->rs_snd_bytes == 0)
366 rds_stats_inc(s_send_queue_empty);
367}
368
369static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
370 is_acked_func is_acked)
371{
372 if (is_acked)
373 return is_acked(rm, ack);
374 return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
375}
376
377/*
378 * Returns true if there are no messages on the send and retransmit queues
379 * which have a sequence number greater than or equal to the given sequence
380 * number.
381 */
382int rds_send_acked_before(struct rds_connection *conn, u64 seq)
383{
384 struct rds_message *rm, *tmp;
385 int ret = 1;
386
387 spin_lock(&conn->c_lock);
388
389 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
390 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
391 ret = 0;
392 break;
393 }
394
395 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
396 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
397 ret = 0;
398 break;
399 }
400
401 spin_unlock(&conn->c_lock);
402
403 return ret;
404}
405
406/*
407 * This is pretty similar to what happens below in the ACK
408 * handling code - except that we call here as soon as we get
409 * the IB send completion on the RDMA op and the accompanying
410 * message.
411 */
412void rds_rdma_send_complete(struct rds_message *rm, int status)
413{
414 struct rds_sock *rs = NULL;
415 struct rds_rdma_op *ro;
416 struct rds_notifier *notifier;
417
418 spin_lock(&rm->m_rs_lock);
419
420 ro = rm->m_rdma_op;
421 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
422 && ro && ro->r_notify && ro->r_notifier) {
423 notifier = ro->r_notifier;
424 rs = rm->m_rs;
425 sock_hold(rds_rs_to_sk(rs));
426
427 notifier->n_status = status;
428 spin_lock(&rs->rs_lock);
429 list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
430 spin_unlock(&rs->rs_lock);
431
432 ro->r_notifier = NULL;
433 }
434
435 spin_unlock(&rm->m_rs_lock);
436
437 if (rs) {
438 rds_wake_sk_sleep(rs);
439 sock_put(rds_rs_to_sk(rs));
440 }
441}
442
443/*
444 * This is the same as rds_rdma_send_complete except we
445 * don't do any locking - we have all the ingredients (message,
446 * socket, socket lock) and can just move the notifier.
447 */
448static inline void
449__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
450{
451 struct rds_rdma_op *ro;
452
453 ro = rm->m_rdma_op;
454 if (ro && ro->r_notify && ro->r_notifier) {
455 ro->r_notifier->n_status = status;
456 list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
457 ro->r_notifier = NULL;
458 }
459
460 /* No need to wake the app - caller does this */
461}
462
463/*
464 * This is called from the IB send completion when we detect
465 * a RDMA operation that failed with remote access error.
466 * So speed is not an issue here.
467 */
468struct rds_message *rds_send_get_message(struct rds_connection *conn,
469 struct rds_rdma_op *op)
470{
471 struct rds_message *rm, *tmp, *found = NULL;
472 unsigned long flags;
473
474 spin_lock_irqsave(&conn->c_lock, flags);
475
476 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
477 if (rm->m_rdma_op == op) {
478 atomic_inc(&rm->m_refcount);
479 found = rm;
480 goto out;
481 }
482 }
483
484 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
485 if (rm->m_rdma_op == op) {
486 atomic_inc(&rm->m_refcount);
487 found = rm;
488 break;
489 }
490 }
491
492out:
493 spin_unlock_irqrestore(&conn->c_lock, flags);
494
495 return found;
496}
497
498/*
499 * This removes messages from the socket's list if they're on it. The list
500 * argument must be private to the caller, we must be able to modify it
501 * without locks. The messages must have a reference held for their
502 * position on the list. This function will drop that reference after
503 * removing the messages from the 'messages' list regardless of if it found
504 * the messages on the socket list or not.
505 */
506void rds_send_remove_from_sock(struct list_head *messages, int status)
507{
508 unsigned long flags = 0; /* silence gcc :P */
509 struct rds_sock *rs = NULL;
510 struct rds_message *rm;
511
512 local_irq_save(flags);
513 while (!list_empty(messages)) {
514 rm = list_entry(messages->next, struct rds_message,
515 m_conn_item);
516 list_del_init(&rm->m_conn_item);
517
518 /*
519 * If we see this flag cleared then we're *sure* that someone
520 * else beat us to removing it from the sock. If we race
521 * with their flag update we'll get the lock and then really
522 * see that the flag has been cleared.
523 *
524 * The message spinlock makes sure nobody clears rm->m_rs
525 * while we're messing with it. It does not prevent the
526 * message from being removed from the socket, though.
527 */
528 spin_lock(&rm->m_rs_lock);
529 if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
530 goto unlock_and_drop;
531
532 if (rs != rm->m_rs) {
533 if (rs) {
534 spin_unlock(&rs->rs_lock);
535 rds_wake_sk_sleep(rs);
536 sock_put(rds_rs_to_sk(rs));
537 }
538 rs = rm->m_rs;
539 spin_lock(&rs->rs_lock);
540 sock_hold(rds_rs_to_sk(rs));
541 }
542
543 if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
544 struct rds_rdma_op *ro = rm->m_rdma_op;
545 struct rds_notifier *notifier;
546
547 list_del_init(&rm->m_sock_item);
548 rds_send_sndbuf_remove(rs, rm);
549
550 if (ro && ro->r_notifier
551 && (status || ro->r_notify)) {
552 notifier = ro->r_notifier;
553 list_add_tail(&notifier->n_list,
554 &rs->rs_notify_queue);
555 if (!notifier->n_status)
556 notifier->n_status = status;
557 rm->m_rdma_op->r_notifier = NULL;
558 }
559 rds_message_put(rm);
560 rm->m_rs = NULL;
561 }
562
563unlock_and_drop:
564 spin_unlock(&rm->m_rs_lock);
565 rds_message_put(rm);
566 }
567
568 if (rs) {
569 spin_unlock(&rs->rs_lock);
570 rds_wake_sk_sleep(rs);
571 sock_put(rds_rs_to_sk(rs));
572 }
573 local_irq_restore(flags);
574}
575
576/*
577 * Transports call here when they've determined that the receiver queued
578 * messages up to, and including, the given sequence number. Messages are
579 * moved to the retrans queue when rds_send_xmit picks them off the send
580 * queue. This means that in the TCP case, the message may not have been
581 * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
582 * checks the RDS_MSG_HAS_ACK_SEQ bit.
583 *
584 * XXX It's not clear to me how this is safely serialized with socket
585 * destruction. Maybe it should bail if it sees SOCK_DEAD.
586 */
587void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
588 is_acked_func is_acked)
589{
590 struct rds_message *rm, *tmp;
591 unsigned long flags;
592 LIST_HEAD(list);
593
594 spin_lock_irqsave(&conn->c_lock, flags);
595
596 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
597 if (!rds_send_is_acked(rm, ack, is_acked))
598 break;
599
600 list_move(&rm->m_conn_item, &list);
601 clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
602 }
603
604 /* order flag updates with spin locks */
605 if (!list_empty(&list))
606 smp_mb__after_clear_bit();
607
608 spin_unlock_irqrestore(&conn->c_lock, flags);
609
610 /* now remove the messages from the sock list as needed */
611 rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
612}
613
614void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
615{
616 struct rds_message *rm, *tmp;
617 struct rds_connection *conn;
618 unsigned long flags;
619 LIST_HEAD(list);
620 int wake = 0;
621
622 /* get all the messages we're dropping under the rs lock */
623 spin_lock_irqsave(&rs->rs_lock, flags);
624
625 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
626 if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
627 dest->sin_port != rm->m_inc.i_hdr.h_dport))
628 continue;
629
630 wake = 1;
631 list_move(&rm->m_sock_item, &list);
632 rds_send_sndbuf_remove(rs, rm);
633 clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
634
635 /* If this is a RDMA operation, notify the app. */
636 __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
637 }
638
639 /* order flag updates with the rs lock */
640 if (wake)
641 smp_mb__after_clear_bit();
642
643 spin_unlock_irqrestore(&rs->rs_lock, flags);
644
645 if (wake)
646 rds_wake_sk_sleep(rs);
647
648 conn = NULL;
649
650 /* now remove the messages from the conn list as needed */
651 list_for_each_entry(rm, &list, m_sock_item) {
652 /* We do this here rather than in the loop above, so that
653 * we don't have to nest m_rs_lock under rs->rs_lock */
654 spin_lock(&rm->m_rs_lock);
655 rm->m_rs = NULL;
656 spin_unlock(&rm->m_rs_lock);
657
658 /*
659 * If we see this flag cleared then we're *sure* that someone
660 * else beat us to removing it from the conn. If we race
661 * with their flag update we'll get the lock and then really
662 * see that the flag has been cleared.
663 */
664 if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
665 continue;
666
667 if (conn != rm->m_inc.i_conn) {
668 if (conn)
669 spin_unlock_irqrestore(&conn->c_lock, flags);
670 conn = rm->m_inc.i_conn;
671 spin_lock_irqsave(&conn->c_lock, flags);
672 }
673
674 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
675 list_del_init(&rm->m_conn_item);
676 rds_message_put(rm);
677 }
678 }
679
680 if (conn)
681 spin_unlock_irqrestore(&conn->c_lock, flags);
682
683 while (!list_empty(&list)) {
684 rm = list_entry(list.next, struct rds_message, m_sock_item);
685 list_del_init(&rm->m_sock_item);
686
687 rds_message_wait(rm);
688 rds_message_put(rm);
689 }
690}
691
692/*
693 * we only want this to fire once so we use the caller's 'queued'. It's
694 * possible that another thread can race with us and remove the
695 * message from the flow with RDS_CANCEL_SENT_TO.
696 */
697static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
698 struct rds_message *rm, __be16 sport,
699 __be16 dport, int *queued)
700{
701 unsigned long flags;
702 u32 len;
703
704 if (*queued)
705 goto out;
706
707 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
708
709 /* this is the only place which holds both the socket's rs_lock
710 * and the connection's c_lock */
711 spin_lock_irqsave(&rs->rs_lock, flags);
712
713 /*
714 * If there is a little space in sndbuf, we don't queue anything,
715 * and userspace gets -EAGAIN. But poll() indicates there's send
716 * room. This can lead to bad behavior (spinning) if snd_bytes isn't
717 * freed up by incoming acks. So we check the *old* value of
718 * rs_snd_bytes here to allow the last msg to exceed the buffer,
719 * and poll() now knows no more data can be sent.
720 */
721 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
722 rs->rs_snd_bytes += len;
723
724 /* let recv side know we are close to send space exhaustion.
725 * This is probably not the optimal way to do it, as this
726 * means we set the flag on *all* messages as soon as our
727 * throughput hits a certain threshold.
728 */
729 if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
730 __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
731
732 list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
733 set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
734 rds_message_addref(rm);
735 rm->m_rs = rs;
736
737 /* The code ordering is a little weird, but we're
738 trying to minimize the time we hold c_lock */
739 rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
740 rm->m_inc.i_conn = conn;
741 rds_message_addref(rm);
742
743 spin_lock(&conn->c_lock);
744 rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
745 list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
746 set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
747 spin_unlock(&conn->c_lock);
748
749 rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
750 rm, len, rs, rs->rs_snd_bytes,
751 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
752
753 *queued = 1;
754 }
755
756 spin_unlock_irqrestore(&rs->rs_lock, flags);
757out:
758 return *queued;
759}
760
761static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
762 struct msghdr *msg, int *allocated_mr)
763{
764 struct cmsghdr *cmsg;
765 int ret = 0;
766
767 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
768 if (!CMSG_OK(msg, cmsg))
769 return -EINVAL;
770
771 if (cmsg->cmsg_level != SOL_RDS)
772 continue;
773
774 /* As a side effect, RDMA_DEST and RDMA_MAP will set
775 * rm->m_rdma_cookie and rm->m_rdma_mr.
776 */
777 switch (cmsg->cmsg_type) {
778 case RDS_CMSG_RDMA_ARGS:
779 ret = rds_cmsg_rdma_args(rs, rm, cmsg);
780 break;
781
782 case RDS_CMSG_RDMA_DEST:
783 ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
784 break;
785
786 case RDS_CMSG_RDMA_MAP:
787 ret = rds_cmsg_rdma_map(rs, rm, cmsg);
788 if (!ret)
789 *allocated_mr = 1;
790 break;
791
792 default:
793 return -EINVAL;
794 }
795
796 if (ret)
797 break;
798 }
799
800 return ret;
801}
802
803int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
804 size_t payload_len)
805{
806 struct sock *sk = sock->sk;
807 struct rds_sock *rs = rds_sk_to_rs(sk);
808 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
809 __be32 daddr;
810 __be16 dport;
811 struct rds_message *rm = NULL;
812 struct rds_connection *conn;
813 int ret = 0;
814 int queued = 0, allocated_mr = 0;
815 int nonblock = msg->msg_flags & MSG_DONTWAIT;
816 long timeo = sock_sndtimeo(sk, nonblock);
817
818 /* Mirror Linux UDP's handling of BSD error message compatibility */
819 /* XXX: Perhaps MSG_MORE someday */
820 if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
821 printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
822 ret = -EOPNOTSUPP;
823 goto out;
824 }
825
826 if (msg->msg_namelen) {
827 /* XXX fail non-unicast destination IPs? */
828 if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
829 ret = -EINVAL;
830 goto out;
831 }
832 daddr = usin->sin_addr.s_addr;
833 dport = usin->sin_port;
834 } else {
835 /* We only care about consistency with ->connect() */
836 lock_sock(sk);
837 daddr = rs->rs_conn_addr;
838 dport = rs->rs_conn_port;
839 release_sock(sk);
840 }
841
842 /* racing with another thread binding seems ok here */
843 if (daddr == 0 || rs->rs_bound_addr == 0) {
844 ret = -ENOTCONN; /* XXX not a great errno */
845 goto out;
846 }
847
848 rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
849 if (IS_ERR(rm)) {
850 ret = PTR_ERR(rm);
851 rm = NULL;
852 goto out;
853 }
854
855 rm->m_daddr = daddr;
856
857 /* Parse any control messages the user may have included. */
858 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
859 if (ret)
860 goto out;
861
862 /* rds_conn_create has a spinlock that runs with IRQ off.
863 * Caching the conn in the socket helps a lot. */
864 if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
865 conn = rs->rs_conn;
866 else {
867 conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
868 rs->rs_transport,
869 sock->sk->sk_allocation);
870 if (IS_ERR(conn)) {
871 ret = PTR_ERR(conn);
872 goto out;
873 }
874 rs->rs_conn = conn;
875 }
876
877 if ((rm->m_rdma_cookie || rm->m_rdma_op)
878 && conn->c_trans->xmit_rdma == NULL) {
879 if (printk_ratelimit())
880 printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
881 rm->m_rdma_op, conn->c_trans->xmit_rdma);
882 ret = -EOPNOTSUPP;
883 goto out;
884 }
885
886 /* If the connection is down, trigger a connect. We may
887 * have scheduled a delayed reconnect however - in this case
888 * we should not interfere.
889 */
890 if (rds_conn_state(conn) == RDS_CONN_DOWN
891 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
892 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
893
894 ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
895 if (ret)
896 goto out;
897
898 while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
899 dport, &queued)) {
900 rds_stats_inc(s_send_queue_full);
901 /* XXX make sure this is reasonable */
902 if (payload_len > rds_sk_sndbuf(rs)) {
903 ret = -EMSGSIZE;
904 goto out;
905 }
906 if (nonblock) {
907 ret = -EAGAIN;
908 goto out;
909 }
910
911 timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
912 rds_send_queue_rm(rs, conn, rm,
913 rs->rs_bound_port,
914 dport,
915 &queued),
916 timeo);
917 rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
918 if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
919 continue;
920
921 ret = timeo;
922 if (ret == 0)
923 ret = -ETIMEDOUT;
924 goto out;
925 }
926
927 /*
928 * By now we've committed to the send. We reuse rds_send_worker()
929 * to retry sends in the rds thread if the transport asks us to.
930 */
931 rds_stats_inc(s_send_queued);
932
933 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
934 rds_send_worker(&conn->c_send_w.work);
935
936 rds_message_put(rm);
937 return payload_len;
938
939out:
940 /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
941 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
942 * or in any other way, we need to destroy the MR again */
943 if (allocated_mr)
944 rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
945
946 if (rm)
947 rds_message_put(rm);
948 return ret;
949}
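/*
 * Editor's sketch, not part of the patch: how userspace reaches the
 * rds_sendmsg() path above.  The PF_RDS value and the SOCK_SEQPACKET
 * socket type are assumptions taken from the RDS userspace headers of
 * the era; error handling is omitted for brevity.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

#ifndef PF_RDS
#define PF_RDS 21		/* assumed value; normally from the RDS headers */
#endif

static ssize_t rds_send_example(const char *src_ip, const char *dst_ip,
				unsigned short port, const void *buf, size_t len)
{
	int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
	struct sockaddr_in sin;

	/* bind() supplies the local address checked via rs_bound_addr above */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr(src_ip);
	sin.sin_port = htons(port);
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));

	/* the destination is an AF_INET sockaddr, exactly as parsed above */
	sin.sin_addr.s_addr = inet_addr(dst_ip);
	return sendto(fd, buf, len, 0, (struct sockaddr *)&sin, sizeof(sin));
}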
950
951/*
952 * Reply to a ping packet.
953 */
954int
955rds_send_pong(struct rds_connection *conn, __be16 dport)
956{
957 struct rds_message *rm;
958 unsigned long flags;
959 int ret = 0;
960
961 rm = rds_message_alloc(0, GFP_ATOMIC);
962 if (rm == NULL) {
963 ret = -ENOMEM;
964 goto out;
965 }
966
967 rm->m_daddr = conn->c_faddr;
968
969 /* If the connection is down, trigger a connect. We may
970 * have scheduled a delayed reconnect however - in this case
971 * we should not interfere.
972 */
973 if (rds_conn_state(conn) == RDS_CONN_DOWN
974 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
975 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
976
977 ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
978 if (ret)
979 goto out;
980
981 spin_lock_irqsave(&conn->c_lock, flags);
982 list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
983 set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
984 rds_message_addref(rm);
985 rm->m_inc.i_conn = conn;
986
987 rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
988 conn->c_next_tx_seq);
989 conn->c_next_tx_seq++;
990 spin_unlock_irqrestore(&conn->c_lock, flags);
991
992 rds_stats_inc(s_send_queued);
993 rds_stats_inc(s_send_pong);
994
995 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
996 rds_message_put(rm);
997 return 0;
998
999out:
1000 if (rm)
1001 rds_message_put(rm);
1002 return ret;
1003}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 000000000000..637146893cf3
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,148 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
40
41/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
42
43static char *rds_stat_names[] = {
44 "conn_reset",
45 "recv_drop_bad_checksum",
46 "recv_drop_old_seq",
47 "recv_drop_no_sock",
48 "recv_drop_dead_sock",
49 "recv_deliver_raced",
50 "recv_delivered",
51 "recv_queued",
52 "recv_immediate_retry",
53 "recv_delayed_retry",
54 "recv_ack_required",
55 "recv_rdma_bytes",
56 "recv_ping",
57 "send_queue_empty",
58 "send_queue_full",
59 "send_sem_contention",
60 "send_sem_queue_raced",
61 "send_immediate_retry",
62 "send_delayed_retry",
63 "send_drop_acked",
64 "send_ack_required",
65 "send_queued",
66 "send_rdma",
67 "send_rdma_bytes",
68 "send_pong",
69 "page_remainder_hit",
70 "page_remainder_miss",
71 "copy_to_user",
72 "copy_from_user",
73 "cong_update_queued",
74 "cong_update_received",
75 "cong_send_error",
76 "cong_send_blocked",
77};
78
79void rds_stats_info_copy(struct rds_info_iterator *iter,
80 uint64_t *values, char **names, size_t nr)
81{
82 struct rds_info_counter ctr;
83 size_t i;
84
85 for (i = 0; i < nr; i++) {
86 BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
87 strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
88 ctr.value = values[i];
89
90 rds_info_copy(iter, &ctr, sizeof(ctr));
91 }
92}
93
94/*
95 * This gives global counters across all the transports. The strings
96 * are copied in so that the tool doesn't need knowledge of the specific
97 * stats that we're exporting. Some are pretty implementation dependent
98 * and may change over time. That doesn't stop them from being useful.
99 *
100 * This is the only function in the chain that knows about the byte granular
101 * length in userspace. It converts it to number of stat entries that the
102 * rest of the functions operate in.
103 */
104static void rds_stats_info(struct socket *sock, unsigned int len,
105 struct rds_info_iterator *iter,
106 struct rds_info_lengths *lens)
107{
108 struct rds_statistics stats = {0, };
109 uint64_t *src;
110 uint64_t *sum;
111 size_t i;
112 int cpu;
113 unsigned int avail;
114
115 avail = len / sizeof(struct rds_info_counter);
116
117 if (avail < ARRAY_SIZE(rds_stat_names)) {
118 avail = 0;
119 goto trans;
120 }
121
122 for_each_online_cpu(cpu) {
123 src = (uint64_t *)&(per_cpu(rds_stats, cpu));
124 sum = (uint64_t *)&stats;
125 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
126 *(sum++) += *(src++);
127 }
128
129 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
130 ARRAY_SIZE(rds_stat_names));
131 avail -= ARRAY_SIZE(rds_stat_names);
132
133trans:
134 lens->each = sizeof(struct rds_info_counter);
135 lens->nr = rds_trans_stats_info_copy(iter, avail) +
136 ARRAY_SIZE(rds_stat_names);
137}
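/*
 * Editor's illustration, not part of the patch: the byte-to-entry
 * conversion described above, using a hypothetical counter size and a
 * hypothetical 4096-byte buffer from userspace.  The real code uses
 * sizeof(struct rds_info_counter) and the 33 entries of rds_stat_names.
 */
#include <stdio.h>

int main(void)
{
	unsigned int counter_size = 40;	/* assumed sizeof(struct rds_info_counter) */
	unsigned int len = 4096;	/* bytes supplied by userspace */
	unsigned int nr_names = 33;	/* ARRAY_SIZE(rds_stat_names) above */
	unsigned int avail = len / counter_size;	/* 102 entries */

	printf("avail=%u entries; global counters %s copied\n",
	       avail, avail >= nr_names ? "are" : "are not");
	return 0;
}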
138
139void rds_stats_exit(void)
140{
141 rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
142}
143
144int __init rds_stats_init(void)
145{
146 rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
147 return 0;
148}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 000000000000..307dc5c1be15
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,122 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39static struct ctl_table_header *rds_sysctl_reg_table;
40
41static unsigned long rds_sysctl_reconnect_min = 1;
42static unsigned long rds_sysctl_reconnect_max = ~0UL;
43
44unsigned long rds_sysctl_reconnect_min_jiffies;
45unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
46
47unsigned int rds_sysctl_max_unacked_packets = 8;
48unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
49
50unsigned int rds_sysctl_ping_enable = 1;
51
52static ctl_table rds_sysctl_rds_table[] = {
53 {
54 .ctl_name = CTL_UNNUMBERED,
55 .procname = "reconnect_min_delay_ms",
56 .data = &rds_sysctl_reconnect_min_jiffies,
57 .maxlen = sizeof(unsigned long),
58 .mode = 0644,
59 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
60 .extra1 = &rds_sysctl_reconnect_min,
61 .extra2 = &rds_sysctl_reconnect_max_jiffies,
62 },
63 {
64 .ctl_name = CTL_UNNUMBERED,
65 .procname = "reconnect_max_delay_ms",
66 .data = &rds_sysctl_reconnect_max_jiffies,
67 .maxlen = sizeof(unsigned long),
68 .mode = 0644,
69 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
70 .extra1 = &rds_sysctl_reconnect_min_jiffies,
71 .extra2 = &rds_sysctl_reconnect_max,
72 },
73 {
74 .ctl_name = CTL_UNNUMBERED,
75 .procname = "max_unacked_packets",
76 .data = &rds_sysctl_max_unacked_packets,
77 .maxlen = sizeof(int),
78 .mode = 0644,
79 .proc_handler = &proc_dointvec,
80 },
81 {
82 .ctl_name = CTL_UNNUMBERED,
83 .procname = "max_unacked_bytes",
84 .data = &rds_sysctl_max_unacked_bytes,
85 .maxlen = sizeof(int),
86 .mode = 0644,
87 .proc_handler = &proc_dointvec,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "ping_enable",
92 .data = &rds_sysctl_ping_enable,
93 .maxlen = sizeof(int),
94 .mode = 0644,
95 .proc_handler = &proc_dointvec,
96 },
97 { .ctl_name = 0}
98};
99
100static struct ctl_path rds_sysctl_path[] = {
101 { .procname = "net", .ctl_name = CTL_NET, },
102 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
103 { }
104};
105
106
107void rds_sysctl_exit(void)
108{
109 if (rds_sysctl_reg_table)
110 unregister_sysctl_table(rds_sysctl_reg_table);
111}
112
113int __init rds_sysctl_init(void)
114{
115 rds_sysctl_reconnect_min = msecs_to_jiffies(1);
116 rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
117
118 rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
119 if (rds_sysctl_reg_table == NULL)
120 return -ENOMEM;
121 return 0;
122}
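/*
 * Editor's sketch, not part of the patch: the table above exposes these
 * knobs under /proc/sys/net/rds/ once the module is loaded.  A minimal
 * userspace reader (procfs mount point assumed to be /proc):
 */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/sys/net/rds/reconnect_min_delay_ms", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("reconnect_min_delay_ms = %s", buf);
	if (f)
		fclose(f);
	return 0;
}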
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 000000000000..828a1bf9ea92
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,265 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/random.h>
35
36#include "rds.h"
37
38/*
39 * All of connection management is simplified by serializing it through
40 * work queues that execute in a connection managing thread.
41 *
42 * TCP wants to send acks through sendpage() in response to data_ready(),
43 * but it needs a process context to do so.
44 *
45 * The receive paths need to allocate but can't drop packets (!) so we have
46 * a thread around to block allocating if the receive fast path sees an
47 * allocation failure.
48 */
49
50/* Grand Unified Theory of connection life cycle:
51 * At any point in time, the connection can be in one of these states:
52 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
53 *
54 * The following transitions are possible:
55 * ANY -> ERROR
56 * UP -> DISCONNECTING
57 * ERROR -> DISCONNECTING
58 * DISCONNECTING -> DOWN
59 * DOWN -> CONNECTING
60 * CONNECTING -> UP
61 *
62 * Transition to state DISCONNECTING/DOWN:
63 * - Inside the shutdown worker; synchronizes with xmit path
64 * through c_send_lock, and with connection management callbacks
65 * via c_cm_lock.
66 *
67 * For receive callbacks, we rely on the underlying transport
68 * (TCP, IB/RDMA) to provide the necessary synchronisation.
69 */
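/*
 * Editor's sketch, not part of the patch: the legal transitions listed in
 * the comment above, written out as a standalone check.  The enum names
 * are illustrative; the kernel code drives this through
 * rds_conn_transition() on conn->c_state.
 */
enum example_conn_state { S_DOWN, S_CONNECTING, S_UP, S_DISCONNECTING, S_ERROR };

static int example_transition_ok(enum example_conn_state from,
				 enum example_conn_state to)
{
	if (to == S_ERROR)		/* ANY -> ERROR */
		return 1;
	return (from == S_UP            && to == S_DISCONNECTING) ||
	       (from == S_ERROR         && to == S_DISCONNECTING) ||
	       (from == S_DISCONNECTING && to == S_DOWN)          ||
	       (from == S_DOWN          && to == S_CONNECTING)    ||
	       (from == S_CONNECTING    && to == S_UP);
}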
70struct workqueue_struct *rds_wq;
71
72void rds_connect_complete(struct rds_connection *conn)
73{
74 if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
75 printk(KERN_WARNING "%s: Cannot transition to state UP, "
76 "current state is %d\n",
77 __func__,
78 atomic_read(&conn->c_state));
79 atomic_set(&conn->c_state, RDS_CONN_ERROR);
80 queue_work(rds_wq, &conn->c_down_w);
81 return;
82 }
83
84 rdsdebug("conn %p for %pI4 to %pI4 complete\n",
85 conn, &conn->c_laddr, &conn->c_faddr);
86
87 conn->c_reconnect_jiffies = 0;
88 set_bit(0, &conn->c_map_queued);
89 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
90 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
91}
92
93/*
94 * This random exponential backoff is relied on to eventually resolve racing
95 * connects.
96 *
97 * If connect attempts race then both parties drop both connections and come
98 * here to wait for a random amount of time before trying again. Eventually
99 * the backoff range will be so much greater than the time it takes to
100 * establish a connection that one of the pair will establish the connection
101 * before the other's random delay fires.
102 *
103 * Connection attempts that arrive while a connection is already established
104 * are also considered to be racing connects. This lets a connection from
105 * a rebooted machine replace an existing stale connection before the transport
106 * notices that the connection has failed.
107 *
108 * We should *always* start with a random backoff; otherwise a broken connection
109 * will always take several iterations to be re-established.
110 */
111static void rds_queue_reconnect(struct rds_connection *conn)
112{
113 unsigned long rand;
114
115 rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
116 conn, &conn->c_laddr, &conn->c_faddr,
117 conn->c_reconnect_jiffies);
118
119 set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
120 if (conn->c_reconnect_jiffies == 0) {
121 conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
122 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
123 return;
124 }
125
126 get_random_bytes(&rand, sizeof(rand));
127 rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
128 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
129 conn, &conn->c_laddr, &conn->c_faddr);
130 queue_delayed_work(rds_wq, &conn->c_conn_w,
131 rand % conn->c_reconnect_jiffies);
132
133 conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
134 rds_sysctl_reconnect_max_jiffies);
135}
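/*
 * Editor's sketch, not part of the patch: the delay sequence produced by
 * rds_queue_reconnect() above, with hypothetical min/max ceilings of
 * 100 ms and 1000 ms (in the kernel these come from the reconnect
 * sysctls, in jiffies).
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long min_ms = 100, max_ms = 1000;	/* assumed sysctl values */
	unsigned long backoff = 0;			/* c_reconnect_jiffies analogue */
	int attempt;

	for (attempt = 1; attempt <= 5; attempt++) {
		unsigned long delay;

		if (backoff == 0) {
			backoff = min_ms;
			delay = 0;			/* first retry fires immediately */
		} else {
			delay = rand() % backoff;	/* random delay below the ceiling */
			backoff = backoff * 2 > max_ms ? max_ms : backoff * 2;
		}
		printf("attempt %d: delay %lu ms (next ceiling %lu ms)\n",
		       attempt, delay, backoff);
	}
	return 0;
}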
136
137void rds_connect_worker(struct work_struct *work)
138{
139 struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
140 int ret;
141
142 clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
143 if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
144 ret = conn->c_trans->conn_connect(conn);
145 rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
146 conn, &conn->c_laddr, &conn->c_faddr, ret);
147
148 if (ret) {
149 if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
150 rds_queue_reconnect(conn);
151 else
152 rds_conn_error(conn, "RDS: connect failed\n");
153 }
154 }
155}
156
157void rds_shutdown_worker(struct work_struct *work)
158{
159 struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
160
161 /* shut it down unless it's down already */
162 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
163 /*
164 * Quiesce the connection mgmt handlers before we start tearing
165 * things down. We don't hold the mutex for the entire
166 * duration of the shutdown operation, else we may be
167 * deadlocking with the CM handler. Instead, the CM event
168 * handler is supposed to check for state DISCONNECTING
169 */
170 mutex_lock(&conn->c_cm_lock);
171 if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
172 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
173 rds_conn_error(conn, "shutdown called in state %d\n",
174 atomic_read(&conn->c_state));
175 mutex_unlock(&conn->c_cm_lock);
176 return;
177 }
178 mutex_unlock(&conn->c_cm_lock);
179
180 mutex_lock(&conn->c_send_lock);
181 conn->c_trans->conn_shutdown(conn);
182 rds_conn_reset(conn);
183 mutex_unlock(&conn->c_send_lock);
184
185 if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
186 /* This can happen - e.g. when we're in the middle of tearing
187 * down the connection, and someone unloads the rds module.
188 * Quite reproducible with loopback connections.
189 * Mostly harmless.
190 */
191 rds_conn_error(conn,
192 "%s: failed to transition to state DOWN, "
193 "current state is %d\n",
194 __func__,
195 atomic_read(&conn->c_state));
196 return;
197 }
198 }
199
200 /* Then reconnect if it's still live.
201 * The passive side of an IB loopback connection is never added
202 * to the conn hash, so we never trigger a reconnect on this
203 * conn - the reconnect is always triggered by the active peer. */
204 cancel_delayed_work(&conn->c_conn_w);
205 if (!hlist_unhashed(&conn->c_hash_node))
206 rds_queue_reconnect(conn);
207}
208
209void rds_send_worker(struct work_struct *work)
210{
211 struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
212 int ret;
213
214 if (rds_conn_state(conn) == RDS_CONN_UP) {
215 ret = rds_send_xmit(conn);
216 rdsdebug("conn %p ret %d\n", conn, ret);
217 switch (ret) {
218 case -EAGAIN:
219 rds_stats_inc(s_send_immediate_retry);
220 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
221 break;
222 case -ENOMEM:
223 rds_stats_inc(s_send_delayed_retry);
224 queue_delayed_work(rds_wq, &conn->c_send_w, 2);
225 default:
226 break;
227 }
228 }
229}
230
231void rds_recv_worker(struct work_struct *work)
232{
233 struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
234 int ret;
235
236 if (rds_conn_state(conn) == RDS_CONN_UP) {
237 ret = conn->c_trans->recv(conn);
238 rdsdebug("conn %p ret %d\n", conn, ret);
239 switch (ret) {
240 case -EAGAIN:
241 rds_stats_inc(s_recv_immediate_retry);
242 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
243 break;
244 case -ENOMEM:
245 rds_stats_inc(s_recv_delayed_retry);
246 queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
247 default:
248 break;
249 }
250 }
251}
252
253void rds_threads_exit(void)
254{
255 destroy_workqueue(rds_wq);
256}
257
258int __init rds_threads_init(void)
259{
260 rds_wq = create_singlethread_workqueue("krdsd");
261 if (rds_wq == NULL)
262 return -ENOMEM;
263
264 return 0;
265}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 000000000000..767da61ad2f3
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,117 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/module.h>
35#include <linux/in.h>
36
37#include "rds.h"
38#include "loop.h"
39
40static LIST_HEAD(rds_transports);
41static DECLARE_RWSEM(rds_trans_sem);
42
43int rds_trans_register(struct rds_transport *trans)
44{
45 BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
46
47 down_write(&rds_trans_sem);
48
49 list_add_tail(&trans->t_item, &rds_transports);
50 printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
51
52 up_write(&rds_trans_sem);
53
54 return 0;
55}
56
57void rds_trans_unregister(struct rds_transport *trans)
58{
59 down_write(&rds_trans_sem);
60
61 list_del_init(&trans->t_item);
62 printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
63
64 up_write(&rds_trans_sem);
65}
66
67struct rds_transport *rds_trans_get_preferred(__be32 addr)
68{
69 struct rds_transport *trans;
70 struct rds_transport *ret = NULL;
71
72 if (IN_LOOPBACK(ntohl(addr)))
73 return &rds_loop_transport;
74
75 down_read(&rds_trans_sem);
76 list_for_each_entry(trans, &rds_transports, t_item) {
77 if (trans->laddr_check(addr) == 0) {
78 ret = trans;
79 break;
80 }
81 }
82 up_read(&rds_trans_sem);
83
84 return ret;
85}
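/*
 * Editor's sketch, not part of the patch: how a transport hooks into the
 * selection loop above.  Only fields referenced in this file are shown;
 * the full struct rds_transport lives in rds.h, so this assumes it is
 * compiled inside net/rds/.  The specific errno is an assumption -- the
 * loop above only tests laddr_check() for zero.
 */
static int example_laddr_check(__be32 addr)
{
	/* claim only 127.0.0.0/8 for this toy transport; 0 means "mine" */
	return IN_LOOPBACK(ntohl(addr)) ? 0 : -EADDRNOTAVAIL;
}

static struct rds_transport example_transport = {
	.t_name		= "example",
	.laddr_check	= example_laddr_check,
};

/* A transport module would then call rds_trans_register(&example_transport)
 * from its init function and rds_trans_unregister() on exit. */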
86
87/*
88 * This returns the number of stats entries in the snapshot and only
89 * copies them using the iter if there is enough space for them. The
90 * caller passes in the global stats so that we can size and copy while
91 * holding the lock.
92 */
93unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
94 unsigned int avail)
95
96{
97 struct rds_transport *trans;
98 unsigned int total = 0;
99 unsigned int part;
100
101 rds_info_iter_unmap(iter);
102 down_read(&rds_trans_sem);
103
104 list_for_each_entry(trans, &rds_transports, t_item) {
105 if (trans->stats_info_copy == NULL)
106 continue;
107
108 part = trans->stats_info_copy(iter, avail);
109 avail -= min(avail, part);
110 total += part;
111 }
112
113 up_read(&rds_trans_sem);
114
115 return total;
116}
117