author | Ingo Molnar <mingo@elte.hu> | 2009-03-30 17:53:32 -0400
---|---|---
committer | Ingo Molnar <mingo@elte.hu> | 2009-03-30 17:53:32 -0400
commit | 65fb0d23fcddd8697c871047b700c78817bdaa43 (patch) |
tree | 119e6e5f276622c4c862f6c9b6d795264ba1603a /net/rds |
parent | 8c083f081d0014057901c68a0a3e0f8ca7ac8d23 (diff) |
parent | dfbbe89e197a77f2c8046a51c74e33e35f878080 (diff) |
Merge branch 'linus' into cpumask-for-linus

Conflicts:
	arch/x86/kernel/cpu/common.c
Diffstat (limited to 'net/rds')
41 files changed, 15494 insertions, 0 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 000000000000..796773b5df9b
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,14 @@
1 | |||
2 | config RDS | ||
3 | tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)" | ||
4 | depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL | ||
5 | depends on INFINIBAND && INFINIBAND_ADDR_TRANS | ||
6 | ---help--- | ||
7 | RDS provides reliable, sequenced delivery of datagrams | ||
8 | over Infiniband. | ||
9 | |||
10 | config RDS_DEBUG | ||
11 | bool "Debugging messages" | ||
12 | depends on RDS | ||
13 | default n | ||
14 | |||
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 000000000000..51f27585fa08
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,14 @@
1 | obj-$(CONFIG_RDS) += rds.o | ||
2 | rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ | ||
3 | recv.o send.o stats.o sysctl.o threads.o transport.o \ | ||
4 | loop.o page.o rdma.o \ | ||
5 | rdma_transport.o \ | ||
6 | ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ | ||
7 | ib_sysctl.o ib_rdma.o \ | ||
8 | iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ | ||
9 | iw_sysctl.o iw_rdma.o | ||
10 | |||
11 | ifeq ($(CONFIG_RDS_DEBUG), y) | ||
12 | EXTRA_CFLAGS += -DDEBUG | ||
13 | endif | ||
14 | |||
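
The Makefile above adds `-DDEBUG` to the build when `CONFIG_RDS_DEBUG=y`, which is what gates the `rdsdebug()` tracing calls used throughout the files below. Since rds.h is not part of this excerpt, the following is only a sketch of how such a DEBUG-gated trace macro is typically wired up; `example_rdsdebug()` is a hypothetical name used purely for illustration.

```c
/*
 * Illustrative sketch only: rds.h is not shown in this diff, so this assumes
 * the usual pattern for a DEBUG-gated trace macro once the Makefile has added
 * -DDEBUG for CONFIG_RDS_DEBUG=y. example_rdsdebug() is a made-up name.
 */
#include <linux/kernel.h>

#ifdef DEBUG
#define example_rdsdebug(fmt, args...) \
	printk(KERN_DEBUG "%s(): " fmt, __func__, ##args)
#else
/* Compiles away in non-debug builds but still type-checks the arguments. */
#define example_rdsdebug(fmt, args...) \
	do { if (0) printk(KERN_DEBUG fmt, ##args); } while (0)
#endif
```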
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 000000000000..20cf16fc572f
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,586 @@
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/module.h> | ||
34 | #include <linux/errno.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/in.h> | ||
37 | #include <linux/poll.h> | ||
38 | #include <linux/version.h> | ||
39 | #include <net/sock.h> | ||
40 | |||
41 | #include "rds.h" | ||
42 | #include "rdma.h" | ||
43 | #include "rdma_transport.h" | ||
44 | |||
45 | /* this is just used for stats gathering :/ */ | ||
46 | static DEFINE_SPINLOCK(rds_sock_lock); | ||
47 | static unsigned long rds_sock_count; | ||
48 | static LIST_HEAD(rds_sock_list); | ||
49 | DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); | ||
50 | |||
51 | /* | ||
52 | * This is called as the final descriptor referencing this socket is closed. | ||
53 | * We have to unbind the socket so that another socket can be bound to the | ||
54 | * address it was using. | ||
55 | * | ||
56 | * We have to be careful about racing with the incoming path. sock_orphan() | ||
57 | * sets SOCK_DEAD and we use that as an indicator to the rx path that new | ||
58 | * messages shouldn't be queued. | ||
59 | */ | ||
60 | static int rds_release(struct socket *sock) | ||
61 | { | ||
62 | struct sock *sk = sock->sk; | ||
63 | struct rds_sock *rs; | ||
64 | unsigned long flags; | ||
65 | |||
66 | if (sk == NULL) | ||
67 | goto out; | ||
68 | |||
69 | rs = rds_sk_to_rs(sk); | ||
70 | |||
71 | sock_orphan(sk); | ||
72 | /* Note - rds_clear_recv_queue grabs rs_recv_lock, so | ||
73 | * that ensures the recv path has completed messing | ||
74 | * with the socket. */ | ||
75 | rds_clear_recv_queue(rs); | ||
76 | rds_cong_remove_socket(rs); | ||
77 | rds_remove_bound(rs); | ||
78 | rds_send_drop_to(rs, NULL); | ||
79 | rds_rdma_drop_keys(rs); | ||
80 | rds_notify_queue_get(rs, NULL); | ||
81 | |||
82 | spin_lock_irqsave(&rds_sock_lock, flags); | ||
83 | list_del_init(&rs->rs_item); | ||
84 | rds_sock_count--; | ||
85 | spin_unlock_irqrestore(&rds_sock_lock, flags); | ||
86 | |||
87 | sock->sk = NULL; | ||
88 | sock_put(sk); | ||
89 | out: | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. | ||
95 | * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK | ||
96 | * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but | ||
97 | * this seems more conservative. | ||
98 | * NB - normally, one would use sk_callback_lock for this, but we can | ||
99 | * get here from interrupts, whereas the network code grabs sk_callback_lock | ||
100 | * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. | ||
101 | */ | ||
102 | void rds_wake_sk_sleep(struct rds_sock *rs) | ||
103 | { | ||
104 | unsigned long flags; | ||
105 | |||
106 | read_lock_irqsave(&rs->rs_recv_lock, flags); | ||
107 | __rds_wake_sk_sleep(rds_rs_to_sk(rs)); | ||
108 | read_unlock_irqrestore(&rs->rs_recv_lock, flags); | ||
109 | } | ||
110 | |||
111 | static int rds_getname(struct socket *sock, struct sockaddr *uaddr, | ||
112 | int *uaddr_len, int peer) | ||
113 | { | ||
114 | struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; | ||
115 | struct rds_sock *rs = rds_sk_to_rs(sock->sk); | ||
116 | |||
117 | memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); | ||
118 | |||
119 | /* racey, don't care */ | ||
120 | if (peer) { | ||
121 | if (!rs->rs_conn_addr) | ||
122 | return -ENOTCONN; | ||
123 | |||
124 | sin->sin_port = rs->rs_conn_port; | ||
125 | sin->sin_addr.s_addr = rs->rs_conn_addr; | ||
126 | } else { | ||
127 | sin->sin_port = rs->rs_bound_port; | ||
128 | sin->sin_addr.s_addr = rs->rs_bound_addr; | ||
129 | } | ||
130 | |||
131 | sin->sin_family = AF_INET; | ||
132 | |||
133 | *uaddr_len = sizeof(*sin); | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * RDS' poll is without a doubt the least intuitive part of the interface, | ||
139 | * as POLLIN and POLLOUT do not behave entirely as you would expect from | ||
140 | * a network protocol. | ||
141 | * | ||
142 | * POLLIN is asserted if | ||
143 | * - there is data on the receive queue. | ||
144 | * - to signal that a previously congested destination may have become | ||
145 | * uncongested | ||
146 | * - A notification has been queued to the socket (this can be a congestion | ||
147 | * update, or a RDMA completion). | ||
148 | * | ||
149 | * POLLOUT is asserted if there is room on the send queue. This does not mean | ||
150 | * however, that the next sendmsg() call will succeed. If the application tries | ||
151 | * to send to a congested destination, the system call may still fail (and | ||
152 | * return ENOBUFS). | ||
153 | */ | ||
154 | static unsigned int rds_poll(struct file *file, struct socket *sock, | ||
155 | poll_table *wait) | ||
156 | { | ||
157 | struct sock *sk = sock->sk; | ||
158 | struct rds_sock *rs = rds_sk_to_rs(sk); | ||
159 | unsigned int mask = 0; | ||
160 | unsigned long flags; | ||
161 | |||
162 | poll_wait(file, sk->sk_sleep, wait); | ||
163 | |||
164 | poll_wait(file, &rds_poll_waitq, wait); | ||
165 | |||
166 | read_lock_irqsave(&rs->rs_recv_lock, flags); | ||
167 | if (!rs->rs_cong_monitor) { | ||
168 | /* When a congestion map was updated, we signal POLLIN for | ||
169 | * "historical" reasons. Applications can also poll for | ||
170 | * WRBAND instead. */ | ||
171 | if (rds_cong_updated_since(&rs->rs_cong_track)) | ||
172 | mask |= (POLLIN | POLLRDNORM | POLLWRBAND); | ||
173 | } else { | ||
174 | spin_lock(&rs->rs_lock); | ||
175 | if (rs->rs_cong_notify) | ||
176 | mask |= (POLLIN | POLLRDNORM); | ||
177 | spin_unlock(&rs->rs_lock); | ||
178 | } | ||
179 | if (!list_empty(&rs->rs_recv_queue) | ||
180 | || !list_empty(&rs->rs_notify_queue)) | ||
181 | mask |= (POLLIN | POLLRDNORM); | ||
182 | if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) | ||
183 | mask |= (POLLOUT | POLLWRNORM); | ||
184 | read_unlock_irqrestore(&rs->rs_recv_lock, flags); | ||
185 | |||
186 | return mask; | ||
187 | } | ||
188 | |||
189 | static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) | ||
190 | { | ||
191 | return -ENOIOCTLCMD; | ||
192 | } | ||
193 | |||
194 | static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, | ||
195 | int len) | ||
196 | { | ||
197 | struct sockaddr_in sin; | ||
198 | int ret = 0; | ||
199 | |||
200 | /* racing with another thread binding seems ok here */ | ||
201 | if (rs->rs_bound_addr == 0) { | ||
202 | ret = -ENOTCONN; /* XXX not a great errno */ | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | if (len < sizeof(struct sockaddr_in)) { | ||
207 | ret = -EINVAL; | ||
208 | goto out; | ||
209 | } | ||
210 | |||
211 | if (copy_from_user(&sin, optval, sizeof(sin))) { | ||
212 | ret = -EFAULT; | ||
213 | goto out; | ||
214 | } | ||
215 | |||
216 | rds_send_drop_to(rs, &sin); | ||
217 | out: | ||
218 | return ret; | ||
219 | } | ||
220 | |||
221 | static int rds_set_bool_option(unsigned char *optvar, char __user *optval, | ||
222 | int optlen) | ||
223 | { | ||
224 | int value; | ||
225 | |||
226 | if (optlen < sizeof(int)) | ||
227 | return -EINVAL; | ||
228 | if (get_user(value, (int __user *) optval)) | ||
229 | return -EFAULT; | ||
230 | *optvar = !!value; | ||
231 | return 0; | ||
232 | } | ||
233 | |||
234 | static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, | ||
235 | int optlen) | ||
236 | { | ||
237 | int ret; | ||
238 | |||
239 | ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); | ||
240 | if (ret == 0) { | ||
241 | if (rs->rs_cong_monitor) { | ||
242 | rds_cong_add_socket(rs); | ||
243 | } else { | ||
244 | rds_cong_remove_socket(rs); | ||
245 | rs->rs_cong_mask = 0; | ||
246 | rs->rs_cong_notify = 0; | ||
247 | } | ||
248 | } | ||
249 | return ret; | ||
250 | } | ||
251 | |||
252 | static int rds_setsockopt(struct socket *sock, int level, int optname, | ||
253 | char __user *optval, int optlen) | ||
254 | { | ||
255 | struct rds_sock *rs = rds_sk_to_rs(sock->sk); | ||
256 | int ret; | ||
257 | |||
258 | if (level != SOL_RDS) { | ||
259 | ret = -ENOPROTOOPT; | ||
260 | goto out; | ||
261 | } | ||
262 | |||
263 | switch (optname) { | ||
264 | case RDS_CANCEL_SENT_TO: | ||
265 | ret = rds_cancel_sent_to(rs, optval, optlen); | ||
266 | break; | ||
267 | case RDS_GET_MR: | ||
268 | ret = rds_get_mr(rs, optval, optlen); | ||
269 | break; | ||
270 | case RDS_FREE_MR: | ||
271 | ret = rds_free_mr(rs, optval, optlen); | ||
272 | break; | ||
273 | case RDS_RECVERR: | ||
274 | ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); | ||
275 | break; | ||
276 | case RDS_CONG_MONITOR: | ||
277 | ret = rds_cong_monitor(rs, optval, optlen); | ||
278 | break; | ||
279 | default: | ||
280 | ret = -ENOPROTOOPT; | ||
281 | } | ||
282 | out: | ||
283 | return ret; | ||
284 | } | ||
285 | |||
286 | static int rds_getsockopt(struct socket *sock, int level, int optname, | ||
287 | char __user *optval, int __user *optlen) | ||
288 | { | ||
289 | struct rds_sock *rs = rds_sk_to_rs(sock->sk); | ||
290 | int ret = -ENOPROTOOPT, len; | ||
291 | |||
292 | if (level != SOL_RDS) | ||
293 | goto out; | ||
294 | |||
295 | if (get_user(len, optlen)) { | ||
296 | ret = -EFAULT; | ||
297 | goto out; | ||
298 | } | ||
299 | |||
300 | switch (optname) { | ||
301 | case RDS_INFO_FIRST ... RDS_INFO_LAST: | ||
302 | ret = rds_info_getsockopt(sock, optname, optval, | ||
303 | optlen); | ||
304 | break; | ||
305 | |||
306 | case RDS_RECVERR: | ||
307 | if (len < sizeof(int)) | ||
308 | ret = -EINVAL; | ||
309 | else | ||
310 | if (put_user(rs->rs_recverr, (int __user *) optval) | ||
311 | || put_user(sizeof(int), optlen)) | ||
312 | ret = -EFAULT; | ||
313 | else | ||
314 | ret = 0; | ||
315 | break; | ||
316 | default: | ||
317 | break; | ||
318 | } | ||
319 | |||
320 | out: | ||
321 | return ret; | ||
322 | |||
323 | } | ||
324 | |||
325 | static int rds_connect(struct socket *sock, struct sockaddr *uaddr, | ||
326 | int addr_len, int flags) | ||
327 | { | ||
328 | struct sock *sk = sock->sk; | ||
329 | struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; | ||
330 | struct rds_sock *rs = rds_sk_to_rs(sk); | ||
331 | int ret = 0; | ||
332 | |||
333 | lock_sock(sk); | ||
334 | |||
335 | if (addr_len != sizeof(struct sockaddr_in)) { | ||
336 | ret = -EINVAL; | ||
337 | goto out; | ||
338 | } | ||
339 | |||
340 | if (sin->sin_family != AF_INET) { | ||
341 | ret = -EAFNOSUPPORT; | ||
342 | goto out; | ||
343 | } | ||
344 | |||
345 | if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { | ||
346 | ret = -EDESTADDRREQ; | ||
347 | goto out; | ||
348 | } | ||
349 | |||
350 | rs->rs_conn_addr = sin->sin_addr.s_addr; | ||
351 | rs->rs_conn_port = sin->sin_port; | ||
352 | |||
353 | out: | ||
354 | release_sock(sk); | ||
355 | return ret; | ||
356 | } | ||
357 | |||
358 | static struct proto rds_proto = { | ||
359 | .name = "RDS", | ||
360 | .owner = THIS_MODULE, | ||
361 | .obj_size = sizeof(struct rds_sock), | ||
362 | }; | ||
363 | |||
364 | static struct proto_ops rds_proto_ops = { | ||
365 | .family = AF_RDS, | ||
366 | .owner = THIS_MODULE, | ||
367 | .release = rds_release, | ||
368 | .bind = rds_bind, | ||
369 | .connect = rds_connect, | ||
370 | .socketpair = sock_no_socketpair, | ||
371 | .accept = sock_no_accept, | ||
372 | .getname = rds_getname, | ||
373 | .poll = rds_poll, | ||
374 | .ioctl = rds_ioctl, | ||
375 | .listen = sock_no_listen, | ||
376 | .shutdown = sock_no_shutdown, | ||
377 | .setsockopt = rds_setsockopt, | ||
378 | .getsockopt = rds_getsockopt, | ||
379 | .sendmsg = rds_sendmsg, | ||
380 | .recvmsg = rds_recvmsg, | ||
381 | .mmap = sock_no_mmap, | ||
382 | .sendpage = sock_no_sendpage, | ||
383 | }; | ||
384 | |||
385 | static int __rds_create(struct socket *sock, struct sock *sk, int protocol) | ||
386 | { | ||
387 | unsigned long flags; | ||
388 | struct rds_sock *rs; | ||
389 | |||
390 | sock_init_data(sock, sk); | ||
391 | sock->ops = &rds_proto_ops; | ||
392 | sk->sk_protocol = protocol; | ||
393 | |||
394 | rs = rds_sk_to_rs(sk); | ||
395 | spin_lock_init(&rs->rs_lock); | ||
396 | rwlock_init(&rs->rs_recv_lock); | ||
397 | INIT_LIST_HEAD(&rs->rs_send_queue); | ||
398 | INIT_LIST_HEAD(&rs->rs_recv_queue); | ||
399 | INIT_LIST_HEAD(&rs->rs_notify_queue); | ||
400 | INIT_LIST_HEAD(&rs->rs_cong_list); | ||
401 | spin_lock_init(&rs->rs_rdma_lock); | ||
402 | rs->rs_rdma_keys = RB_ROOT; | ||
403 | |||
404 | spin_lock_irqsave(&rds_sock_lock, flags); | ||
405 | list_add_tail(&rs->rs_item, &rds_sock_list); | ||
406 | rds_sock_count++; | ||
407 | spin_unlock_irqrestore(&rds_sock_lock, flags); | ||
408 | |||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | static int rds_create(struct net *net, struct socket *sock, int protocol) | ||
413 | { | ||
414 | struct sock *sk; | ||
415 | |||
416 | if (sock->type != SOCK_SEQPACKET || protocol) | ||
417 | return -ESOCKTNOSUPPORT; | ||
418 | |||
419 | sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); | ||
420 | if (!sk) | ||
421 | return -ENOMEM; | ||
422 | |||
423 | return __rds_create(sock, sk, protocol); | ||
424 | } | ||
425 | |||
426 | void rds_sock_addref(struct rds_sock *rs) | ||
427 | { | ||
428 | sock_hold(rds_rs_to_sk(rs)); | ||
429 | } | ||
430 | |||
431 | void rds_sock_put(struct rds_sock *rs) | ||
432 | { | ||
433 | sock_put(rds_rs_to_sk(rs)); | ||
434 | } | ||
435 | |||
436 | static struct net_proto_family rds_family_ops = { | ||
437 | .family = AF_RDS, | ||
438 | .create = rds_create, | ||
439 | .owner = THIS_MODULE, | ||
440 | }; | ||
441 | |||
442 | static void rds_sock_inc_info(struct socket *sock, unsigned int len, | ||
443 | struct rds_info_iterator *iter, | ||
444 | struct rds_info_lengths *lens) | ||
445 | { | ||
446 | struct rds_sock *rs; | ||
447 | struct sock *sk; | ||
448 | struct rds_incoming *inc; | ||
449 | unsigned long flags; | ||
450 | unsigned int total = 0; | ||
451 | |||
452 | len /= sizeof(struct rds_info_message); | ||
453 | |||
454 | spin_lock_irqsave(&rds_sock_lock, flags); | ||
455 | |||
456 | list_for_each_entry(rs, &rds_sock_list, rs_item) { | ||
457 | sk = rds_rs_to_sk(rs); | ||
458 | read_lock(&rs->rs_recv_lock); | ||
459 | |||
460 | /* XXX too lazy to maintain counts.. */ | ||
461 | list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { | ||
462 | total++; | ||
463 | if (total <= len) | ||
464 | rds_inc_info_copy(inc, iter, inc->i_saddr, | ||
465 | rs->rs_bound_addr, 1); | ||
466 | } | ||
467 | |||
468 | read_unlock(&rs->rs_recv_lock); | ||
469 | } | ||
470 | |||
471 | spin_unlock_irqrestore(&rds_sock_lock, flags); | ||
472 | |||
473 | lens->nr = total; | ||
474 | lens->each = sizeof(struct rds_info_message); | ||
475 | } | ||
476 | |||
477 | static void rds_sock_info(struct socket *sock, unsigned int len, | ||
478 | struct rds_info_iterator *iter, | ||
479 | struct rds_info_lengths *lens) | ||
480 | { | ||
481 | struct rds_info_socket sinfo; | ||
482 | struct rds_sock *rs; | ||
483 | unsigned long flags; | ||
484 | |||
485 | len /= sizeof(struct rds_info_socket); | ||
486 | |||
487 | spin_lock_irqsave(&rds_sock_lock, flags); | ||
488 | |||
489 | if (len < rds_sock_count) | ||
490 | goto out; | ||
491 | |||
492 | list_for_each_entry(rs, &rds_sock_list, rs_item) { | ||
493 | sinfo.sndbuf = rds_sk_sndbuf(rs); | ||
494 | sinfo.rcvbuf = rds_sk_rcvbuf(rs); | ||
495 | sinfo.bound_addr = rs->rs_bound_addr; | ||
496 | sinfo.connected_addr = rs->rs_conn_addr; | ||
497 | sinfo.bound_port = rs->rs_bound_port; | ||
498 | sinfo.connected_port = rs->rs_conn_port; | ||
499 | sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); | ||
500 | |||
501 | rds_info_copy(iter, &sinfo, sizeof(sinfo)); | ||
502 | } | ||
503 | |||
504 | out: | ||
505 | lens->nr = rds_sock_count; | ||
506 | lens->each = sizeof(struct rds_info_socket); | ||
507 | |||
508 | spin_unlock_irqrestore(&rds_sock_lock, flags); | ||
509 | } | ||
510 | |||
511 | static void __exit rds_exit(void) | ||
512 | { | ||
513 | rds_rdma_exit(); | ||
514 | sock_unregister(rds_family_ops.family); | ||
515 | proto_unregister(&rds_proto); | ||
516 | rds_conn_exit(); | ||
517 | rds_cong_exit(); | ||
518 | rds_sysctl_exit(); | ||
519 | rds_threads_exit(); | ||
520 | rds_stats_exit(); | ||
521 | rds_page_exit(); | ||
522 | rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); | ||
523 | rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); | ||
524 | } | ||
525 | module_exit(rds_exit); | ||
526 | |||
527 | static int __init rds_init(void) | ||
528 | { | ||
529 | int ret; | ||
530 | |||
531 | ret = rds_conn_init(); | ||
532 | if (ret) | ||
533 | goto out; | ||
534 | ret = rds_threads_init(); | ||
535 | if (ret) | ||
536 | goto out_conn; | ||
537 | ret = rds_sysctl_init(); | ||
538 | if (ret) | ||
539 | goto out_threads; | ||
540 | ret = rds_stats_init(); | ||
541 | if (ret) | ||
542 | goto out_sysctl; | ||
543 | ret = proto_register(&rds_proto, 1); | ||
544 | if (ret) | ||
545 | goto out_stats; | ||
546 | ret = sock_register(&rds_family_ops); | ||
547 | if (ret) | ||
548 | goto out_proto; | ||
549 | |||
550 | rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); | ||
551 | rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); | ||
552 | |||
553 | /* ib/iwarp transports currently compiled-in */ | ||
554 | ret = rds_rdma_init(); | ||
555 | if (ret) | ||
556 | goto out_sock; | ||
557 | goto out; | ||
558 | |||
559 | out_sock: | ||
560 | sock_unregister(rds_family_ops.family); | ||
561 | out_proto: | ||
562 | proto_unregister(&rds_proto); | ||
563 | out_stats: | ||
564 | rds_stats_exit(); | ||
565 | out_sysctl: | ||
566 | rds_sysctl_exit(); | ||
567 | out_threads: | ||
568 | rds_threads_exit(); | ||
569 | out_conn: | ||
570 | rds_conn_exit(); | ||
571 | rds_cong_exit(); | ||
572 | rds_page_exit(); | ||
573 | out: | ||
574 | return ret; | ||
575 | } | ||
576 | module_init(rds_init); | ||
577 | |||
578 | #define DRV_VERSION "4.0" | ||
579 | #define DRV_RELDATE "Feb 12, 2009" | ||
580 | |||
581 | MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); | ||
582 | MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" | ||
583 | " v" DRV_VERSION " (" DRV_RELDATE ")"); | ||
584 | MODULE_VERSION(DRV_VERSION); | ||
585 | MODULE_LICENSE("Dual BSD/GPL"); | ||
586 | MODULE_ALIAS_NETPROTO(PF_RDS); | ||
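
Taken together, af_rds.c implements a fairly small PF_RDS socket API. Below is a minimal userspace sketch of that API, assuming a kernel carrying this patch; `AF_RDS` is defined by hand in case the installed headers predate it (21 is an assumption), and the addresses and port are placeholders.

```c
/*
 * Minimal userspace sketch of the socket API implemented by af_rds.c above.
 * Assumes a kernel with this patch; AF_RDS is defined by hand in case the
 * installed headers predate it (21 is an assumption), and the addresses and
 * port are placeholders.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>

#ifndef AF_RDS
#define AF_RDS 21
#endif

int main(void)
{
	struct sockaddr_in laddr = { 0 }, faddr = { 0 };
	char buf[] = "hello";
	int fd;

	fd = socket(AF_RDS, SOCK_SEQPACKET, 0);	/* handled by rds_create() */
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* rds_bind() requires an explicit local IP; INADDR_ANY is rejected. */
	laddr.sin_family = AF_INET;
	laddr.sin_addr.s_addr = inet_addr("192.168.0.1");
	laddr.sin_port = htons(4000);
	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0)
		perror("bind");

	/* rds_connect() would only record a default destination, so an
	 * explicit destination passed to sendto() works just as well. */
	faddr.sin_family = AF_INET;
	faddr.sin_addr.s_addr = inet_addr("192.168.0.2");
	faddr.sin_port = htons(4000);
	if (sendto(fd, buf, sizeof(buf), 0,
		   (struct sockaddr *)&faddr, sizeof(faddr)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}
```

Note that rds_create() accepts only SOCK_SEQPACKET with protocol 0, so those are the only valid arguments to socket() here.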
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 000000000000..c17cc39160ce
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,199 @@
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <net/sock.h> | ||
35 | #include <linux/in.h> | ||
36 | #include <linux/if_arp.h> | ||
37 | #include "rds.h" | ||
38 | |||
39 | /* | ||
40 | * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't | ||
41 | * particularly zippy. | ||
42 | * | ||
43 | * This is now called for every incoming frame so we arguably care much more | ||
44 | * about it than we used to. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(rds_bind_lock); | ||
47 | static struct rb_root rds_bind_tree = RB_ROOT; | ||
48 | |||
49 | static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, | ||
50 | struct rds_sock *insert) | ||
51 | { | ||
52 | struct rb_node **p = &rds_bind_tree.rb_node; | ||
53 | struct rb_node *parent = NULL; | ||
54 | struct rds_sock *rs; | ||
55 | u64 cmp; | ||
56 | u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); | ||
57 | |||
58 | while (*p) { | ||
59 | parent = *p; | ||
60 | rs = rb_entry(parent, struct rds_sock, rs_bound_node); | ||
61 | |||
62 | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | | ||
63 | be16_to_cpu(rs->rs_bound_port); | ||
64 | |||
65 | if (needle < cmp) | ||
66 | p = &(*p)->rb_left; | ||
67 | else if (needle > cmp) | ||
68 | p = &(*p)->rb_right; | ||
69 | else | ||
70 | return rs; | ||
71 | } | ||
72 | |||
73 | if (insert) { | ||
74 | rb_link_node(&insert->rs_bound_node, parent, p); | ||
75 | rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); | ||
76 | } | ||
77 | return NULL; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Return the rds_sock bound at the given local address. | ||
82 | * | ||
83 | * The rx path can race with rds_release. We notice if rds_release() has | ||
84 | * marked this socket and don't return a rs ref to the rx path. | ||
85 | */ | ||
86 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port) | ||
87 | { | ||
88 | struct rds_sock *rs; | ||
89 | unsigned long flags; | ||
90 | |||
91 | spin_lock_irqsave(&rds_bind_lock, flags); | ||
92 | rs = rds_bind_tree_walk(addr, port, NULL); | ||
93 | if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) | ||
94 | rds_sock_addref(rs); | ||
95 | else | ||
96 | rs = NULL; | ||
97 | spin_unlock_irqrestore(&rds_bind_lock, flags); | ||
98 | |||
99 | rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, | ||
100 | ntohs(port)); | ||
101 | return rs; | ||
102 | } | ||
103 | |||
104 | /* returns -ve errno or +ve port */ | ||
105 | static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) | ||
106 | { | ||
107 | unsigned long flags; | ||
108 | int ret = -EADDRINUSE; | ||
109 | u16 rover, last; | ||
110 | |||
111 | if (*port != 0) { | ||
112 | rover = be16_to_cpu(*port); | ||
113 | last = rover; | ||
114 | } else { | ||
115 | rover = max_t(u16, net_random(), 2); | ||
116 | last = rover - 1; | ||
117 | } | ||
118 | |||
119 | spin_lock_irqsave(&rds_bind_lock, flags); | ||
120 | |||
121 | do { | ||
122 | if (rover == 0) | ||
123 | rover++; | ||
124 | if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { | ||
125 | *port = cpu_to_be16(rover); | ||
126 | ret = 0; | ||
127 | break; | ||
128 | } | ||
129 | } while (rover++ != last); | ||
130 | |||
131 | if (ret == 0) { | ||
132 | rs->rs_bound_addr = addr; | ||
133 | rs->rs_bound_port = *port; | ||
134 | rds_sock_addref(rs); | ||
135 | |||
136 | rdsdebug("rs %p binding to %pI4:%d\n", | ||
137 | rs, &addr, (int)ntohs(*port)); | ||
138 | } | ||
139 | |||
140 | spin_unlock_irqrestore(&rds_bind_lock, flags); | ||
141 | |||
142 | return ret; | ||
143 | } | ||
144 | |||
145 | void rds_remove_bound(struct rds_sock *rs) | ||
146 | { | ||
147 | unsigned long flags; | ||
148 | |||
149 | spin_lock_irqsave(&rds_bind_lock, flags); | ||
150 | |||
151 | if (rs->rs_bound_addr) { | ||
152 | rdsdebug("rs %p unbinding from %pI4:%d\n", | ||
153 | rs, &rs->rs_bound_addr, | ||
154 | ntohs(rs->rs_bound_port)); | ||
155 | |||
156 | rb_erase(&rs->rs_bound_node, &rds_bind_tree); | ||
157 | rds_sock_put(rs); | ||
158 | rs->rs_bound_addr = 0; | ||
159 | } | ||
160 | |||
161 | spin_unlock_irqrestore(&rds_bind_lock, flags); | ||
162 | } | ||
163 | |||
164 | int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | ||
165 | { | ||
166 | struct sock *sk = sock->sk; | ||
167 | struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; | ||
168 | struct rds_sock *rs = rds_sk_to_rs(sk); | ||
169 | struct rds_transport *trans; | ||
170 | int ret = 0; | ||
171 | |||
172 | lock_sock(sk); | ||
173 | |||
174 | if (addr_len != sizeof(struct sockaddr_in) || | ||
175 | sin->sin_family != AF_INET || | ||
176 | rs->rs_bound_addr || | ||
177 | sin->sin_addr.s_addr == htonl(INADDR_ANY)) { | ||
178 | ret = -EINVAL; | ||
179 | goto out; | ||
180 | } | ||
181 | |||
182 | ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); | ||
183 | if (ret) | ||
184 | goto out; | ||
185 | |||
186 | trans = rds_trans_get_preferred(sin->sin_addr.s_addr); | ||
187 | if (trans == NULL) { | ||
188 | ret = -EADDRNOTAVAIL; | ||
189 | rds_remove_bound(rs); | ||
190 | goto out; | ||
191 | } | ||
192 | |||
193 | rs->rs_transport = trans; | ||
194 | ret = 0; | ||
195 | |||
196 | out: | ||
197 | release_sock(sk); | ||
198 | return ret; | ||
199 | } | ||
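
rds_bind() above delegates port selection to rds_add_bound(): binding with `sin_port == 0` makes it probe for a free nonzero port starting from a random value, and rds_getname() later reports the result via getsockname(). A hedged userspace sketch of that flow, with a placeholder local address:

```c
/*
 * Sketch of the port auto-assignment done by rds_add_bound(): bind with
 * sin_port == 0 and read the chosen port back via getsockname(), which is
 * served by rds_getname(). The local IP is a placeholder and fd is an
 * AF_RDS socket created as in the earlier example.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int rds_bind_any_port(int fd, const char *local_ip)
{
	struct sockaddr_in sin = { 0 };
	socklen_t len = sizeof(sin);

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr(local_ip);
	sin.sin_port = 0;		/* let rds_add_bound() pick a port */

	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		return -1;

	if (getsockname(fd, (struct sockaddr *)&sin, &len) < 0)
		return -1;

	printf("bound to %s:%u\n", local_ip, ntohs(sin.sin_port));
	return 0;
}
```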
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 000000000000..710e4599d76c
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,404 @@
1 | /* | ||
2 | * Copyright (c) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/types.h> | ||
34 | #include <linux/rbtree.h> | ||
35 | |||
36 | #include <asm-generic/bitops/le.h> | ||
37 | |||
38 | #include "rds.h" | ||
39 | |||
40 | /* | ||
41 | * This file implements the receive side of the unconventional congestion | ||
42 | * management in RDS. | ||
43 | * | ||
44 | * Messages waiting in the receive queue on the receiving socket are accounted | ||
45 | * against the socket's SO_RCVBUF option value. Only the payload bytes in the | ||
46 | * message are accounted for. If the number of bytes queued equals or exceeds | ||
47 | * rcvbuf then the socket is congested. All sends attempted to this socket's | ||
48 | * address should block or return -EWOULDBLOCK. | ||
49 | * | ||
50 | * Applications are expected to be reasonably tuned such that this situation | ||
51 | * very rarely occurs. An application encountering this "back-pressure" is | ||
52 | * considered a bug. | ||
53 | * | ||
54 | * This is implemented by having each node maintain bitmaps which indicate | ||
55 | * which ports on bound addresses are congested. As the bitmap changes it is | ||
56 | * sent through all the connections which terminate in the local address of the | ||
57 | * bitmap which changed. | ||
58 | * | ||
59 | * The bitmaps are allocated as connections are brought up. This avoids | ||
60 | * allocation in the interrupt handling path which queues messages on sockets. | ||
61 | * The dense bitmaps let transports send the entire bitmap on any bitmap change | ||
62 | * reasonably efficiently. This is much easier to implement than some | ||
63 | * finer-grained communication of per-port congestion. The sender does a very | ||
64 | * inexpensive bit test to check whether the port it's about to send to is | ||
65 | * or not. | ||
66 | */ | ||
67 | |||
68 | /* | ||
69 | * Interaction with poll is a tad tricky. We want all processes stuck in | ||
70 | * poll to wake up and check whether a congested destination became uncongested. | ||
71 | * The really sad thing is we have no idea which destinations the application | ||
72 | * wants to send to - we don't even know which rds_connections are involved. | ||
73 | * So until we implement a more flexible rds poll interface, we have to make | ||
74 | * do with this: | ||
75 | * We maintain a global counter that is incremented each time a congestion map | ||
76 | * update is received. Each rds socket tracks this value, and if rds_poll | ||
77 | * finds that the saved generation number is smaller than the global generation | ||
78 | * number, it wakes up the process. | ||
79 | */ | ||
80 | static atomic_t rds_cong_generation = ATOMIC_INIT(0); | ||
81 | |||
82 | /* | ||
83 | * Congestion monitoring | ||
84 | */ | ||
85 | static LIST_HEAD(rds_cong_monitor); | ||
86 | static DEFINE_RWLOCK(rds_cong_monitor_lock); | ||
87 | |||
88 | /* | ||
89 | * Yes, a global lock. It's used so infrequently that it's worth keeping it | ||
90 | * global to simplify the locking. It's only used in the following | ||
91 | * circumstances: | ||
92 | * | ||
93 | * - on connection buildup to associate a conn with its maps | ||
94 | * - on map changes to inform conns of a new map to send | ||
95 | * | ||
96 | * It's sadly ordered under the socket callback lock and the connection lock. | ||
97 | * Receive paths can mark ports congested from interrupt context so the | ||
98 | * lock masks interrupts. | ||
99 | */ | ||
100 | static DEFINE_SPINLOCK(rds_cong_lock); | ||
101 | static struct rb_root rds_cong_tree = RB_ROOT; | ||
102 | |||
103 | static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, | ||
104 | struct rds_cong_map *insert) | ||
105 | { | ||
106 | struct rb_node **p = &rds_cong_tree.rb_node; | ||
107 | struct rb_node *parent = NULL; | ||
108 | struct rds_cong_map *map; | ||
109 | |||
110 | while (*p) { | ||
111 | parent = *p; | ||
112 | map = rb_entry(parent, struct rds_cong_map, m_rb_node); | ||
113 | |||
114 | if (addr < map->m_addr) | ||
115 | p = &(*p)->rb_left; | ||
116 | else if (addr > map->m_addr) | ||
117 | p = &(*p)->rb_right; | ||
118 | else | ||
119 | return map; | ||
120 | } | ||
121 | |||
122 | if (insert) { | ||
123 | rb_link_node(&insert->m_rb_node, parent, p); | ||
124 | rb_insert_color(&insert->m_rb_node, &rds_cong_tree); | ||
125 | } | ||
126 | return NULL; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * There is only ever one bitmap for any address. Connections try and allocate | ||
131 | * these bitmaps in the process getting pointers to them. The bitmaps are only | ||
132 | * ever freed as the module is removed after all connections have been freed. | ||
133 | */ | ||
134 | static struct rds_cong_map *rds_cong_from_addr(__be32 addr) | ||
135 | { | ||
136 | struct rds_cong_map *map; | ||
137 | struct rds_cong_map *ret = NULL; | ||
138 | unsigned long zp; | ||
139 | unsigned long i; | ||
140 | unsigned long flags; | ||
141 | |||
142 | map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); | ||
143 | if (map == NULL) | ||
144 | return NULL; | ||
145 | |||
146 | map->m_addr = addr; | ||
147 | init_waitqueue_head(&map->m_waitq); | ||
148 | INIT_LIST_HEAD(&map->m_conn_list); | ||
149 | |||
150 | for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { | ||
151 | zp = get_zeroed_page(GFP_KERNEL); | ||
152 | if (zp == 0) | ||
153 | goto out; | ||
154 | map->m_page_addrs[i] = zp; | ||
155 | } | ||
156 | |||
157 | spin_lock_irqsave(&rds_cong_lock, flags); | ||
158 | ret = rds_cong_tree_walk(addr, map); | ||
159 | spin_unlock_irqrestore(&rds_cong_lock, flags); | ||
160 | |||
161 | if (ret == NULL) { | ||
162 | ret = map; | ||
163 | map = NULL; | ||
164 | } | ||
165 | |||
166 | out: | ||
167 | if (map) { | ||
168 | for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) | ||
169 | free_page(map->m_page_addrs[i]); | ||
170 | kfree(map); | ||
171 | } | ||
172 | |||
173 | rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); | ||
174 | |||
175 | return ret; | ||
176 | } | ||
177 | |||
178 | /* | ||
179 | * Put the conn on its local map's list. This is called when the conn is | ||
180 | * really added to the hash. It's nested under the rds_conn_lock, sadly. | ||
181 | */ | ||
182 | void rds_cong_add_conn(struct rds_connection *conn) | ||
183 | { | ||
184 | unsigned long flags; | ||
185 | |||
186 | rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); | ||
187 | spin_lock_irqsave(&rds_cong_lock, flags); | ||
188 | list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); | ||
189 | spin_unlock_irqrestore(&rds_cong_lock, flags); | ||
190 | } | ||
191 | |||
192 | void rds_cong_remove_conn(struct rds_connection *conn) | ||
193 | { | ||
194 | unsigned long flags; | ||
195 | |||
196 | rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); | ||
197 | spin_lock_irqsave(&rds_cong_lock, flags); | ||
198 | list_del_init(&conn->c_map_item); | ||
199 | spin_unlock_irqrestore(&rds_cong_lock, flags); | ||
200 | } | ||
201 | |||
202 | int rds_cong_get_maps(struct rds_connection *conn) | ||
203 | { | ||
204 | conn->c_lcong = rds_cong_from_addr(conn->c_laddr); | ||
205 | conn->c_fcong = rds_cong_from_addr(conn->c_faddr); | ||
206 | |||
207 | if (conn->c_lcong == NULL || conn->c_fcong == NULL) | ||
208 | return -ENOMEM; | ||
209 | |||
210 | return 0; | ||
211 | } | ||
212 | |||
213 | void rds_cong_queue_updates(struct rds_cong_map *map) | ||
214 | { | ||
215 | struct rds_connection *conn; | ||
216 | unsigned long flags; | ||
217 | |||
218 | spin_lock_irqsave(&rds_cong_lock, flags); | ||
219 | |||
220 | list_for_each_entry(conn, &map->m_conn_list, c_map_item) { | ||
221 | if (!test_and_set_bit(0, &conn->c_map_queued)) { | ||
222 | rds_stats_inc(s_cong_update_queued); | ||
223 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
224 | } | ||
225 | } | ||
226 | |||
227 | spin_unlock_irqrestore(&rds_cong_lock, flags); | ||
228 | } | ||
229 | |||
230 | void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) | ||
231 | { | ||
232 | rdsdebug("waking map %p for %pI4\n", | ||
233 | map, &map->m_addr); | ||
234 | rds_stats_inc(s_cong_update_received); | ||
235 | atomic_inc(&rds_cong_generation); | ||
236 | if (waitqueue_active(&map->m_waitq)) | ||
237 | wake_up(&map->m_waitq); | ||
238 | if (waitqueue_active(&rds_poll_waitq)) | ||
239 | wake_up_all(&rds_poll_waitq); | ||
240 | |||
241 | if (portmask && !list_empty(&rds_cong_monitor)) { | ||
242 | unsigned long flags; | ||
243 | struct rds_sock *rs; | ||
244 | |||
245 | read_lock_irqsave(&rds_cong_monitor_lock, flags); | ||
246 | list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { | ||
247 | spin_lock(&rs->rs_lock); | ||
248 | rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); | ||
249 | rs->rs_cong_mask &= ~portmask; | ||
250 | spin_unlock(&rs->rs_lock); | ||
251 | if (rs->rs_cong_notify) | ||
252 | rds_wake_sk_sleep(rs); | ||
253 | } | ||
254 | read_unlock_irqrestore(&rds_cong_monitor_lock, flags); | ||
255 | } | ||
256 | } | ||
257 | |||
258 | int rds_cong_updated_since(unsigned long *recent) | ||
259 | { | ||
260 | unsigned long gen = atomic_read(&rds_cong_generation); | ||
261 | |||
262 | if (likely(*recent == gen)) | ||
263 | return 0; | ||
264 | *recent = gen; | ||
265 | return 1; | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * We're called under the locking that protects the socket's receive buffer | ||
270 | * consumption. This makes it a lot easier for the caller to only call us | ||
271 | * when it knows that an existing set bit needs to be cleared, and vice versa. | ||
272 | * We can't block and we need to deal with concurrent sockets working against | ||
273 | * the same per-address map. | ||
274 | */ | ||
275 | void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) | ||
276 | { | ||
277 | unsigned long i; | ||
278 | unsigned long off; | ||
279 | |||
280 | rdsdebug("setting congestion for %pI4:%u in map %p\n", | ||
281 | &map->m_addr, ntohs(port), map); | ||
282 | |||
283 | i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; | ||
284 | off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; | ||
285 | |||
286 | generic___set_le_bit(off, (void *)map->m_page_addrs[i]); | ||
287 | } | ||
288 | |||
289 | void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) | ||
290 | { | ||
291 | unsigned long i; | ||
292 | unsigned long off; | ||
293 | |||
294 | rdsdebug("clearing congestion for %pI4:%u in map %p\n", | ||
295 | &map->m_addr, ntohs(port), map); | ||
296 | |||
297 | i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; | ||
298 | off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; | ||
299 | |||
300 | generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); | ||
301 | } | ||
302 | |||
303 | static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) | ||
304 | { | ||
305 | unsigned long i; | ||
306 | unsigned long off; | ||
307 | |||
308 | i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; | ||
309 | off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; | ||
310 | |||
311 | return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); | ||
312 | } | ||
313 | |||
314 | void rds_cong_add_socket(struct rds_sock *rs) | ||
315 | { | ||
316 | unsigned long flags; | ||
317 | |||
318 | write_lock_irqsave(&rds_cong_monitor_lock, flags); | ||
319 | if (list_empty(&rs->rs_cong_list)) | ||
320 | list_add(&rs->rs_cong_list, &rds_cong_monitor); | ||
321 | write_unlock_irqrestore(&rds_cong_monitor_lock, flags); | ||
322 | } | ||
323 | |||
324 | void rds_cong_remove_socket(struct rds_sock *rs) | ||
325 | { | ||
326 | unsigned long flags; | ||
327 | struct rds_cong_map *map; | ||
328 | |||
329 | write_lock_irqsave(&rds_cong_monitor_lock, flags); | ||
330 | list_del_init(&rs->rs_cong_list); | ||
331 | write_unlock_irqrestore(&rds_cong_monitor_lock, flags); | ||
332 | |||
333 | /* update congestion map for now-closed port */ | ||
334 | spin_lock_irqsave(&rds_cong_lock, flags); | ||
335 | map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); | ||
336 | spin_unlock_irqrestore(&rds_cong_lock, flags); | ||
337 | |||
338 | if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { | ||
339 | rds_cong_clear_bit(map, rs->rs_bound_port); | ||
340 | rds_cong_queue_updates(map); | ||
341 | } | ||
342 | } | ||
343 | |||
344 | int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, | ||
345 | struct rds_sock *rs) | ||
346 | { | ||
347 | if (!rds_cong_test_bit(map, port)) | ||
348 | return 0; | ||
349 | if (nonblock) { | ||
350 | if (rs && rs->rs_cong_monitor) { | ||
351 | unsigned long flags; | ||
352 | |||
353 | /* It would have been nice to have an atomic set_bit on | ||
354 | * a uint64_t. */ | ||
355 | spin_lock_irqsave(&rs->rs_lock, flags); | ||
356 | rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); | ||
357 | spin_unlock_irqrestore(&rs->rs_lock, flags); | ||
358 | |||
359 | /* Test again - a congestion update may have arrived in | ||
360 | * the meantime. */ | ||
361 | if (!rds_cong_test_bit(map, port)) | ||
362 | return 0; | ||
363 | } | ||
364 | rds_stats_inc(s_cong_send_error); | ||
365 | return -ENOBUFS; | ||
366 | } | ||
367 | |||
368 | rds_stats_inc(s_cong_send_blocked); | ||
369 | rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); | ||
370 | |||
371 | return wait_event_interruptible(map->m_waitq, | ||
372 | !rds_cong_test_bit(map, port)); | ||
373 | } | ||
374 | |||
375 | void rds_cong_exit(void) | ||
376 | { | ||
377 | struct rb_node *node; | ||
378 | struct rds_cong_map *map; | ||
379 | unsigned long i; | ||
380 | |||
381 | while ((node = rb_first(&rds_cong_tree))) { | ||
382 | map = rb_entry(node, struct rds_cong_map, m_rb_node); | ||
383 | rdsdebug("freeing map %p\n", map); | ||
384 | rb_erase(&map->m_rb_node, &rds_cong_tree); | ||
385 | for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) | ||
386 | free_page(map->m_page_addrs[i]); | ||
387 | kfree(map); | ||
388 | } | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | * Allocate a RDS message containing a congestion update. | ||
393 | */ | ||
394 | struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) | ||
395 | { | ||
396 | struct rds_cong_map *map = conn->c_lcong; | ||
397 | struct rds_message *rm; | ||
398 | |||
399 | rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); | ||
400 | if (!IS_ERR(rm)) | ||
401 | rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; | ||
402 | |||
403 | return rm; | ||
404 | } | ||
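
cong.c covers the receive-side bookkeeping, but together with the RDS_CONG_MONITOR handling in af_rds.c above it gives applications a way to learn when a congested destination clears. A hedged sketch of the userspace side follows; SOL_RDS and RDS_CONG_MONITOR are assumed to come from the RDS header installed alongside this patch set, which is not part of this net/rds excerpt.

```c
/*
 * Userspace sketch of the congestion monitor enabled via rds_cong_monitor().
 * SOL_RDS and RDS_CONG_MONITOR are assumed to be provided by the RDS header
 * shipped with this patch set (not part of this excerpt); fd is a bound
 * AF_RDS socket as in the earlier examples.
 */
#include <poll.h>
#include <sys/socket.h>
#include <linux/rds.h>

static int watch_congestion(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int on = 1;

	/* With the monitor on, rds_poll() asserts POLLIN only when
	 * rs_cong_notify has bits set, i.e. for ports this socket actually
	 * tried (and failed) to send to while they were congested. */
	if (setsockopt(fd, SOL_RDS, RDS_CONG_MONITOR, &on, sizeof(on)) < 0)
		return -1;

	/* rds_cong_map_updated() wakes this poll whenever a congestion map
	 * update arrives for one of the watched ports. */
	return poll(&pfd, 1, -1);
}
```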
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 000000000000..273f064930a8
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,487 @@
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/list.h> | ||
35 | #include <net/inet_hashtables.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "loop.h" | ||
39 | #include "rdma.h" | ||
40 | |||
41 | #define RDS_CONNECTION_HASH_BITS 12 | ||
42 | #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) | ||
43 | #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) | ||
44 | |||
45 | /* converting this to RCU is a chore for another day.. */ | ||
46 | static DEFINE_SPINLOCK(rds_conn_lock); | ||
47 | static unsigned long rds_conn_count; | ||
48 | static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; | ||
49 | static struct kmem_cache *rds_conn_slab; | ||
50 | |||
51 | static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) | ||
52 | { | ||
53 | /* Pass NULL, don't need struct net for hash */ | ||
54 | unsigned long hash = inet_ehashfn(NULL, | ||
55 | be32_to_cpu(laddr), 0, | ||
56 | be32_to_cpu(faddr), 0); | ||
57 | return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; | ||
58 | } | ||
59 | |||
60 | #define rds_conn_info_set(var, test, suffix) do { \ | ||
61 | if (test) \ | ||
62 | var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ | ||
63 | } while (0) | ||
64 | |||
65 | static inline int rds_conn_is_sending(struct rds_connection *conn) | ||
66 | { | ||
67 | int ret = 0; | ||
68 | |||
69 | if (!mutex_trylock(&conn->c_send_lock)) | ||
70 | ret = 1; | ||
71 | else | ||
72 | mutex_unlock(&conn->c_send_lock); | ||
73 | |||
74 | return ret; | ||
75 | } | ||
76 | |||
77 | static struct rds_connection *rds_conn_lookup(struct hlist_head *head, | ||
78 | __be32 laddr, __be32 faddr, | ||
79 | struct rds_transport *trans) | ||
80 | { | ||
81 | struct rds_connection *conn, *ret = NULL; | ||
82 | struct hlist_node *pos; | ||
83 | |||
84 | hlist_for_each_entry(conn, pos, head, c_hash_node) { | ||
85 | if (conn->c_faddr == faddr && conn->c_laddr == laddr && | ||
86 | conn->c_trans == trans) { | ||
87 | ret = conn; | ||
88 | break; | ||
89 | } | ||
90 | } | ||
91 | rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, | ||
92 | &laddr, &faddr); | ||
93 | return ret; | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * This is called by transports as they're bringing down a connection. | ||
98 | * It clears partial message state so that the transport can start sending | ||
99 | * and receiving over this connection again in the future. It is up to | ||
100 | * the transport to have serialized this call with its send and recv. | ||
101 | */ | ||
102 | void rds_conn_reset(struct rds_connection *conn) | ||
103 | { | ||
104 | rdsdebug("connection %pI4 to %pI4 reset\n", | ||
105 | &conn->c_laddr, &conn->c_faddr); | ||
106 | |||
107 | rds_stats_inc(s_conn_reset); | ||
108 | rds_send_reset(conn); | ||
109 | conn->c_flags = 0; | ||
110 | |||
111 | /* Do not clear next_rx_seq here, else we cannot distinguish | ||
112 | * retransmitted packets from new packets, and will hand all | ||
113 | * of them to the application. That is not consistent with the | ||
114 | * reliability guarantees of RDS. */ | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * There is only ever one 'conn' for a given pair of addresses in the | ||
119 | * system at a time. They contain messages to be retransmitted and so | ||
120 | * span the lifetime of the actual underlying transport connections. | ||
121 | * | ||
122 | * For now they are not garbage collected once they're created. They | ||
123 | * are torn down as the module is removed, if ever. | ||
124 | */ | ||
125 | static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | ||
126 | struct rds_transport *trans, gfp_t gfp, | ||
127 | int is_outgoing) | ||
128 | { | ||
129 | struct rds_connection *conn, *tmp, *parent = NULL; | ||
130 | struct hlist_head *head = rds_conn_bucket(laddr, faddr); | ||
131 | unsigned long flags; | ||
132 | int ret; | ||
133 | |||
134 | spin_lock_irqsave(&rds_conn_lock, flags); | ||
135 | conn = rds_conn_lookup(head, laddr, faddr, trans); | ||
136 | if (conn | ||
137 | && conn->c_loopback | ||
138 | && conn->c_trans != &rds_loop_transport | ||
139 | && !is_outgoing) { | ||
140 | /* This is a looped back IB connection, and we're | ||
141 | * called by the code handling the incoming connect. | ||
142 | * We need a second connection object into which we | ||
143 | * can stick the other QP. */ | ||
144 | parent = conn; | ||
145 | conn = parent->c_passive; | ||
146 | } | ||
147 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
148 | if (conn) | ||
149 | goto out; | ||
150 | |||
151 | conn = kmem_cache_alloc(rds_conn_slab, gfp); | ||
152 | if (conn == NULL) { | ||
153 | conn = ERR_PTR(-ENOMEM); | ||
154 | goto out; | ||
155 | } | ||
156 | |||
157 | memset(conn, 0, sizeof(*conn)); | ||
158 | |||
159 | INIT_HLIST_NODE(&conn->c_hash_node); | ||
160 | conn->c_version = RDS_PROTOCOL_3_0; | ||
161 | conn->c_laddr = laddr; | ||
162 | conn->c_faddr = faddr; | ||
163 | spin_lock_init(&conn->c_lock); | ||
164 | conn->c_next_tx_seq = 1; | ||
165 | |||
166 | mutex_init(&conn->c_send_lock); | ||
167 | INIT_LIST_HEAD(&conn->c_send_queue); | ||
168 | INIT_LIST_HEAD(&conn->c_retrans); | ||
169 | |||
170 | ret = rds_cong_get_maps(conn); | ||
171 | if (ret) { | ||
172 | kmem_cache_free(rds_conn_slab, conn); | ||
173 | conn = ERR_PTR(ret); | ||
174 | goto out; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * This is where a connection becomes loopback. If *any* RDS sockets | ||
179 | * can bind to the destination address then we'd rather the messages | ||
180 | * flow through loopback rather than either transport. | ||
181 | */ | ||
182 | if (rds_trans_get_preferred(faddr)) { | ||
183 | conn->c_loopback = 1; | ||
184 | if (is_outgoing && trans->t_prefer_loopback) { | ||
185 | /* "outgoing" connection - and the transport | ||
186 | * says it wants the connection handled by the | ||
187 | * loopback transport. This is what TCP does. | ||
188 | */ | ||
189 | trans = &rds_loop_transport; | ||
190 | } | ||
191 | } | ||
192 | |||
193 | conn->c_trans = trans; | ||
194 | |||
195 | ret = trans->conn_alloc(conn, gfp); | ||
196 | if (ret) { | ||
197 | kmem_cache_free(rds_conn_slab, conn); | ||
198 | conn = ERR_PTR(ret); | ||
199 | goto out; | ||
200 | } | ||
201 | |||
202 | atomic_set(&conn->c_state, RDS_CONN_DOWN); | ||
203 | conn->c_reconnect_jiffies = 0; | ||
204 | INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); | ||
205 | INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); | ||
206 | INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); | ||
207 | INIT_WORK(&conn->c_down_w, rds_shutdown_worker); | ||
208 | mutex_init(&conn->c_cm_lock); | ||
209 | conn->c_flags = 0; | ||
210 | |||
211 | rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", | ||
212 | conn, &laddr, &faddr, | ||
213 | trans->t_name ? trans->t_name : "[unknown]", | ||
214 | is_outgoing ? "(outgoing)" : ""); | ||
215 | |||
216 | spin_lock_irqsave(&rds_conn_lock, flags); | ||
217 | if (parent == NULL) { | ||
218 | tmp = rds_conn_lookup(head, laddr, faddr, trans); | ||
219 | if (tmp == NULL) | ||
220 | hlist_add_head(&conn->c_hash_node, head); | ||
221 | } else { | ||
222 | tmp = parent->c_passive; | ||
223 | if (!tmp) | ||
224 | parent->c_passive = conn; | ||
225 | } | ||
226 | |||
227 | if (tmp) { | ||
228 | trans->conn_free(conn->c_transport_data); | ||
229 | kmem_cache_free(rds_conn_slab, conn); | ||
230 | conn = tmp; | ||
231 | } else { | ||
232 | rds_cong_add_conn(conn); | ||
233 | rds_conn_count++; | ||
234 | } | ||
235 | |||
236 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
237 | |||
238 | out: | ||
239 | return conn; | ||
240 | } | ||
241 | |||
242 | struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, | ||
243 | struct rds_transport *trans, gfp_t gfp) | ||
244 | { | ||
245 | return __rds_conn_create(laddr, faddr, trans, gfp, 0); | ||
246 | } | ||
247 | |||
248 | struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, | ||
249 | struct rds_transport *trans, gfp_t gfp) | ||
250 | { | ||
251 | return __rds_conn_create(laddr, faddr, trans, gfp, 1); | ||
252 | } | ||
253 | |||
254 | void rds_conn_destroy(struct rds_connection *conn) | ||
255 | { | ||
256 | struct rds_message *rm, *rtmp; | ||
257 | |||
258 | rdsdebug("freeing conn %p for %pI4 -> " | ||
259 | "%pI4\n", conn, &conn->c_laddr, | ||
260 | &conn->c_faddr); | ||
261 | |||
262 | hlist_del_init(&conn->c_hash_node); | ||
263 | |||
264 | /* wait for the rds thread to shut it down */ | ||
265 | atomic_set(&conn->c_state, RDS_CONN_ERROR); | ||
266 | cancel_delayed_work(&conn->c_conn_w); | ||
267 | queue_work(rds_wq, &conn->c_down_w); | ||
268 | flush_workqueue(rds_wq); | ||
269 | |||
270 | /* tear down queued messages */ | ||
271 | list_for_each_entry_safe(rm, rtmp, | ||
272 | &conn->c_send_queue, | ||
273 | m_conn_item) { | ||
274 | list_del_init(&rm->m_conn_item); | ||
275 | BUG_ON(!list_empty(&rm->m_sock_item)); | ||
276 | rds_message_put(rm); | ||
277 | } | ||
278 | if (conn->c_xmit_rm) | ||
279 | rds_message_put(conn->c_xmit_rm); | ||
280 | |||
281 | conn->c_trans->conn_free(conn->c_transport_data); | ||
282 | |||
283 | /* | ||
284 | * The congestion maps aren't freed up here. They're | ||
285 | * freed by rds_cong_exit() after all the connections | ||
286 | * have been freed. | ||
287 | */ | ||
288 | rds_cong_remove_conn(conn); | ||
289 | |||
290 | BUG_ON(!list_empty(&conn->c_retrans)); | ||
291 | kmem_cache_free(rds_conn_slab, conn); | ||
292 | |||
293 | rds_conn_count--; | ||
294 | } | ||
295 | |||
296 | static void rds_conn_message_info(struct socket *sock, unsigned int len, | ||
297 | struct rds_info_iterator *iter, | ||
298 | struct rds_info_lengths *lens, | ||
299 | int want_send) | ||
300 | { | ||
301 | struct hlist_head *head; | ||
302 | struct hlist_node *pos; | ||
303 | struct list_head *list; | ||
304 | struct rds_connection *conn; | ||
305 | struct rds_message *rm; | ||
306 | unsigned long flags; | ||
307 | unsigned int total = 0; | ||
308 | size_t i; | ||
309 | |||
310 | len /= sizeof(struct rds_info_message); | ||
311 | |||
312 | spin_lock_irqsave(&rds_conn_lock, flags); | ||
313 | |||
314 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); | ||
315 | i++, head++) { | ||
316 | hlist_for_each_entry(conn, pos, head, c_hash_node) { | ||
317 | if (want_send) | ||
318 | list = &conn->c_send_queue; | ||
319 | else | ||
320 | list = &conn->c_retrans; | ||
321 | |||
322 | spin_lock(&conn->c_lock); | ||
323 | |||
324 | /* XXX too lazy to maintain counts.. */ | ||
325 | list_for_each_entry(rm, list, m_conn_item) { | ||
326 | total++; | ||
327 | if (total <= len) | ||
328 | rds_inc_info_copy(&rm->m_inc, iter, | ||
329 | conn->c_laddr, | ||
330 | conn->c_faddr, 0); | ||
331 | } | ||
332 | |||
333 | spin_unlock(&conn->c_lock); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
338 | |||
339 | lens->nr = total; | ||
340 | lens->each = sizeof(struct rds_info_message); | ||
341 | } | ||
342 | |||
343 | static void rds_conn_message_info_send(struct socket *sock, unsigned int len, | ||
344 | struct rds_info_iterator *iter, | ||
345 | struct rds_info_lengths *lens) | ||
346 | { | ||
347 | rds_conn_message_info(sock, len, iter, lens, 1); | ||
348 | } | ||
349 | |||
350 | static void rds_conn_message_info_retrans(struct socket *sock, | ||
351 | unsigned int len, | ||
352 | struct rds_info_iterator *iter, | ||
353 | struct rds_info_lengths *lens) | ||
354 | { | ||
355 | rds_conn_message_info(sock, len, iter, lens, 0); | ||
356 | } | ||
357 | |||
358 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, | ||
359 | struct rds_info_iterator *iter, | ||
360 | struct rds_info_lengths *lens, | ||
361 | int (*visitor)(struct rds_connection *, void *), | ||
362 | size_t item_len) | ||
363 | { | ||
364 | uint64_t buffer[(item_len + 7) / 8]; | ||
365 | struct hlist_head *head; | ||
366 | struct hlist_node *pos; | ||
367 | struct hlist_node *tmp; | ||
368 | struct rds_connection *conn; | ||
369 | unsigned long flags; | ||
370 | size_t i; | ||
371 | |||
372 | spin_lock_irqsave(&rds_conn_lock, flags); | ||
373 | |||
374 | lens->nr = 0; | ||
375 | lens->each = item_len; | ||
376 | |||
377 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); | ||
378 | i++, head++) { | ||
379 | hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { | ||
380 | |||
381 | /* XXX no c_lock usage.. */ | ||
382 | if (!visitor(conn, buffer)) | ||
383 | continue; | ||
384 | |||
385 | /* We copy as much as we can fit in the buffer, | ||
386 | * but we count all items so that the caller | ||
387 | * can resize the buffer. */ | ||
388 | if (len >= item_len) { | ||
389 | rds_info_copy(iter, buffer, item_len); | ||
390 | len -= item_len; | ||
391 | } | ||
392 | lens->nr++; | ||
393 | } | ||
394 | } | ||
395 | |||
396 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
397 | } | ||
398 | |||
399 | static int rds_conn_info_visitor(struct rds_connection *conn, | ||
400 | void *buffer) | ||
401 | { | ||
402 | struct rds_info_connection *cinfo = buffer; | ||
403 | |||
404 | cinfo->next_tx_seq = conn->c_next_tx_seq; | ||
405 | cinfo->next_rx_seq = conn->c_next_rx_seq; | ||
406 | cinfo->laddr = conn->c_laddr; | ||
407 | cinfo->faddr = conn->c_faddr; | ||
408 | strncpy(cinfo->transport, conn->c_trans->t_name, | ||
409 | sizeof(cinfo->transport)); | ||
410 | cinfo->flags = 0; | ||
411 | |||
412 | rds_conn_info_set(cinfo->flags, | ||
413 | rds_conn_is_sending(conn), SENDING); | ||
414 | /* XXX Future: return the state rather than these funky bits */ | ||
415 | rds_conn_info_set(cinfo->flags, | ||
416 | atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, | ||
417 | CONNECTING); | ||
418 | rds_conn_info_set(cinfo->flags, | ||
419 | atomic_read(&conn->c_state) == RDS_CONN_UP, | ||
420 | CONNECTED); | ||
421 | return 1; | ||
422 | } | ||
423 | |||
424 | static void rds_conn_info(struct socket *sock, unsigned int len, | ||
425 | struct rds_info_iterator *iter, | ||
426 | struct rds_info_lengths *lens) | ||
427 | { | ||
428 | rds_for_each_conn_info(sock, len, iter, lens, | ||
429 | rds_conn_info_visitor, | ||
430 | sizeof(struct rds_info_connection)); | ||
431 | } | ||
432 | |||
433 | int __init rds_conn_init(void) | ||
434 | { | ||
435 | rds_conn_slab = kmem_cache_create("rds_connection", | ||
436 | sizeof(struct rds_connection), | ||
437 | 0, 0, NULL); | ||
438 | if (rds_conn_slab == NULL) | ||
439 | return -ENOMEM; | ||
440 | |||
441 | rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); | ||
442 | rds_info_register_func(RDS_INFO_SEND_MESSAGES, | ||
443 | rds_conn_message_info_send); | ||
444 | rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, | ||
445 | rds_conn_message_info_retrans); | ||
446 | |||
447 | return 0; | ||
448 | } | ||
449 | |||
450 | void rds_conn_exit(void) | ||
451 | { | ||
452 | rds_loop_exit(); | ||
453 | |||
454 | WARN_ON(!hlist_empty(rds_conn_hash)); | ||
455 | |||
456 | kmem_cache_destroy(rds_conn_slab); | ||
457 | |||
458 | rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); | ||
459 | rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, | ||
460 | rds_conn_message_info_send); | ||
461 | rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, | ||
462 | rds_conn_message_info_retrans); | ||
463 | } | ||
464 | |||
465 | /* | ||
466 | * Force a disconnect | ||
467 | */ | ||
468 | void rds_conn_drop(struct rds_connection *conn) | ||
469 | { | ||
470 | atomic_set(&conn->c_state, RDS_CONN_ERROR); | ||
471 | queue_work(rds_wq, &conn->c_down_w); | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * An error occurred on the connection | ||
476 | */ | ||
477 | void | ||
478 | __rds_conn_error(struct rds_connection *conn, const char *fmt, ...) | ||
479 | { | ||
480 | va_list ap; | ||
481 | |||
482 | va_start(ap, fmt); | ||
483 | vprintk(fmt, ap); | ||
484 | va_end(ap); | ||
485 | |||
486 | rds_conn_drop(conn); | ||
487 | } | ||
diff --git a/net/rds/ib.c b/net/rds/ib.c new file mode 100644 index 000000000000..06a7b798d9a7 --- /dev/null +++ b/net/rds/ib.c | |||
@@ -0,0 +1,323 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <linux/if.h> | ||
36 | #include <linux/netdevice.h> | ||
37 | #include <linux/inetdevice.h> | ||
38 | #include <linux/if_arp.h> | ||
39 | #include <linux/delay.h> | ||
40 | |||
41 | #include "rds.h" | ||
42 | #include "ib.h" | ||
43 | |||
44 | unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; | ||
45 | unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ | ||
46 | |||
47 | module_param(fmr_pool_size, int, 0444); | ||
48 | MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); | ||
49 | module_param(fmr_message_size, int, 0444); | ||
50 | MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); | ||
51 | |||
52 | struct list_head rds_ib_devices; | ||
53 | |||
54 | DEFINE_SPINLOCK(ib_nodev_conns_lock); | ||
55 | LIST_HEAD(ib_nodev_conns); | ||
56 | |||
57 | void rds_ib_add_one(struct ib_device *device) | ||
58 | { | ||
59 | struct rds_ib_device *rds_ibdev; | ||
60 | struct ib_device_attr *dev_attr; | ||
61 | |||
62 | /* Only handle IB (no iWARP) devices */ | ||
63 | if (device->node_type != RDMA_NODE_IB_CA) | ||
64 | return; | ||
65 | |||
66 | dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); | ||
67 | if (!dev_attr) | ||
68 | return; | ||
69 | |||
70 | if (ib_query_device(device, dev_attr)) { | ||
71 | rdsdebug("Query device failed for %s\n", device->name); | ||
72 | goto free_attr; | ||
73 | } | ||
74 | |||
75 | rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); | ||
76 | if (!rds_ibdev) | ||
77 | goto free_attr; | ||
78 | |||
79 | spin_lock_init(&rds_ibdev->spinlock); | ||
80 | |||
81 | rds_ibdev->max_wrs = dev_attr->max_qp_wr; | ||
82 | rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); | ||
83 | |||
84 | rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); | ||
85 | rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift; | ||
86 | rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1); | ||
87 | rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; | ||
88 | rds_ibdev->max_fmrs = dev_attr->max_fmr ? | ||
89 | min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : | ||
90 | fmr_pool_size; | ||
91 | |||
92 | rds_ibdev->dev = device; | ||
93 | rds_ibdev->pd = ib_alloc_pd(device); | ||
94 | if (IS_ERR(rds_ibdev->pd)) | ||
95 | goto free_dev; | ||
96 | |||
97 | rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, | ||
98 | IB_ACCESS_LOCAL_WRITE); | ||
99 | if (IS_ERR(rds_ibdev->mr)) | ||
100 | goto err_pd; | ||
101 | |||
102 | rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); | ||
103 | if (IS_ERR(rds_ibdev->mr_pool)) { | ||
104 | rds_ibdev->mr_pool = NULL; | ||
105 | goto err_mr; | ||
106 | } | ||
107 | |||
108 | INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); | ||
109 | INIT_LIST_HEAD(&rds_ibdev->conn_list); | ||
110 | list_add_tail(&rds_ibdev->list, &rds_ib_devices); | ||
111 | |||
112 | ib_set_client_data(device, &rds_ib_client, rds_ibdev); | ||
113 | |||
114 | goto free_attr; | ||
115 | |||
116 | err_mr: | ||
117 | ib_dereg_mr(rds_ibdev->mr); | ||
118 | err_pd: | ||
119 | ib_dealloc_pd(rds_ibdev->pd); | ||
120 | free_dev: | ||
121 | kfree(rds_ibdev); | ||
122 | free_attr: | ||
123 | kfree(dev_attr); | ||
124 | } | ||
125 | |||
126 | void rds_ib_remove_one(struct ib_device *device) | ||
127 | { | ||
128 | struct rds_ib_device *rds_ibdev; | ||
129 | struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||
130 | |||
131 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); | ||
132 | if (!rds_ibdev) | ||
133 | return; | ||
134 | |||
135 | list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | ||
136 | list_del(&i_ipaddr->list); | ||
137 | kfree(i_ipaddr); | ||
138 | } | ||
139 | |||
140 | rds_ib_remove_conns(rds_ibdev); | ||
141 | |||
142 | if (rds_ibdev->mr_pool) | ||
143 | rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | ||
144 | |||
145 | ib_dereg_mr(rds_ibdev->mr); | ||
146 | |||
147 | while (ib_dealloc_pd(rds_ibdev->pd)) { | ||
148 | rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); | ||
149 | msleep(1); | ||
150 | } | ||
151 | |||
152 | list_del(&rds_ibdev->list); | ||
153 | kfree(rds_ibdev); | ||
154 | } | ||
155 | |||
156 | struct ib_client rds_ib_client = { | ||
157 | .name = "rds_ib", | ||
158 | .add = rds_ib_add_one, | ||
159 | .remove = rds_ib_remove_one | ||
160 | }; | ||
161 | |||
162 | static int rds_ib_conn_info_visitor(struct rds_connection *conn, | ||
163 | void *buffer) | ||
164 | { | ||
165 | struct rds_info_rdma_connection *iinfo = buffer; | ||
166 | struct rds_ib_connection *ic; | ||
167 | |||
168 | /* We will only ever look at IB transports */ | ||
169 | if (conn->c_trans != &rds_ib_transport) | ||
170 | return 0; | ||
171 | |||
172 | iinfo->src_addr = conn->c_laddr; | ||
173 | iinfo->dst_addr = conn->c_faddr; | ||
174 | |||
175 | memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); | ||
176 | memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); | ||
177 | if (rds_conn_state(conn) == RDS_CONN_UP) { | ||
178 | struct rds_ib_device *rds_ibdev; | ||
179 | struct rdma_dev_addr *dev_addr; | ||
180 | |||
181 | ic = conn->c_transport_data; | ||
182 | dev_addr = &ic->i_cm_id->route.addr.dev_addr; | ||
183 | |||
184 | ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); | ||
185 | ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); | ||
186 | |||
187 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||
188 | iinfo->max_send_wr = ic->i_send_ring.w_nr; | ||
189 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; | ||
190 | iinfo->max_send_sge = rds_ibdev->max_sge; | ||
191 | rds_ib_get_mr_info(rds_ibdev, iinfo); | ||
192 | } | ||
193 | return 1; | ||
194 | } | ||
195 | |||
196 | static void rds_ib_ic_info(struct socket *sock, unsigned int len, | ||
197 | struct rds_info_iterator *iter, | ||
198 | struct rds_info_lengths *lens) | ||
199 | { | ||
200 | rds_for_each_conn_info(sock, len, iter, lens, | ||
201 | rds_ib_conn_info_visitor, | ||
202 | sizeof(struct rds_info_rdma_connection)); | ||
203 | } | ||
204 | |||
205 | |||
206 | /* | ||
207 | * Early RDS/IB was built to only bind to an address if there is an IPoIB | ||
208 | * device with that address set. | ||
209 | * | ||
210 | * If it were me, I'd advocate for something more flexible. Sending and | ||
211 | * receiving should be device-agnostic. Transports would try and maintain | ||
212 | * connections between peers who have messages queued. Userspace would be | ||
213 | * allowed to influence which paths have priority. We could call userspace | ||
214 | * asserting this policy "routing". | ||
215 | */ | ||
216 | static int rds_ib_laddr_check(__be32 addr) | ||
217 | { | ||
218 | int ret; | ||
219 | struct rdma_cm_id *cm_id; | ||
220 | struct sockaddr_in sin; | ||
221 | |||
222 | /* Create a CMA ID and try to bind it. This catches both | ||
223 | * IB and iWARP capable NICs. | ||
224 | */ | ||
225 | cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); | ||
226 | if (!cm_id) | ||
227 | return -EADDRNOTAVAIL; | ||
228 | |||
229 | memset(&sin, 0, sizeof(sin)); | ||
230 | sin.sin_family = AF_INET; | ||
231 | sin.sin_addr.s_addr = addr; | ||
232 | |||
233 | /* rdma_bind_addr will only succeed for IB & iWARP devices */ | ||
234 | ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); | ||
235 | /* due to this, we will claim to support iWARP devices unless we | ||
236 | check node_type. */ | ||
237 | if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) | ||
238 | ret = -EADDRNOTAVAIL; | ||
239 | |||
240 | rdsdebug("addr %pI4 ret %d node type %d\n", | ||
241 | &addr, ret, | ||
242 | cm_id->device ? cm_id->device->node_type : -1); | ||
243 | |||
244 | rdma_destroy_id(cm_id); | ||
245 | |||
246 | return ret; | ||
247 | } | ||
248 | |||
249 | void rds_ib_exit(void) | ||
250 | { | ||
251 | rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); | ||
252 | rds_ib_remove_nodev_conns(); | ||
253 | ib_unregister_client(&rds_ib_client); | ||
254 | rds_ib_sysctl_exit(); | ||
255 | rds_ib_recv_exit(); | ||
256 | rds_trans_unregister(&rds_ib_transport); | ||
257 | } | ||
258 | |||
259 | struct rds_transport rds_ib_transport = { | ||
260 | .laddr_check = rds_ib_laddr_check, | ||
261 | .xmit_complete = rds_ib_xmit_complete, | ||
262 | .xmit = rds_ib_xmit, | ||
263 | .xmit_cong_map = NULL, | ||
264 | .xmit_rdma = rds_ib_xmit_rdma, | ||
265 | .recv = rds_ib_recv, | ||
266 | .conn_alloc = rds_ib_conn_alloc, | ||
267 | .conn_free = rds_ib_conn_free, | ||
268 | .conn_connect = rds_ib_conn_connect, | ||
269 | .conn_shutdown = rds_ib_conn_shutdown, | ||
270 | .inc_copy_to_user = rds_ib_inc_copy_to_user, | ||
271 | .inc_purge = rds_ib_inc_purge, | ||
272 | .inc_free = rds_ib_inc_free, | ||
273 | .cm_initiate_connect = rds_ib_cm_initiate_connect, | ||
274 | .cm_handle_connect = rds_ib_cm_handle_connect, | ||
275 | .cm_connect_complete = rds_ib_cm_connect_complete, | ||
276 | .stats_info_copy = rds_ib_stats_info_copy, | ||
277 | .exit = rds_ib_exit, | ||
278 | .get_mr = rds_ib_get_mr, | ||
279 | .sync_mr = rds_ib_sync_mr, | ||
280 | .free_mr = rds_ib_free_mr, | ||
281 | .flush_mrs = rds_ib_flush_mrs, | ||
282 | .t_owner = THIS_MODULE, | ||
283 | .t_name = "infiniband", | ||
284 | }; | ||
285 | |||
286 | int __init rds_ib_init(void) | ||
287 | { | ||
288 | int ret; | ||
289 | |||
290 | INIT_LIST_HEAD(&rds_ib_devices); | ||
291 | |||
292 | ret = ib_register_client(&rds_ib_client); | ||
293 | if (ret) | ||
294 | goto out; | ||
295 | |||
296 | ret = rds_ib_sysctl_init(); | ||
297 | if (ret) | ||
298 | goto out_ibreg; | ||
299 | |||
300 | ret = rds_ib_recv_init(); | ||
301 | if (ret) | ||
302 | goto out_sysctl; | ||
303 | |||
304 | ret = rds_trans_register(&rds_ib_transport); | ||
305 | if (ret) | ||
306 | goto out_recv; | ||
307 | |||
308 | rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); | ||
309 | |||
310 | goto out; | ||
311 | |||
312 | out_recv: | ||
313 | rds_ib_recv_exit(); | ||
314 | out_sysctl: | ||
315 | rds_ib_sysctl_exit(); | ||
316 | out_ibreg: | ||
317 | ib_unregister_client(&rds_ib_client); | ||
318 | out: | ||
319 | return ret; | ||
320 | } | ||
321 | |||
322 | MODULE_LICENSE("GPL"); | ||
323 | |||
diff --git a/net/rds/ib.h b/net/rds/ib.h new file mode 100644 index 000000000000..8be563a1363a --- /dev/null +++ b/net/rds/ib.h | |||
@@ -0,0 +1,367 @@ | |||
1 | #ifndef _RDS_IB_H | ||
2 | #define _RDS_IB_H | ||
3 | |||
4 | #include <rdma/ib_verbs.h> | ||
5 | #include <rdma/rdma_cm.h> | ||
6 | #include "rds.h" | ||
7 | #include "rdma_transport.h" | ||
8 | |||
9 | #define RDS_FMR_SIZE 256 | ||
10 | #define RDS_FMR_POOL_SIZE 4096 | ||
11 | |||
12 | #define RDS_IB_MAX_SGE 8 | ||
13 | #define RDS_IB_RECV_SGE 2 | ||
14 | |||
15 | #define RDS_IB_DEFAULT_RECV_WR 1024 | ||
16 | #define RDS_IB_DEFAULT_SEND_WR 256 | ||
17 | |||
18 | #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ | ||
19 | |||
20 | extern struct list_head rds_ib_devices; | ||
21 | |||
22 | /* | ||
23 | * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to | ||
24 | * try to minimize the amount of memory tied up in both the device and | ||
25 | * socket receive queues. | ||
26 | */ | ||
27 | /* page offset of the final full frag that fits in the page */ | ||
28 | #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) | ||
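A quick worked example of the arithmetic in RDS_PAGE_LAST_OFF (illustrative only; RDS_FRAG_SIZE is defined in rds.h, which is not part of this hunk, so the values below are hypothetical):

/*
 * Sketch only: with a 4096-byte PAGE_SIZE and a hypothetical
 * 1024-byte fragment size, a page holds four full fragments and
 * the last full fragment starts at offset 3072.
 */
#define EXAMPLE_PAGE_SIZE	4096
#define EXAMPLE_FRAG_SIZE	1024
#define EXAMPLE_PAGE_LAST_OFF \
	(((EXAMPLE_PAGE_SIZE / EXAMPLE_FRAG_SIZE) - 1) * EXAMPLE_FRAG_SIZE)
/* EXAMPLE_PAGE_LAST_OFF evaluates to 3072 */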
29 | struct rds_page_frag { | ||
30 | struct list_head f_item; | ||
31 | struct page *f_page; | ||
32 | unsigned long f_offset; | ||
33 | dma_addr_t f_mapped; | ||
34 | }; | ||
35 | |||
36 | struct rds_ib_incoming { | ||
37 | struct list_head ii_frags; | ||
38 | struct rds_incoming ii_inc; | ||
39 | }; | ||
40 | |||
41 | struct rds_ib_connect_private { | ||
42 | /* Add new fields at the end, and don't permute existing fields. */ | ||
43 | __be32 dp_saddr; | ||
44 | __be32 dp_daddr; | ||
45 | u8 dp_protocol_major; | ||
46 | u8 dp_protocol_minor; | ||
47 | __be16 dp_protocol_minor_mask; /* bitmask */ | ||
48 | __be32 dp_reserved1; | ||
49 | __be64 dp_ack_seq; | ||
50 | __be32 dp_credit; /* non-zero enables flow ctl */ | ||
51 | }; | ||
52 | |||
53 | struct rds_ib_send_work { | ||
54 | struct rds_message *s_rm; | ||
55 | struct rds_rdma_op *s_op; | ||
56 | struct ib_send_wr s_wr; | ||
57 | struct ib_sge s_sge[RDS_IB_MAX_SGE]; | ||
58 | unsigned long s_queued; | ||
59 | }; | ||
60 | |||
61 | struct rds_ib_recv_work { | ||
62 | struct rds_ib_incoming *r_ibinc; | ||
63 | struct rds_page_frag *r_frag; | ||
64 | struct ib_recv_wr r_wr; | ||
65 | struct ib_sge r_sge[2]; | ||
66 | }; | ||
67 | |||
68 | struct rds_ib_work_ring { | ||
69 | u32 w_nr; | ||
70 | u32 w_alloc_ptr; | ||
71 | u32 w_alloc_ctr; | ||
72 | u32 w_free_ptr; | ||
73 | atomic_t w_free_ctr; | ||
74 | }; | ||
75 | |||
76 | struct rds_ib_device; | ||
77 | |||
78 | struct rds_ib_connection { | ||
79 | |||
80 | struct list_head ib_node; | ||
81 | struct rds_ib_device *rds_ibdev; | ||
82 | struct rds_connection *conn; | ||
83 | |||
84 | /* alphabet soup, IBTA style */ | ||
85 | struct rdma_cm_id *i_cm_id; | ||
86 | struct ib_pd *i_pd; | ||
87 | struct ib_mr *i_mr; | ||
88 | struct ib_cq *i_send_cq; | ||
89 | struct ib_cq *i_recv_cq; | ||
90 | |||
91 | /* tx */ | ||
92 | struct rds_ib_work_ring i_send_ring; | ||
93 | struct rds_message *i_rm; | ||
94 | struct rds_header *i_send_hdrs; | ||
95 | u64 i_send_hdrs_dma; | ||
96 | struct rds_ib_send_work *i_sends; | ||
97 | |||
98 | /* rx */ | ||
99 | struct mutex i_recv_mutex; | ||
100 | struct rds_ib_work_ring i_recv_ring; | ||
101 | struct rds_ib_incoming *i_ibinc; | ||
102 | u32 i_recv_data_rem; | ||
103 | struct rds_header *i_recv_hdrs; | ||
104 | u64 i_recv_hdrs_dma; | ||
105 | struct rds_ib_recv_work *i_recvs; | ||
106 | struct rds_page_frag i_frag; | ||
107 | u64 i_ack_recv; /* last ACK received */ | ||
108 | |||
109 | /* sending acks */ | ||
110 | unsigned long i_ack_flags; | ||
111 | u64 i_ack_next; /* next ACK to send */ | ||
112 | struct rds_header *i_ack; | ||
113 | struct ib_send_wr i_ack_wr; | ||
114 | struct ib_sge i_ack_sge; | ||
115 | u64 i_ack_dma; | ||
116 | unsigned long i_ack_queued; | ||
117 | |||
118 | /* Flow control related information | ||
119 | * | ||
120 | * Our algorithm uses a pair of variables that we need to access | ||
121 | * atomically - one for the send credits, and one for the posted | ||
122 | * recv credits we need to transfer to the remote. | ||
123 | * Rather than protect them using a slow spinlock, we put both into | ||
124 | * a single atomic_t and update it using cmpxchg | ||
125 | */ | ||
126 | atomic_t i_credits; | ||
127 | |||
128 | /* Protocol version specific information */ | ||
129 | unsigned int i_flowctl:1; /* enable/disable flow ctl */ | ||
130 | |||
131 | /* Batched completions */ | ||
132 | unsigned int i_unsignaled_wrs; | ||
133 | long i_unsignaled_bytes; | ||
134 | }; | ||
135 | |||
136 | /* This assumes that atomic_t is at least 32 bits */ | ||
137 | #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) | ||
138 | #define IB_GET_POST_CREDITS(v) ((v) >> 16) | ||
139 | #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) | ||
140 | #define IB_SET_POST_CREDITS(v) ((v) << 16) | ||
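A minimal sketch of the lock-free cmpxchg update the comment above alludes to, using the packing macros just defined. The helper name and the give-up policy are illustrative assumptions, not code this commit adds; the real credit handling lives in ib_send.c.

/* Sketch only: atomically claim 'wanted' send credits from the packed
 * counter, retrying if another CPU updated it in the meantime. */
static inline int example_take_send_credits(atomic_t *credits, int wanted)
{
	int oldval, newval;

	do {
		oldval = atomic_read(credits);
		if (IB_GET_SEND_CREDITS(oldval) < wanted)
			return -EAGAIN;	/* not enough credits right now */
		newval = oldval - IB_SET_SEND_CREDITS(wanted);
	} while (atomic_cmpxchg(credits, oldval, newval) != oldval);

	return 0;
}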
141 | |||
142 | struct rds_ib_ipaddr { | ||
143 | struct list_head list; | ||
144 | __be32 ipaddr; | ||
145 | }; | ||
146 | |||
147 | struct rds_ib_device { | ||
148 | struct list_head list; | ||
149 | struct list_head ipaddr_list; | ||
150 | struct list_head conn_list; | ||
151 | struct ib_device *dev; | ||
152 | struct ib_pd *pd; | ||
153 | struct ib_mr *mr; | ||
154 | struct rds_ib_mr_pool *mr_pool; | ||
155 | int fmr_page_shift; | ||
156 | int fmr_page_size; | ||
157 | u64 fmr_page_mask; | ||
158 | unsigned int fmr_max_remaps; | ||
159 | unsigned int max_fmrs; | ||
160 | int max_sge; | ||
161 | unsigned int max_wrs; | ||
162 | spinlock_t spinlock; /* protect the above */ | ||
163 | }; | ||
164 | |||
165 | /* bits for i_ack_flags */ | ||
166 | #define IB_ACK_IN_FLIGHT 0 | ||
167 | #define IB_ACK_REQUESTED 1 | ||
168 | |||
169 | /* Magic WR_ID for ACKs */ | ||
170 | #define RDS_IB_ACK_WR_ID (~(u64) 0) | ||
171 | |||
172 | struct rds_ib_statistics { | ||
173 | uint64_t s_ib_connect_raced; | ||
174 | uint64_t s_ib_listen_closed_stale; | ||
175 | uint64_t s_ib_tx_cq_call; | ||
176 | uint64_t s_ib_tx_cq_event; | ||
177 | uint64_t s_ib_tx_ring_full; | ||
178 | uint64_t s_ib_tx_throttle; | ||
179 | uint64_t s_ib_tx_sg_mapping_failure; | ||
180 | uint64_t s_ib_tx_stalled; | ||
181 | uint64_t s_ib_tx_credit_updates; | ||
182 | uint64_t s_ib_rx_cq_call; | ||
183 | uint64_t s_ib_rx_cq_event; | ||
184 | uint64_t s_ib_rx_ring_empty; | ||
185 | uint64_t s_ib_rx_refill_from_cq; | ||
186 | uint64_t s_ib_rx_refill_from_thread; | ||
187 | uint64_t s_ib_rx_alloc_limit; | ||
188 | uint64_t s_ib_rx_credit_updates; | ||
189 | uint64_t s_ib_ack_sent; | ||
190 | uint64_t s_ib_ack_send_failure; | ||
191 | uint64_t s_ib_ack_send_delayed; | ||
192 | uint64_t s_ib_ack_send_piggybacked; | ||
193 | uint64_t s_ib_ack_received; | ||
194 | uint64_t s_ib_rdma_mr_alloc; | ||
195 | uint64_t s_ib_rdma_mr_free; | ||
196 | uint64_t s_ib_rdma_mr_used; | ||
197 | uint64_t s_ib_rdma_mr_pool_flush; | ||
198 | uint64_t s_ib_rdma_mr_pool_wait; | ||
199 | uint64_t s_ib_rdma_mr_pool_depleted; | ||
200 | }; | ||
201 | |||
202 | extern struct workqueue_struct *rds_ib_wq; | ||
203 | |||
204 | /* | ||
205 | * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h | ||
206 | * doesn't define it. | ||
207 | */ | ||
208 | static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, | ||
209 | struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||
210 | { | ||
211 | unsigned int i; | ||
212 | |||
213 | for (i = 0; i < sg_dma_len; ++i) { | ||
214 | ib_dma_sync_single_for_cpu(dev, | ||
215 | ib_sg_dma_address(dev, &sg[i]), | ||
216 | ib_sg_dma_len(dev, &sg[i]), | ||
217 | direction); | ||
218 | } | ||
219 | } | ||
220 | #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu | ||
221 | |||
222 | static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, | ||
223 | struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||
224 | { | ||
225 | unsigned int i; | ||
226 | |||
227 | for (i = 0; i < sg_dma_len; ++i) { | ||
228 | ib_dma_sync_single_for_device(dev, | ||
229 | ib_sg_dma_address(dev, &sg[i]), | ||
230 | ib_sg_dma_len(dev, &sg[i]), | ||
231 | direction); | ||
232 | } | ||
233 | } | ||
234 | #define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device | ||
235 | |||
236 | |||
237 | /* ib.c */ | ||
238 | extern struct rds_transport rds_ib_transport; | ||
239 | extern void rds_ib_add_one(struct ib_device *device); | ||
240 | extern void rds_ib_remove_one(struct ib_device *device); | ||
241 | extern struct ib_client rds_ib_client; | ||
242 | |||
243 | extern unsigned int fmr_pool_size; | ||
244 | extern unsigned int fmr_message_size; | ||
245 | |||
246 | extern spinlock_t ib_nodev_conns_lock; | ||
247 | extern struct list_head ib_nodev_conns; | ||
248 | |||
249 | /* ib_cm.c */ | ||
250 | int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); | ||
251 | void rds_ib_conn_free(void *arg); | ||
252 | int rds_ib_conn_connect(struct rds_connection *conn); | ||
253 | void rds_ib_conn_shutdown(struct rds_connection *conn); | ||
254 | void rds_ib_state_change(struct sock *sk); | ||
255 | int __init rds_ib_listen_init(void); | ||
256 | void rds_ib_listen_stop(void); | ||
257 | void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); | ||
258 | int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | ||
259 | struct rdma_cm_event *event); | ||
260 | int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); | ||
261 | void rds_ib_cm_connect_complete(struct rds_connection *conn, | ||
262 | struct rdma_cm_event *event); | ||
263 | |||
264 | |||
265 | #define rds_ib_conn_error(conn, fmt...) \ | ||
266 | __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) | ||
267 | |||
268 | /* ib_rdma.c */ | ||
269 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); | ||
270 | int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); | ||
271 | void rds_ib_remove_nodev_conns(void); | ||
272 | void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev); | ||
273 | struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); | ||
274 | void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); | ||
275 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); | ||
276 | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | ||
277 | struct rds_sock *rs, u32 *key_ret); | ||
278 | void rds_ib_sync_mr(void *trans_private, int dir); | ||
279 | void rds_ib_free_mr(void *trans_private, int invalidate); | ||
280 | void rds_ib_flush_mrs(void); | ||
281 | |||
282 | /* ib_recv.c */ | ||
283 | int __init rds_ib_recv_init(void); | ||
284 | void rds_ib_recv_exit(void); | ||
285 | int rds_ib_recv(struct rds_connection *conn); | ||
286 | int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | ||
287 | gfp_t page_gfp, int prefill); | ||
288 | void rds_ib_inc_purge(struct rds_incoming *inc); | ||
289 | void rds_ib_inc_free(struct rds_incoming *inc); | ||
290 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | ||
291 | size_t size); | ||
292 | void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); | ||
293 | void rds_ib_recv_init_ring(struct rds_ib_connection *ic); | ||
294 | void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); | ||
295 | void rds_ib_recv_init_ack(struct rds_ib_connection *ic); | ||
296 | void rds_ib_attempt_ack(struct rds_ib_connection *ic); | ||
297 | void rds_ib_ack_send_complete(struct rds_ib_connection *ic); | ||
298 | u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); | ||
299 | |||
300 | /* ib_ring.c */ | ||
301 | void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); | ||
302 | void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); | ||
303 | u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); | ||
304 | void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); | ||
305 | void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); | ||
306 | int rds_ib_ring_empty(struct rds_ib_work_ring *ring); | ||
307 | int rds_ib_ring_low(struct rds_ib_work_ring *ring); | ||
308 | u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); | ||
309 | u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); | ||
310 | extern wait_queue_head_t rds_ib_ring_empty_wait; | ||
311 | |||
312 | /* ib_send.c */ | ||
313 | void rds_ib_xmit_complete(struct rds_connection *conn); | ||
314 | int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
315 | unsigned int hdr_off, unsigned int sg, unsigned int off); | ||
316 | void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); | ||
317 | void rds_ib_send_init_ring(struct rds_ib_connection *ic); | ||
318 | void rds_ib_send_clear_ring(struct rds_ib_connection *ic); | ||
319 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | ||
320 | void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); | ||
321 | void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); | ||
322 | int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, | ||
323 | u32 *adv_credits, int need_posted); | ||
324 | |||
325 | /* ib_stats.c */ | ||
326 | DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); | ||
327 | #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) | ||
328 | unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | ||
329 | unsigned int avail); | ||
330 | |||
331 | /* ib_sysctl.c */ | ||
332 | int __init rds_ib_sysctl_init(void); | ||
333 | void rds_ib_sysctl_exit(void); | ||
334 | extern unsigned long rds_ib_sysctl_max_send_wr; | ||
335 | extern unsigned long rds_ib_sysctl_max_recv_wr; | ||
336 | extern unsigned long rds_ib_sysctl_max_unsig_wrs; | ||
337 | extern unsigned long rds_ib_sysctl_max_unsig_bytes; | ||
338 | extern unsigned long rds_ib_sysctl_max_recv_allocation; | ||
339 | extern unsigned int rds_ib_sysctl_flow_control; | ||
340 | extern ctl_table rds_ib_sysctl_table[]; | ||
341 | |||
342 | /* | ||
343 | * Helper functions for getting/setting the header and data SGEs in | ||
344 | * RDS packets (not RDMA) | ||
345 | */ | ||
346 | static inline struct ib_sge * | ||
347 | rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
348 | { | ||
349 | return &sge[0]; | ||
350 | } | ||
351 | |||
352 | static inline struct ib_sge * | ||
353 | rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
354 | { | ||
355 | return &sge[1]; | ||
356 | } | ||
357 | |||
358 | static inline void rds_ib_set_64bit(u64 *ptr, u64 val) | ||
359 | { | ||
360 | #if BITS_PER_LONG == 64 | ||
361 | *ptr = val; | ||
362 | #else | ||
363 | set_64bit(ptr, val); | ||
364 | #endif | ||
365 | } | ||
366 | |||
367 | #endif | ||
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c new file mode 100644 index 000000000000..0532237bd128 --- /dev/null +++ b/net/rds/ib_cm.c | |||
@@ -0,0 +1,726 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <linux/vmalloc.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "ib.h" | ||
39 | |||
40 | /* | ||
41 | * Set the selected protocol version | ||
42 | */ | ||
43 | static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) | ||
44 | { | ||
45 | conn->c_version = version; | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * Set up flow control | ||
50 | */ | ||
51 | static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) | ||
52 | { | ||
53 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
54 | |||
55 | if (rds_ib_sysctl_flow_control && credits != 0) { | ||
56 | /* We're doing flow control */ | ||
57 | ic->i_flowctl = 1; | ||
58 | rds_ib_send_add_credits(conn, credits); | ||
59 | } else { | ||
60 | ic->i_flowctl = 0; | ||
61 | } | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * Tune RNR behavior. Without flow control, we use a rather | ||
66 | * low timeout, but not the absolute minimum - this should | ||
67 | * be tunable. | ||
68 | * | ||
69 | * We already set the RNR retry count to 7 above (which is the | ||
70 | * smallest infinite number :-). | ||
71 | * If flow control is off, we want to change this back to 0 | ||
72 | * so that we learn quickly when our credit accounting is | ||
73 | * buggy. | ||
74 | * | ||
75 | * Caller passes in a qp_attr pointer - don't waste stack space | ||
76 | * by allocating this twice. | ||
77 | */ | ||
78 | static void | ||
79 | rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) | ||
80 | { | ||
81 | int ret; | ||
82 | |||
83 | attr->min_rnr_timer = IB_RNR_TIMER_000_32; | ||
84 | ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); | ||
85 | if (ret) | ||
86 | printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * Connection established. | ||
91 | * We get here for both outgoing and incoming connections. | ||
92 | */ | ||
93 | void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) | ||
94 | { | ||
95 | const struct rds_ib_connect_private *dp = NULL; | ||
96 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
97 | struct rds_ib_device *rds_ibdev; | ||
98 | struct ib_qp_attr qp_attr; | ||
99 | int err; | ||
100 | |||
101 | if (event->param.conn.private_data_len) { | ||
102 | dp = event->param.conn.private_data; | ||
103 | |||
104 | rds_ib_set_protocol(conn, | ||
105 | RDS_PROTOCOL(dp->dp_protocol_major, | ||
106 | dp->dp_protocol_minor)); | ||
107 | rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); | ||
108 | } | ||
109 | |||
110 | printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", | ||
111 | &conn->c_laddr, | ||
112 | RDS_PROTOCOL_MAJOR(conn->c_version), | ||
113 | RDS_PROTOCOL_MINOR(conn->c_version), | ||
114 | ic->i_flowctl ? ", flow control" : ""); | ||
115 | |||
116 | /* Tune RNR behavior */ | ||
117 | rds_ib_tune_rnr(ic, &qp_attr); | ||
118 | |||
119 | qp_attr.qp_state = IB_QPS_RTS; | ||
120 | err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); | ||
121 | if (err) | ||
122 | printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); | ||
123 | |||
124 | /* update ib_device with this local ipaddr & conn */ | ||
125 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||
126 | err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); | ||
127 | if (err) | ||
128 | printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); | ||
129 | err = rds_ib_add_conn(rds_ibdev, conn); | ||
130 | if (err) | ||
131 | printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err); | ||
132 | |||
133 | /* If the peer gave us the last packet it saw, process this as if | ||
134 | * we had received a regular ACK. */ | ||
135 | if (dp && dp->dp_ack_seq) | ||
136 | rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); | ||
137 | |||
138 | rds_connect_complete(conn); | ||
139 | } | ||
140 | |||
141 | static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, | ||
142 | struct rdma_conn_param *conn_param, | ||
143 | struct rds_ib_connect_private *dp, | ||
144 | u32 protocol_version) | ||
145 | { | ||
146 | memset(conn_param, 0, sizeof(struct rdma_conn_param)); | ||
147 | /* XXX tune these? */ | ||
148 | conn_param->responder_resources = 1; | ||
149 | conn_param->initiator_depth = 1; | ||
150 | conn_param->retry_count = 7; | ||
151 | conn_param->rnr_retry_count = 7; | ||
152 | |||
153 | if (dp) { | ||
154 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
155 | |||
156 | memset(dp, 0, sizeof(*dp)); | ||
157 | dp->dp_saddr = conn->c_laddr; | ||
158 | dp->dp_daddr = conn->c_faddr; | ||
159 | dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); | ||
160 | dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); | ||
161 | dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); | ||
162 | dp->dp_ack_seq = rds_ib_piggyb_ack(ic); | ||
163 | |||
164 | /* Advertise flow control */ | ||
165 | if (ic->i_flowctl) { | ||
166 | unsigned int credits; | ||
167 | |||
168 | credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); | ||
169 | dp->dp_credit = cpu_to_be32(credits); | ||
170 | atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); | ||
171 | } | ||
172 | |||
173 | conn_param->private_data = dp; | ||
174 | conn_param->private_data_len = sizeof(*dp); | ||
175 | } | ||
176 | } | ||
177 | |||
178 | static void rds_ib_cq_event_handler(struct ib_event *event, void *data) | ||
179 | { | ||
180 | rdsdebug("event %u data %p\n", event->event, data); | ||
181 | } | ||
182 | |||
183 | static void rds_ib_qp_event_handler(struct ib_event *event, void *data) | ||
184 | { | ||
185 | struct rds_connection *conn = data; | ||
186 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
187 | |||
188 | rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); | ||
189 | |||
190 | switch (event->event) { | ||
191 | case IB_EVENT_COMM_EST: | ||
192 | rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); | ||
193 | break; | ||
194 | default: | ||
195 | printk(KERN_WARNING "RDS/ib: unhandled QP event %u " | ||
196 | "on connection to %pI4\n", event->event, | ||
197 | &conn->c_faddr); | ||
198 | break; | ||
199 | } | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * This needs to be very careful to not leave IS_ERR pointers around for | ||
204 | * cleanup to trip over. | ||
205 | */ | ||
206 | static int rds_ib_setup_qp(struct rds_connection *conn) | ||
207 | { | ||
208 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
209 | struct ib_device *dev = ic->i_cm_id->device; | ||
210 | struct ib_qp_init_attr attr; | ||
211 | struct rds_ib_device *rds_ibdev; | ||
212 | int ret; | ||
213 | |||
214 | /* rds_ib_add_one creates a rds_ib_device object per IB device, | ||
215 | * and allocates a protection domain, memory region and FMR pool | ||
216 | * for each. If that fails for any reason, it will not register | ||
217 | * the rds_ibdev at all. | ||
218 | */ | ||
219 | rds_ibdev = ib_get_client_data(dev, &rds_ib_client); | ||
220 | if (rds_ibdev == NULL) { | ||
221 | if (printk_ratelimit()) | ||
222 | printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", | ||
223 | dev->name); | ||
224 | return -EOPNOTSUPP; | ||
225 | } | ||
226 | |||
227 | if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) | ||
228 | rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); | ||
229 | if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) | ||
230 | rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); | ||
231 | |||
232 | /* Protection domain and memory region */ | ||
233 | ic->i_pd = rds_ibdev->pd; | ||
234 | ic->i_mr = rds_ibdev->mr; | ||
235 | |||
236 | ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, | ||
237 | rds_ib_cq_event_handler, conn, | ||
238 | ic->i_send_ring.w_nr + 1, 0); | ||
239 | if (IS_ERR(ic->i_send_cq)) { | ||
240 | ret = PTR_ERR(ic->i_send_cq); | ||
241 | ic->i_send_cq = NULL; | ||
242 | rdsdebug("ib_create_cq send failed: %d\n", ret); | ||
243 | goto out; | ||
244 | } | ||
245 | |||
246 | ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, | ||
247 | rds_ib_cq_event_handler, conn, | ||
248 | ic->i_recv_ring.w_nr, 0); | ||
249 | if (IS_ERR(ic->i_recv_cq)) { | ||
250 | ret = PTR_ERR(ic->i_recv_cq); | ||
251 | ic->i_recv_cq = NULL; | ||
252 | rdsdebug("ib_create_cq recv failed: %d\n", ret); | ||
253 | goto out; | ||
254 | } | ||
255 | |||
256 | ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); | ||
257 | if (ret) { | ||
258 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | ||
259 | goto out; | ||
260 | } | ||
261 | |||
262 | ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); | ||
263 | if (ret) { | ||
264 | rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); | ||
265 | goto out; | ||
266 | } | ||
267 | |||
268 | /* XXX negotiate max send/recv with remote? */ | ||
269 | memset(&attr, 0, sizeof(attr)); | ||
270 | attr.event_handler = rds_ib_qp_event_handler; | ||
271 | attr.qp_context = conn; | ||
272 | /* + 1 to allow for the single ack message */ | ||
273 | attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; | ||
274 | attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; | ||
275 | attr.cap.max_send_sge = rds_ibdev->max_sge; | ||
276 | attr.cap.max_recv_sge = RDS_IB_RECV_SGE; | ||
277 | attr.sq_sig_type = IB_SIGNAL_REQ_WR; | ||
278 | attr.qp_type = IB_QPT_RC; | ||
279 | attr.send_cq = ic->i_send_cq; | ||
280 | attr.recv_cq = ic->i_recv_cq; | ||
281 | |||
282 | /* | ||
283 | * XXX this can fail if max_*_wr is too large? Are we supposed | ||
284 | * to back off until we get a value that the hardware can support? | ||
285 | */ | ||
286 | ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); | ||
287 | if (ret) { | ||
288 | rdsdebug("rdma_create_qp failed: %d\n", ret); | ||
289 | goto out; | ||
290 | } | ||
291 | |||
292 | ic->i_send_hdrs = ib_dma_alloc_coherent(dev, | ||
293 | ic->i_send_ring.w_nr * | ||
294 | sizeof(struct rds_header), | ||
295 | &ic->i_send_hdrs_dma, GFP_KERNEL); | ||
296 | if (ic->i_send_hdrs == NULL) { | ||
297 | ret = -ENOMEM; | ||
298 | rdsdebug("ib_dma_alloc_coherent send failed\n"); | ||
299 | goto out; | ||
300 | } | ||
301 | |||
302 | ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, | ||
303 | ic->i_recv_ring.w_nr * | ||
304 | sizeof(struct rds_header), | ||
305 | &ic->i_recv_hdrs_dma, GFP_KERNEL); | ||
306 | if (ic->i_recv_hdrs == NULL) { | ||
307 | ret = -ENOMEM; | ||
308 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); | ||
309 | goto out; | ||
310 | } | ||
311 | |||
312 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), | ||
313 | &ic->i_ack_dma, GFP_KERNEL); | ||
314 | if (ic->i_ack == NULL) { | ||
315 | ret = -ENOMEM; | ||
316 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); | ||
317 | goto out; | ||
318 | } | ||
319 | |||
320 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); | ||
321 | if (ic->i_sends == NULL) { | ||
322 | ret = -ENOMEM; | ||
323 | rdsdebug("send allocation failed\n"); | ||
324 | goto out; | ||
325 | } | ||
326 | rds_ib_send_init_ring(ic); | ||
327 | |||
328 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); | ||
329 | if (ic->i_recvs == NULL) { | ||
330 | ret = -ENOMEM; | ||
331 | rdsdebug("recv allocation failed\n"); | ||
332 | goto out; | ||
333 | } | ||
334 | |||
335 | rds_ib_recv_init_ring(ic); | ||
336 | rds_ib_recv_init_ack(ic); | ||
337 | |||
338 | /* Post receive buffers - as a side effect, this will update | ||
339 | * the posted credit count. */ | ||
340 | rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); | ||
341 | |||
342 | rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, | ||
343 | ic->i_send_cq, ic->i_recv_cq); | ||
344 | |||
345 | out: | ||
346 | return ret; | ||
347 | } | ||
348 | |||
349 | static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp) | ||
350 | { | ||
351 | u16 common; | ||
352 | u32 version = 0; | ||
353 | |||
354 | /* rdma_cm private data is odd - when there is any private data in the | ||
355 | * request, we will be given a pretty large buffer without being told the | ||
356 | * original size. The only way to tell the difference is by looking at | ||
357 | * the contents, which are initialized to zero. | ||
358 | * If the protocol version fields aren't set, this is a connection attempt | ||
359 | * from an older version. This could be 3.0 or 2.0 - we can't tell. | ||
360 | * We really should have changed this for OFED 1.3 :-( */ | ||
361 | if (dp->dp_protocol_major == 0) | ||
362 | return RDS_PROTOCOL_3_0; | ||
363 | |||
364 | common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; | ||
365 | if (dp->dp_protocol_major == 3 && common) { | ||
366 | version = RDS_PROTOCOL_3_0; | ||
367 | while ((common >>= 1) != 0) | ||
368 | version++; | ||
369 | } else if (printk_ratelimit()) { | ||
370 | printk(KERN_NOTICE "RDS: Connection from %pI4 using " | ||
371 | "incompatible protocol version %u.%u\n", | ||
372 | &dp->dp_saddr, | ||
373 | dp->dp_protocol_major, | ||
374 | dp->dp_protocol_minor); | ||
375 | } | ||
376 | return version; | ||
377 | } | ||
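An illustrative, self-contained reduction of the loop above (assuming, as the field name suggests, that bit N of the minor mask means minor version N is supported): two peers that both advertise a mask of 0x0003 settle on the highest common minor, which is 1 here.

/* Sketch only: mirrors the while-loop in rds_ib_protocol_compatible(),
 * returning the highest minor version set in both bitmasks (assuming
 * at least one common bit, which the caller above already checks). */
static unsigned int example_highest_common_minor(u16 peer_mask, u16 local_mask)
{
	u16 common = peer_mask & local_mask;
	unsigned int minor = 0;

	while ((common >>= 1) != 0)
		minor++;

	return minor;
}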
378 | |||
379 | int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | ||
380 | struct rdma_cm_event *event) | ||
381 | { | ||
382 | __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; | ||
383 | __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; | ||
384 | const struct rds_ib_connect_private *dp = event->param.conn.private_data; | ||
385 | struct rds_ib_connect_private dp_rep; | ||
386 | struct rds_connection *conn = NULL; | ||
387 | struct rds_ib_connection *ic = NULL; | ||
388 | struct rdma_conn_param conn_param; | ||
389 | u32 version; | ||
390 | int err, destroy = 1; | ||
391 | |||
392 | /* Check whether the remote protocol version matches ours. */ | ||
393 | version = rds_ib_protocol_compatible(dp); | ||
394 | if (!version) | ||
395 | goto out; | ||
396 | |||
397 | rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " | ||
398 | "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, | ||
399 | RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), | ||
400 | (unsigned long long)be64_to_cpu(lguid), | ||
401 | (unsigned long long)be64_to_cpu(fguid)); | ||
402 | |||
403 | conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, | ||
404 | GFP_KERNEL); | ||
405 | if (IS_ERR(conn)) { | ||
406 | rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); | ||
407 | conn = NULL; | ||
408 | goto out; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * The connection request may occur while the | ||
413 | * previous connection exists, e.g. in case of failover. | ||
414 | * But as connections may be initiated simultaneously | ||
415 | * by both hosts, we have a random backoff mechanism - | ||
416 | * see the comment above rds_queue_reconnect() | ||
417 | */ | ||
418 | mutex_lock(&conn->c_cm_lock); | ||
419 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { | ||
420 | if (rds_conn_state(conn) == RDS_CONN_UP) { | ||
421 | rdsdebug("incoming connect while connecting\n"); | ||
422 | rds_conn_drop(conn); | ||
423 | rds_ib_stats_inc(s_ib_listen_closed_stale); | ||
424 | } else | ||
425 | if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { | ||
426 | /* Wait and see - our connect may still be succeeding */ | ||
427 | rds_ib_stats_inc(s_ib_connect_raced); | ||
428 | } | ||
429 | mutex_unlock(&conn->c_cm_lock); | ||
430 | goto out; | ||
431 | } | ||
432 | |||
433 | ic = conn->c_transport_data; | ||
434 | |||
435 | rds_ib_set_protocol(conn, version); | ||
436 | rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); | ||
437 | |||
438 | /* If the peer gave us the last packet it saw, process this as if | ||
439 | * we had received a regular ACK. */ | ||
440 | if (dp->dp_ack_seq) | ||
441 | rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); | ||
442 | |||
443 | BUG_ON(cm_id->context); | ||
444 | BUG_ON(ic->i_cm_id); | ||
445 | |||
446 | ic->i_cm_id = cm_id; | ||
447 | cm_id->context = conn; | ||
448 | |||
449 | /* We got halfway through setting up the ib_connection; if we | ||
450 | * fail now, we have to take the long route out of this mess. */ | ||
451 | destroy = 0; | ||
452 | |||
453 | err = rds_ib_setup_qp(conn); | ||
454 | if (err) { | ||
455 | rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); | ||
456 | goto out; | ||
457 | } | ||
458 | |||
459 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); | ||
460 | |||
461 | /* rdma_accept() calls rdma_reject() internally if it fails */ | ||
462 | err = rdma_accept(cm_id, &conn_param); | ||
463 | mutex_unlock(&conn->c_cm_lock); | ||
464 | if (err) { | ||
465 | rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); | ||
466 | goto out; | ||
467 | } | ||
468 | |||
469 | return 0; | ||
470 | |||
471 | out: | ||
472 | rdma_reject(cm_id, NULL, 0); | ||
473 | return destroy; | ||
474 | } | ||
475 | |||
476 | |||
477 | int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) | ||
478 | { | ||
479 | struct rds_connection *conn = cm_id->context; | ||
480 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
481 | struct rdma_conn_param conn_param; | ||
482 | struct rds_ib_connect_private dp; | ||
483 | int ret; | ||
484 | |||
485 | /* If the peer doesn't do protocol negotiation, we must | ||
486 | * default to RDSv3.0 */ | ||
487 | rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); | ||
488 | ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ | ||
489 | |||
490 | ret = rds_ib_setup_qp(conn); | ||
491 | if (ret) { | ||
492 | rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); | ||
493 | goto out; | ||
494 | } | ||
495 | |||
496 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); | ||
497 | |||
498 | ret = rdma_connect(cm_id, &conn_param); | ||
499 | if (ret) | ||
500 | rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); | ||
501 | |||
502 | out: | ||
503 | /* Beware - returning non-zero tells the rdma_cm to destroy | ||
504 | * the cm_id. We should certainly not do it as long as we still | ||
505 | * "own" the cm_id. */ | ||
506 | if (ret) { | ||
507 | if (ic->i_cm_id == cm_id) | ||
508 | ret = 0; | ||
509 | } | ||
510 | return ret; | ||
511 | } | ||
512 | |||
513 | int rds_ib_conn_connect(struct rds_connection *conn) | ||
514 | { | ||
515 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
516 | struct sockaddr_in src, dest; | ||
517 | int ret; | ||
518 | |||
519 | /* XXX I wonder what effect the port space has */ | ||
520 | /* delegate cm event handler to rdma_transport */ | ||
521 | ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, | ||
522 | RDMA_PS_TCP); | ||
523 | if (IS_ERR(ic->i_cm_id)) { | ||
524 | ret = PTR_ERR(ic->i_cm_id); | ||
525 | ic->i_cm_id = NULL; | ||
526 | rdsdebug("rdma_create_id() failed: %d\n", ret); | ||
527 | goto out; | ||
528 | } | ||
529 | |||
530 | rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); | ||
531 | |||
532 | src.sin_family = AF_INET; | ||
533 | src.sin_addr.s_addr = (__force u32)conn->c_laddr; | ||
534 | src.sin_port = (__force u16)htons(0); | ||
535 | |||
536 | dest.sin_family = AF_INET; | ||
537 | dest.sin_addr.s_addr = (__force u32)conn->c_faddr; | ||
538 | dest.sin_port = (__force u16)htons(RDS_PORT); | ||
539 | |||
540 | ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, | ||
541 | (struct sockaddr *)&dest, | ||
542 | RDS_RDMA_RESOLVE_TIMEOUT_MS); | ||
543 | if (ret) { | ||
544 | rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, | ||
545 | ret); | ||
546 | rdma_destroy_id(ic->i_cm_id); | ||
547 | ic->i_cm_id = NULL; | ||
548 | } | ||
549 | |||
550 | out: | ||
551 | return ret; | ||
552 | } | ||
553 | |||
554 | /* | ||
555 | * This is so careful about only cleaning up resources that were built up | ||
556 | * so that it can be called at any point during startup. In fact it | ||
557 | * can be called multiple times for a given connection. | ||
558 | */ | ||
559 | void rds_ib_conn_shutdown(struct rds_connection *conn) | ||
560 | { | ||
561 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
562 | int err = 0; | ||
563 | |||
564 | rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, | ||
565 | ic->i_pd, ic->i_send_cq, ic->i_recv_cq, | ||
566 | ic->i_cm_id ? ic->i_cm_id->qp : NULL); | ||
567 | |||
568 | if (ic->i_cm_id) { | ||
569 | struct ib_device *dev = ic->i_cm_id->device; | ||
570 | |||
571 | rdsdebug("disconnecting cm %p\n", ic->i_cm_id); | ||
572 | err = rdma_disconnect(ic->i_cm_id); | ||
573 | if (err) { | ||
574 | /* Actually this may happen quite frequently, when | ||
575 | * an outgoing connect raced with an incoming connect. | ||
576 | */ | ||
577 | rdsdebug("failed to disconnect, cm: %p err %d\n", | ||
578 | ic->i_cm_id, err); | ||
579 | } | ||
580 | |||
581 | wait_event(rds_ib_ring_empty_wait, | ||
582 | rds_ib_ring_empty(&ic->i_send_ring) && | ||
583 | rds_ib_ring_empty(&ic->i_recv_ring)); | ||
584 | |||
585 | if (ic->i_send_hdrs) | ||
586 | ib_dma_free_coherent(dev, | ||
587 | ic->i_send_ring.w_nr * | ||
588 | sizeof(struct rds_header), | ||
589 | ic->i_send_hdrs, | ||
590 | ic->i_send_hdrs_dma); | ||
591 | |||
592 | if (ic->i_recv_hdrs) | ||
593 | ib_dma_free_coherent(dev, | ||
594 | ic->i_recv_ring.w_nr * | ||
595 | sizeof(struct rds_header), | ||
596 | ic->i_recv_hdrs, | ||
597 | ic->i_recv_hdrs_dma); | ||
598 | |||
599 | if (ic->i_ack) | ||
600 | ib_dma_free_coherent(dev, sizeof(struct rds_header), | ||
601 | ic->i_ack, ic->i_ack_dma); | ||
602 | |||
603 | if (ic->i_sends) | ||
604 | rds_ib_send_clear_ring(ic); | ||
605 | if (ic->i_recvs) | ||
606 | rds_ib_recv_clear_ring(ic); | ||
607 | |||
608 | if (ic->i_cm_id->qp) | ||
609 | rdma_destroy_qp(ic->i_cm_id); | ||
610 | if (ic->i_send_cq) | ||
611 | ib_destroy_cq(ic->i_send_cq); | ||
612 | if (ic->i_recv_cq) | ||
613 | ib_destroy_cq(ic->i_recv_cq); | ||
614 | rdma_destroy_id(ic->i_cm_id); | ||
615 | |||
616 | /* | ||
617 | * Move connection back to the nodev list. | ||
618 | */ | ||
619 | if (ic->rds_ibdev) { | ||
620 | |||
621 | spin_lock_irq(&ic->rds_ibdev->spinlock); | ||
622 | BUG_ON(list_empty(&ic->ib_node)); | ||
623 | list_del(&ic->ib_node); | ||
624 | spin_unlock_irq(&ic->rds_ibdev->spinlock); | ||
625 | |||
626 | spin_lock_irq(&ib_nodev_conns_lock); | ||
627 | list_add_tail(&ic->ib_node, &ib_nodev_conns); | ||
628 | spin_unlock_irq(&ib_nodev_conns_lock); | ||
629 | ic->rds_ibdev = NULL; | ||
630 | } | ||
631 | |||
632 | ic->i_cm_id = NULL; | ||
633 | ic->i_pd = NULL; | ||
634 | ic->i_mr = NULL; | ||
635 | ic->i_send_cq = NULL; | ||
636 | ic->i_recv_cq = NULL; | ||
637 | ic->i_send_hdrs = NULL; | ||
638 | ic->i_recv_hdrs = NULL; | ||
639 | ic->i_ack = NULL; | ||
640 | } | ||
641 | BUG_ON(ic->rds_ibdev); | ||
642 | |||
643 | /* Clear pending transmit */ | ||
644 | if (ic->i_rm) { | ||
645 | rds_message_put(ic->i_rm); | ||
646 | ic->i_rm = NULL; | ||
647 | } | ||
648 | |||
649 | /* Clear the ACK state */ | ||
650 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
651 | rds_ib_set_64bit(&ic->i_ack_next, 0); | ||
652 | ic->i_ack_recv = 0; | ||
653 | |||
654 | /* Clear flow control state */ | ||
655 | ic->i_flowctl = 0; | ||
656 | atomic_set(&ic->i_credits, 0); | ||
657 | |||
658 | rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); | ||
659 | rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); | ||
660 | |||
661 | if (ic->i_ibinc) { | ||
662 | rds_inc_put(&ic->i_ibinc->ii_inc); | ||
663 | ic->i_ibinc = NULL; | ||
664 | } | ||
665 | |||
666 | vfree(ic->i_sends); | ||
667 | ic->i_sends = NULL; | ||
668 | vfree(ic->i_recvs); | ||
669 | ic->i_recvs = NULL; | ||
670 | } | ||
671 | |||
672 | int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) | ||
673 | { | ||
674 | struct rds_ib_connection *ic; | ||
675 | unsigned long flags; | ||
676 | |||
677 | /* XXX too lazy? */ | ||
678 | ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); | ||
679 | if (ic == NULL) | ||
680 | return -ENOMEM; | ||
681 | |||
682 | INIT_LIST_HEAD(&ic->ib_node); | ||
683 | mutex_init(&ic->i_recv_mutex); | ||
684 | |||
685 | /* | ||
686 | * rds_ib_conn_shutdown() waits for these to be emptied so they | ||
687 | * must be initialized before it can be called. | ||
688 | */ | ||
689 | rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); | ||
690 | rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); | ||
691 | |||
692 | ic->conn = conn; | ||
693 | conn->c_transport_data = ic; | ||
694 | |||
695 | spin_lock_irqsave(&ib_nodev_conns_lock, flags); | ||
696 | list_add_tail(&ic->ib_node, &ib_nodev_conns); | ||
697 | spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); | ||
698 | |||
699 | |||
700 | rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); | ||
701 | return 0; | ||
702 | } | ||
703 | |||
704 | void rds_ib_conn_free(void *arg) | ||
705 | { | ||
706 | struct rds_ib_connection *ic = arg; | ||
707 | rdsdebug("ic %p\n", ic); | ||
708 | list_del(&ic->ib_node); | ||
709 | kfree(ic); | ||
710 | } | ||
711 | |||
712 | |||
713 | /* | ||
714 | * An error occurred on the connection | ||
715 | */ | ||
716 | void | ||
717 | __rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) | ||
718 | { | ||
719 | va_list ap; | ||
720 | |||
721 | rds_conn_drop(conn); | ||
722 | |||
723 | va_start(ap, fmt); | ||
724 | vprintk(fmt, ap); | ||
725 | va_end(ap); | ||
726 | } | ||
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c new file mode 100644 index 000000000000..69a6289ed672 --- /dev/null +++ b/net/rds/ib_rdma.c | |||
@@ -0,0 +1,641 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | #include "rdma.h" | ||
37 | #include "ib.h" | ||
38 | |||
39 | |||
40 | /* | ||
41 | * This is stored as mr->r_trans_private. | ||
42 | */ | ||
43 | struct rds_ib_mr { | ||
44 | struct rds_ib_device *device; | ||
45 | struct rds_ib_mr_pool *pool; | ||
46 | struct ib_fmr *fmr; | ||
47 | struct list_head list; | ||
48 | unsigned int remap_count; | ||
49 | |||
50 | struct scatterlist *sg; | ||
51 | unsigned int sg_len; | ||
52 | u64 *dma; | ||
53 | int sg_dma_len; | ||
54 | }; | ||
55 | |||
56 | /* | ||
57 | * Our own little FMR pool | ||
58 | */ | ||
59 | struct rds_ib_mr_pool { | ||
60 | struct mutex flush_lock; /* serialize fmr invalidate */ | ||
61 | struct work_struct flush_worker; /* flush worker */ | ||
62 | |||
63 | spinlock_t list_lock; /* protect variables below */ | ||
64 | atomic_t item_count; /* total # of MRs */ | ||
65 | atomic_t dirty_count; /* # of dirty MRs */ | ||
66 | struct list_head drop_list; /* MRs that have reached their max_maps limit */ | ||
67 | struct list_head free_list; /* unused MRs */ | ||
68 | struct list_head clean_list; /* unused & unmapped MRs */ | ||
69 | atomic_t free_pinned; /* memory pinned by free MRs */ | ||
70 | unsigned long max_items; | ||
71 | unsigned long max_items_soft; | ||
72 | unsigned long max_free_pinned; | ||
73 | struct ib_fmr_attr fmr_attr; | ||
74 | }; | ||
75 | |||
76 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); | ||
77 | static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); | ||
78 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work); | ||
79 | |||
80 | static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) | ||
81 | { | ||
82 | struct rds_ib_device *rds_ibdev; | ||
83 | struct rds_ib_ipaddr *i_ipaddr; | ||
84 | |||
85 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { | ||
86 | spin_lock_irq(&rds_ibdev->spinlock); | ||
87 | list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { | ||
88 | if (i_ipaddr->ipaddr == ipaddr) { | ||
89 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
90 | return rds_ibdev; | ||
91 | } | ||
92 | } | ||
93 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
94 | } | ||
95 | |||
96 | return NULL; | ||
97 | } | ||
98 | |||
99 | static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | ||
100 | { | ||
101 | struct rds_ib_ipaddr *i_ipaddr; | ||
102 | |||
103 | i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); | ||
104 | if (!i_ipaddr) | ||
105 | return -ENOMEM; | ||
106 | |||
107 | i_ipaddr->ipaddr = ipaddr; | ||
108 | |||
109 | spin_lock_irq(&rds_ibdev->spinlock); | ||
110 | list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); | ||
111 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
112 | |||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | ||
117 | { | ||
118 | struct rds_ib_ipaddr *i_ipaddr, *next; | ||
119 | |||
120 | spin_lock_irq(&rds_ibdev->spinlock); | ||
121 | list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { | ||
122 | if (i_ipaddr->ipaddr == ipaddr) { | ||
123 | list_del(&i_ipaddr->list); | ||
124 | kfree(i_ipaddr); | ||
125 | break; | ||
126 | } | ||
127 | } | ||
128 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
129 | } | ||
130 | |||
131 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | ||
132 | { | ||
133 | struct rds_ib_device *rds_ibdev_old; | ||
134 | |||
135 | rds_ibdev_old = rds_ib_get_device(ipaddr); | ||
136 | if (rds_ibdev_old) | ||
137 | rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); | ||
138 | |||
139 | return rds_ib_add_ipaddr(rds_ibdev, ipaddr); | ||
140 | } | ||
141 | |||
142 | int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) | ||
143 | { | ||
144 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
145 | |||
146 | /* conn was previously on the nodev_conns_list */ | ||
147 | spin_lock_irq(&ib_nodev_conns_lock); | ||
148 | BUG_ON(list_empty(&ib_nodev_conns)); | ||
149 | BUG_ON(list_empty(&ic->ib_node)); | ||
150 | list_del(&ic->ib_node); | ||
151 | spin_unlock_irq(&ib_nodev_conns_lock); | ||
152 | |||
153 | spin_lock_irq(&rds_ibdev->spinlock); | ||
154 | list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); | ||
155 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
156 | |||
157 | ic->rds_ibdev = rds_ibdev; | ||
158 | |||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | void rds_ib_remove_nodev_conns(void) | ||
163 | { | ||
164 | struct rds_ib_connection *ic, *_ic; | ||
165 | LIST_HEAD(tmp_list); | ||
166 | |||
167 | /* avoid calling conn_destroy with irqs off */ | ||
168 | spin_lock_irq(&ib_nodev_conns_lock); | ||
169 | list_splice(&ib_nodev_conns, &tmp_list); | ||
170 | INIT_LIST_HEAD(&ib_nodev_conns); | ||
171 | spin_unlock_irq(&ib_nodev_conns_lock); | ||
172 | |||
173 | list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { | ||
174 | if (ic->conn->c_passive) | ||
175 | rds_conn_destroy(ic->conn->c_passive); | ||
176 | rds_conn_destroy(ic->conn); | ||
177 | } | ||
178 | } | ||
179 | |||
180 | void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev) | ||
181 | { | ||
182 | struct rds_ib_connection *ic, *_ic; | ||
183 | LIST_HEAD(tmp_list); | ||
184 | |||
185 | /* avoid calling conn_destroy with irqs off */ | ||
186 | spin_lock_irq(&rds_ibdev->spinlock); | ||
187 | list_splice(&rds_ibdev->conn_list, &tmp_list); | ||
188 | INIT_LIST_HEAD(&rds_ibdev->conn_list); | ||
189 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
190 | |||
191 | list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { | ||
192 | if (ic->conn->c_passive) | ||
193 | rds_conn_destroy(ic->conn->c_passive); | ||
194 | rds_conn_destroy(ic->conn); | ||
195 | } | ||
196 | } | ||
197 | |||
198 | struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) | ||
199 | { | ||
200 | struct rds_ib_mr_pool *pool; | ||
201 | |||
202 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); | ||
203 | if (!pool) | ||
204 | return ERR_PTR(-ENOMEM); | ||
205 | |||
206 | INIT_LIST_HEAD(&pool->free_list); | ||
207 | INIT_LIST_HEAD(&pool->drop_list); | ||
208 | INIT_LIST_HEAD(&pool->clean_list); | ||
209 | mutex_init(&pool->flush_lock); | ||
210 | spin_lock_init(&pool->list_lock); | ||
211 | INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); | ||
212 | |||
213 | pool->fmr_attr.max_pages = fmr_message_size; | ||
214 | pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; | ||
215 | pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift; | ||
216 | pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; | ||
217 | |||
218 | /* We never allow more than max_items MRs to be allocated. | ||
219 | * When we exceed max_items_soft, we start freeing | ||
220 | * items more aggressively. | ||
221 | * Make sure that max_items > max_items_soft > max_items / 2 | ||
222 | */ | ||
223 | pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; | ||
224 | pool->max_items = rds_ibdev->max_fmrs; | ||
225 | |||
226 | return pool; | ||
227 | } | ||
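The sizing rules spelled out in the comment above are simple integer arithmetic. The standalone sketch below (ordinary userspace C, not kernel code) plugs in an assumed driver limit of 512 FMRs and an assumed 256-page FMR message size just to show the resulting thresholds and the max_items > max_items_soft > max_items / 2 invariant the comment asks for.

/*
 * Sketch only: max_fmrs and fmr_message_size are assumed values,
 * not numbers reported by any driver.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long max_fmrs = 512;           /* assumed driver FMR limit */
	unsigned long fmr_message_size = 256;   /* assumed pages per FMR */

	unsigned long max_items       = max_fmrs;
	unsigned long max_items_soft  = max_fmrs * 3 / 4;
	unsigned long max_free_pinned = max_fmrs * fmr_message_size / 4;

	/* the invariant requested above: hard > soft > hard / 2 */
	assert(max_items > max_items_soft && max_items_soft > max_items / 2);

	printf("hard limit %lu MRs, soft limit %lu MRs, "
	       "pinned-by-free cap %lu pages\n",
	       max_items, max_items_soft, max_free_pinned);
	return 0;
}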
228 | |||
229 | void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) | ||
230 | { | ||
231 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | ||
232 | |||
233 | iinfo->rdma_mr_max = pool->max_items; | ||
234 | iinfo->rdma_mr_size = pool->fmr_attr.max_pages; | ||
235 | } | ||
236 | |||
237 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) | ||
238 | { | ||
239 | flush_workqueue(rds_wq); | ||
240 | rds_ib_flush_mr_pool(pool, 1); | ||
241 | BUG_ON(atomic_read(&pool->item_count)); | ||
242 | BUG_ON(atomic_read(&pool->free_pinned)); | ||
243 | kfree(pool); | ||
244 | } | ||
245 | |||
246 | static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) | ||
247 | { | ||
248 | struct rds_ib_mr *ibmr = NULL; | ||
249 | unsigned long flags; | ||
250 | |||
251 | spin_lock_irqsave(&pool->list_lock, flags); | ||
252 | if (!list_empty(&pool->clean_list)) { | ||
253 | ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); | ||
254 | list_del_init(&ibmr->list); | ||
255 | } | ||
256 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
257 | |||
258 | return ibmr; | ||
259 | } | ||
260 | |||
261 | static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) | ||
262 | { | ||
263 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | ||
264 | struct rds_ib_mr *ibmr = NULL; | ||
265 | int err = 0, iter = 0; | ||
266 | |||
267 | while (1) { | ||
268 | ibmr = rds_ib_reuse_fmr(pool); | ||
269 | if (ibmr) | ||
270 | return ibmr; | ||
271 | |||
272 | /* No clean MRs - now we have the choice of either | ||
273 | * allocating a fresh MR up to the limit imposed by the | ||
274 | * driver, or flushing any dirty unused MRs. | ||
275 | * We try to avoid stalling in the send path if possible, | ||
276 | * so we allocate as long as we're allowed to. | ||
277 | * | ||
278 | * We're fussy with enforcing the FMR limit, though. If the driver | ||
279 | * tells us we can't use more than N fmrs, we shouldn't start | ||
280 | * arguing with it */ | ||
281 | if (atomic_inc_return(&pool->item_count) <= pool->max_items) | ||
282 | break; | ||
283 | |||
284 | atomic_dec(&pool->item_count); | ||
285 | |||
286 | if (++iter > 2) { | ||
287 | rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); | ||
288 | return ERR_PTR(-EAGAIN); | ||
289 | } | ||
290 | |||
291 | /* We do have some empty MRs. Flush them out. */ | ||
292 | rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); | ||
293 | rds_ib_flush_mr_pool(pool, 0); | ||
294 | } | ||
295 | |||
296 | ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); | ||
297 | if (!ibmr) { | ||
298 | err = -ENOMEM; | ||
299 | goto out_no_cigar; | ||
300 | } | ||
301 | |||
302 | ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, | ||
303 | (IB_ACCESS_LOCAL_WRITE | | ||
304 | IB_ACCESS_REMOTE_READ | | ||
305 | IB_ACCESS_REMOTE_WRITE), | ||
306 | &pool->fmr_attr); | ||
307 | if (IS_ERR(ibmr->fmr)) { | ||
308 | err = PTR_ERR(ibmr->fmr); | ||
309 | ibmr->fmr = NULL; | ||
310 | printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); | ||
311 | goto out_no_cigar; | ||
312 | } | ||
313 | |||
314 | rds_ib_stats_inc(s_ib_rdma_mr_alloc); | ||
315 | return ibmr; | ||
316 | |||
317 | out_no_cigar: | ||
318 | if (ibmr) { | ||
319 | if (ibmr->fmr) | ||
320 | ib_dealloc_fmr(ibmr->fmr); | ||
321 | kfree(ibmr); | ||
322 | } | ||
323 | atomic_dec(&pool->item_count); | ||
324 | return ERR_PTR(err); | ||
325 | } | ||
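As a rough model of the allocate-or-flush loop above, the userspace sketch below reduces the pool to two counters and omits the clean-list reuse step; the limit of 4 MRs and the one-MR-per-flush reclaim are assumptions made purely for the example.

/*
 * Sketch only: the real code reuses clean MRs first and batches
 * reclaim through rds_ib_flush_mr_pool(); here a "flush" frees at
 * most one dirty slot.
 */
#include <stdio.h>

static int max_items = 4;   /* assumed driver FMR limit */
static int item_count;      /* MRs currently allocated */
static int dirty;           /* MRs a flush could reclaim */

static int alloc_mr(void)
{
	int iter = 0;

	while (1) {
		if (++item_count <= max_items)
			return 0;           /* under the hard limit: allocate */
		item_count--;

		if (++iter > 2)
			return -1;          /* give up, as with -EAGAIN */

		if (dirty > 0) {            /* "flush" reclaims a dirty MR */
			dirty--;
			item_count--;
		}
	}
}

int main(void)
{
	int i;

	item_count = max_items;             /* pool already full ... */
	dirty = 1;                          /* ... but one MR is reclaimable */

	for (i = 0; i < 2; i++)
		printf("allocation %d -> %s\n", i,
		       alloc_mr() ? "EAGAIN" : "ok");
	return 0;
}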
326 | |||
327 | static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, | ||
328 | struct scatterlist *sg, unsigned int nents) | ||
329 | { | ||
330 | struct ib_device *dev = rds_ibdev->dev; | ||
331 | struct scatterlist *scat = sg; | ||
332 | u64 io_addr = 0; | ||
333 | u64 *dma_pages; | ||
334 | u32 len; | ||
335 | int page_cnt, sg_dma_len; | ||
336 | int i, j; | ||
337 | int ret; | ||
338 | |||
339 | sg_dma_len = ib_dma_map_sg(dev, sg, nents, | ||
340 | DMA_BIDIRECTIONAL); | ||
341 | if (unlikely(!sg_dma_len)) { | ||
342 | printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); | ||
343 | return -EBUSY; | ||
344 | } | ||
345 | |||
346 | len = 0; | ||
347 | page_cnt = 0; | ||
348 | |||
349 | for (i = 0; i < sg_dma_len; ++i) { | ||
350 | unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); | ||
351 | u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); | ||
352 | |||
353 | if (dma_addr & ~rds_ibdev->fmr_page_mask) { | ||
354 | if (i > 0) | ||
355 | return -EINVAL; | ||
356 | else | ||
357 | ++page_cnt; | ||
358 | } | ||
359 | if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) { | ||
360 | if (i < sg_dma_len - 1) | ||
361 | return -EINVAL; | ||
362 | else | ||
363 | ++page_cnt; | ||
364 | } | ||
365 | |||
366 | len += dma_len; | ||
367 | } | ||
368 | |||
369 | page_cnt += len >> rds_ibdev->fmr_page_shift; | ||
370 | if (page_cnt > fmr_message_size) | ||
371 | return -EINVAL; | ||
372 | |||
373 | dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); | ||
374 | if (!dma_pages) | ||
375 | return -ENOMEM; | ||
376 | |||
377 | page_cnt = 0; | ||
378 | for (i = 0; i < sg_dma_len; ++i) { | ||
379 | unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); | ||
380 | u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); | ||
381 | |||
382 | for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size) | ||
383 | dma_pages[page_cnt++] = | ||
384 | (dma_addr & rds_ibdev->fmr_page_mask) + j; | ||
385 | } | ||
386 | |||
387 | ret = ib_map_phys_fmr(ibmr->fmr, | ||
388 | dma_pages, page_cnt, io_addr); | ||
389 | if (ret) | ||
390 | goto out; | ||
391 | |||
392 | /* Success - we successfully remapped the MR, so we can | ||
393 | * safely tear down the old mapping. */ | ||
394 | rds_ib_teardown_mr(ibmr); | ||
395 | |||
396 | ibmr->sg = scat; | ||
397 | ibmr->sg_len = nents; | ||
398 | ibmr->sg_dma_len = sg_dma_len; | ||
399 | ibmr->remap_count++; | ||
400 | |||
401 | rds_ib_stats_inc(s_ib_rdma_mr_used); | ||
402 | ret = 0; | ||
403 | |||
404 | out: | ||
405 | kfree(dma_pages); | ||
406 | |||
407 | return ret; | ||
408 | } | ||
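The page-counting half of this function is plain address arithmetic and can be modelled outside the kernel. The sketch below repeats the same checks (an unaligned start is allowed only on the first segment, an unaligned end only on the last) followed by the len >> page_shift step; the two-segment layout and the 12-bit page shift are made-up inputs, not values from an HCA.

/*
 * Sketch only: segment addresses/lengths and the page shift are
 * invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

struct seg { uint64_t addr; uint32_t len; };

static int count_fmr_pages(const struct seg *s, int n, unsigned int shift)
{
	uint64_t page_mask = ~((1ULL << shift) - 1);
	uint64_t total_len = 0;
	int pages = 0, i;

	for (i = 0; i < n; i++) {
		uint64_t end = s[i].addr + s[i].len;

		if (s[i].addr & ~page_mask) {   /* unaligned start */
			if (i > 0)
				return -1;      /* only allowed on first seg */
			pages++;
		}
		if (end & ~page_mask) {         /* unaligned end */
			if (i < n - 1)
				return -1;      /* only allowed on last seg */
			pages++;
		}
		total_len += s[i].len;
	}
	return pages + (int)(total_len >> shift);
}

int main(void)
{
	struct seg sg[2] = {
		{ 0x10000800, 0x0800 },         /* starts mid-page */
		{ 0x20000000, 0x2000 },         /* fully aligned */
	};

	printf("FMR pages needed: %d\n", count_fmr_pages(sg, 2, 12));
	return 0;
}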
409 | |||
410 | void rds_ib_sync_mr(void *trans_private, int direction) | ||
411 | { | ||
412 | struct rds_ib_mr *ibmr = trans_private; | ||
413 | struct rds_ib_device *rds_ibdev = ibmr->device; | ||
414 | |||
415 | switch (direction) { | ||
416 | case DMA_FROM_DEVICE: | ||
417 | ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, | ||
418 | ibmr->sg_dma_len, DMA_BIDIRECTIONAL); | ||
419 | break; | ||
420 | case DMA_TO_DEVICE: | ||
421 | ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, | ||
422 | ibmr->sg_dma_len, DMA_BIDIRECTIONAL); | ||
423 | break; | ||
424 | } | ||
425 | } | ||
426 | |||
427 | static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) | ||
428 | { | ||
429 | struct rds_ib_device *rds_ibdev = ibmr->device; | ||
430 | |||
431 | if (ibmr->sg_dma_len) { | ||
432 | ib_dma_unmap_sg(rds_ibdev->dev, | ||
433 | ibmr->sg, ibmr->sg_len, | ||
434 | DMA_BIDIRECTIONAL); | ||
435 | ibmr->sg_dma_len = 0; | ||
436 | } | ||
437 | |||
438 | /* Release the s/g list */ | ||
439 | if (ibmr->sg_len) { | ||
440 | unsigned int i; | ||
441 | |||
442 | for (i = 0; i < ibmr->sg_len; ++i) { | ||
443 | struct page *page = sg_page(&ibmr->sg[i]); | ||
444 | |||
445 | /* FIXME we need a way to tell a r/w MR | ||
446 | * from a r/o MR */ | ||
447 | set_page_dirty(page); | ||
448 | put_page(page); | ||
449 | } | ||
450 | kfree(ibmr->sg); | ||
451 | |||
452 | ibmr->sg = NULL; | ||
453 | ibmr->sg_len = 0; | ||
454 | } | ||
455 | } | ||
456 | |||
457 | static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) | ||
458 | { | ||
459 | unsigned int pinned = ibmr->sg_len; | ||
460 | |||
461 | __rds_ib_teardown_mr(ibmr); | ||
462 | if (pinned) { | ||
463 | struct rds_ib_device *rds_ibdev = ibmr->device; | ||
464 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | ||
465 | |||
466 | atomic_sub(pinned, &pool->free_pinned); | ||
467 | } | ||
468 | } | ||
469 | |||
470 | static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) | ||
471 | { | ||
472 | unsigned int item_count; | ||
473 | |||
474 | item_count = atomic_read(&pool->item_count); | ||
475 | if (free_all) | ||
476 | return item_count; | ||
477 | |||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * Flush our pool of MRs. | ||
483 | * At a minimum, all currently unused MRs are unmapped. | ||
484 | * If the number of MRs allocated exceeds the limit, we also try | ||
485 | * to free as many MRs as needed to get back to this limit. | ||
486 | */ | ||
487 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | ||
488 | { | ||
489 | struct rds_ib_mr *ibmr, *next; | ||
490 | LIST_HEAD(unmap_list); | ||
491 | LIST_HEAD(fmr_list); | ||
492 | unsigned long unpinned = 0; | ||
493 | unsigned long flags; | ||
494 | unsigned int nfreed = 0, ncleaned = 0, free_goal; | ||
495 | int ret = 0; | ||
496 | |||
497 | rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); | ||
498 | |||
499 | mutex_lock(&pool->flush_lock); | ||
500 | |||
501 | spin_lock_irqsave(&pool->list_lock, flags); | ||
502 | /* Get the list of all MRs to be dropped. Ordering matters - | ||
503 | * we want to put drop_list ahead of free_list. */ | ||
504 | list_splice_init(&pool->free_list, &unmap_list); | ||
505 | list_splice_init(&pool->drop_list, &unmap_list); | ||
506 | if (free_all) | ||
507 | list_splice_init(&pool->clean_list, &unmap_list); | ||
508 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
509 | |||
510 | free_goal = rds_ib_flush_goal(pool, free_all); | ||
511 | |||
512 | if (list_empty(&unmap_list)) | ||
513 | goto out; | ||
514 | |||
515 | /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ | ||
516 | list_for_each_entry(ibmr, &unmap_list, list) | ||
517 | list_add(&ibmr->fmr->list, &fmr_list); | ||
518 | ret = ib_unmap_fmr(&fmr_list); | ||
519 | if (ret) | ||
520 | printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); | ||
521 | |||
522 | /* Now we can destroy the DMA mapping and unpin any pages */ | ||
523 | list_for_each_entry_safe(ibmr, next, &unmap_list, list) { | ||
524 | unpinned += ibmr->sg_len; | ||
525 | __rds_ib_teardown_mr(ibmr); | ||
526 | if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { | ||
527 | rds_ib_stats_inc(s_ib_rdma_mr_free); | ||
528 | list_del(&ibmr->list); | ||
529 | ib_dealloc_fmr(ibmr->fmr); | ||
530 | kfree(ibmr); | ||
531 | nfreed++; | ||
532 | } | ||
533 | ncleaned++; | ||
534 | } | ||
535 | |||
536 | spin_lock_irqsave(&pool->list_lock, flags); | ||
537 | list_splice(&unmap_list, &pool->clean_list); | ||
538 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
539 | |||
540 | atomic_sub(unpinned, &pool->free_pinned); | ||
541 | atomic_sub(ncleaned, &pool->dirty_count); | ||
542 | atomic_sub(nfreed, &pool->item_count); | ||
543 | |||
544 | out: | ||
545 | mutex_unlock(&pool->flush_lock); | ||
546 | return ret; | ||
547 | } | ||
548 | |||
549 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work) | ||
550 | { | ||
551 | struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); | ||
552 | |||
553 | rds_ib_flush_mr_pool(pool, 0); | ||
554 | } | ||
555 | |||
556 | void rds_ib_free_mr(void *trans_private, int invalidate) | ||
557 | { | ||
558 | struct rds_ib_mr *ibmr = trans_private; | ||
559 | struct rds_ib_device *rds_ibdev = ibmr->device; | ||
560 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | ||
561 | unsigned long flags; | ||
562 | |||
563 | rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); | ||
564 | |||
565 | /* Return it to the pool's free list */ | ||
566 | spin_lock_irqsave(&pool->list_lock, flags); | ||
567 | if (ibmr->remap_count >= pool->fmr_attr.max_maps) | ||
568 | list_add(&ibmr->list, &pool->drop_list); | ||
569 | else | ||
570 | list_add(&ibmr->list, &pool->free_list); | ||
571 | |||
572 | atomic_add(ibmr->sg_len, &pool->free_pinned); | ||
573 | atomic_inc(&pool->dirty_count); | ||
574 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
575 | |||
576 | /* If we've pinned too many pages, request a flush */ | ||
577 | if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned | ||
578 | || atomic_read(&pool->dirty_count) >= pool->max_items / 10) | ||
579 | queue_work(rds_wq, &pool->flush_worker); | ||
580 | |||
581 | if (invalidate) { | ||
582 | if (likely(!in_interrupt())) { | ||
583 | rds_ib_flush_mr_pool(pool, 0); | ||
584 | } else { | ||
585 | /* We get here if the user created a MR marked | ||
586 | * as use_once and invalidate at the same time. */ | ||
587 | queue_work(rds_wq, &pool->flush_worker); | ||
588 | } | ||
589 | } | ||
590 | } | ||
591 | |||
592 | void rds_ib_flush_mrs(void) | ||
593 | { | ||
594 | struct rds_ib_device *rds_ibdev; | ||
595 | |||
596 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { | ||
597 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | ||
598 | |||
599 | if (pool) | ||
600 | rds_ib_flush_mr_pool(pool, 0); | ||
601 | } | ||
602 | } | ||
603 | |||
604 | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | ||
605 | struct rds_sock *rs, u32 *key_ret) | ||
606 | { | ||
607 | struct rds_ib_device *rds_ibdev; | ||
608 | struct rds_ib_mr *ibmr = NULL; | ||
609 | int ret; | ||
610 | |||
611 | rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); | ||
612 | if (!rds_ibdev) { | ||
613 | ret = -ENODEV; | ||
614 | goto out; | ||
615 | } | ||
616 | |||
617 | if (!rds_ibdev->mr_pool) { | ||
618 | ret = -ENODEV; | ||
619 | goto out; | ||
620 | } | ||
621 | |||
622 | ibmr = rds_ib_alloc_fmr(rds_ibdev); | ||
623 | if (IS_ERR(ibmr)) | ||
624 | return ibmr; | ||
625 | |||
626 | ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); | ||
627 | if (ret == 0) | ||
628 | *key_ret = ibmr->fmr->rkey; | ||
629 | else | ||
630 | printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); | ||
631 | |||
632 | ibmr->device = rds_ibdev; | ||
633 | |||
634 | out: | ||
635 | if (ret) { | ||
636 | if (ibmr) | ||
637 | rds_ib_free_mr(ibmr, 0); | ||
638 | ibmr = ERR_PTR(ret); | ||
639 | } | ||
640 | return ibmr; | ||
641 | } | ||
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c new file mode 100644 index 000000000000..5061b5502162 --- /dev/null +++ b/net/rds/ib_recv.c | |||
@@ -0,0 +1,869 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/pci.h> | ||
35 | #include <linux/dma-mapping.h> | ||
36 | #include <rdma/rdma_cm.h> | ||
37 | |||
38 | #include "rds.h" | ||
39 | #include "ib.h" | ||
40 | |||
41 | static struct kmem_cache *rds_ib_incoming_slab; | ||
42 | static struct kmem_cache *rds_ib_frag_slab; | ||
43 | static atomic_t rds_ib_allocation = ATOMIC_INIT(0); | ||
44 | |||
45 | static void rds_ib_frag_drop_page(struct rds_page_frag *frag) | ||
46 | { | ||
47 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
48 | __free_page(frag->f_page); | ||
49 | frag->f_page = NULL; | ||
50 | } | ||
51 | |||
52 | static void rds_ib_frag_free(struct rds_page_frag *frag) | ||
53 | { | ||
54 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
55 | BUG_ON(frag->f_page != NULL); | ||
56 | kmem_cache_free(rds_ib_frag_slab, frag); | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * We map a page at a time. Its fragments are posted in order. This | ||
61 | * is called in fragment order as the fragments get send completion events. | ||
62 | * Only the last frag in the page performs the unmapping. | ||
63 | * | ||
64 | * It's OK for ring cleanup to call this in whatever order it likes because | ||
65 | * DMA is not in flight and so we can unmap while other ring entries still | ||
66 | * hold page references in their frags. | ||
67 | */ | ||
68 | static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, | ||
69 | struct rds_ib_recv_work *recv) | ||
70 | { | ||
71 | struct rds_page_frag *frag = recv->r_frag; | ||
72 | |||
73 | rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); | ||
74 | if (frag->f_mapped) | ||
75 | ib_dma_unmap_page(ic->i_cm_id->device, | ||
76 | frag->f_mapped, | ||
77 | RDS_FRAG_SIZE, DMA_FROM_DEVICE); | ||
78 | frag->f_mapped = 0; | ||
79 | } | ||
80 | |||
81 | void rds_ib_recv_init_ring(struct rds_ib_connection *ic) | ||
82 | { | ||
83 | struct rds_ib_recv_work *recv; | ||
84 | u32 i; | ||
85 | |||
86 | for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { | ||
87 | struct ib_sge *sge; | ||
88 | |||
89 | recv->r_ibinc = NULL; | ||
90 | recv->r_frag = NULL; | ||
91 | |||
92 | recv->r_wr.next = NULL; | ||
93 | recv->r_wr.wr_id = i; | ||
94 | recv->r_wr.sg_list = recv->r_sge; | ||
95 | recv->r_wr.num_sge = RDS_IB_RECV_SGE; | ||
96 | |||
97 | sge = rds_ib_data_sge(ic, recv->r_sge); | ||
98 | sge->addr = 0; | ||
99 | sge->length = RDS_FRAG_SIZE; | ||
100 | sge->lkey = ic->i_mr->lkey; | ||
101 | |||
102 | sge = rds_ib_header_sge(ic, recv->r_sge); | ||
103 | sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); | ||
104 | sge->length = sizeof(struct rds_header); | ||
105 | sge->lkey = ic->i_mr->lkey; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, | ||
110 | struct rds_ib_recv_work *recv) | ||
111 | { | ||
112 | if (recv->r_ibinc) { | ||
113 | rds_inc_put(&recv->r_ibinc->ii_inc); | ||
114 | recv->r_ibinc = NULL; | ||
115 | } | ||
116 | if (recv->r_frag) { | ||
117 | rds_ib_recv_unmap_page(ic, recv); | ||
118 | if (recv->r_frag->f_page) | ||
119 | rds_ib_frag_drop_page(recv->r_frag); | ||
120 | rds_ib_frag_free(recv->r_frag); | ||
121 | recv->r_frag = NULL; | ||
122 | } | ||
123 | } | ||
124 | |||
125 | void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) | ||
126 | { | ||
127 | u32 i; | ||
128 | |||
129 | for (i = 0; i < ic->i_recv_ring.w_nr; i++) | ||
130 | rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); | ||
131 | |||
132 | if (ic->i_frag.f_page) | ||
133 | rds_ib_frag_drop_page(&ic->i_frag); | ||
134 | } | ||
135 | |||
136 | static int rds_ib_recv_refill_one(struct rds_connection *conn, | ||
137 | struct rds_ib_recv_work *recv, | ||
138 | gfp_t kptr_gfp, gfp_t page_gfp) | ||
139 | { | ||
140 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
141 | dma_addr_t dma_addr; | ||
142 | struct ib_sge *sge; | ||
143 | int ret = -ENOMEM; | ||
144 | |||
145 | if (recv->r_ibinc == NULL) { | ||
146 | if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) { | ||
147 | rds_ib_stats_inc(s_ib_rx_alloc_limit); | ||
148 | goto out; | ||
149 | } | ||
150 | recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, | ||
151 | kptr_gfp); | ||
152 | if (recv->r_ibinc == NULL) | ||
153 | goto out; | ||
154 | atomic_inc(&rds_ib_allocation); | ||
155 | INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); | ||
156 | rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); | ||
157 | } | ||
158 | |||
159 | if (recv->r_frag == NULL) { | ||
160 | recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); | ||
161 | if (recv->r_frag == NULL) | ||
162 | goto out; | ||
163 | INIT_LIST_HEAD(&recv->r_frag->f_item); | ||
164 | recv->r_frag->f_page = NULL; | ||
165 | } | ||
166 | |||
167 | if (ic->i_frag.f_page == NULL) { | ||
168 | ic->i_frag.f_page = alloc_page(page_gfp); | ||
169 | if (ic->i_frag.f_page == NULL) | ||
170 | goto out; | ||
171 | ic->i_frag.f_offset = 0; | ||
172 | } | ||
173 | |||
174 | dma_addr = ib_dma_map_page(ic->i_cm_id->device, | ||
175 | ic->i_frag.f_page, | ||
176 | ic->i_frag.f_offset, | ||
177 | RDS_FRAG_SIZE, | ||
178 | DMA_FROM_DEVICE); | ||
179 | if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) | ||
180 | goto out; | ||
181 | |||
182 | /* | ||
183 | * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() | ||
184 | * must be called on this recv. This happens as completions hit | ||
185 | * in order or on connection shutdown. | ||
186 | */ | ||
187 | recv->r_frag->f_page = ic->i_frag.f_page; | ||
188 | recv->r_frag->f_offset = ic->i_frag.f_offset; | ||
189 | recv->r_frag->f_mapped = dma_addr; | ||
190 | |||
191 | sge = rds_ib_data_sge(ic, recv->r_sge); | ||
192 | sge->addr = dma_addr; | ||
193 | sge->length = RDS_FRAG_SIZE; | ||
194 | |||
195 | sge = rds_ib_header_sge(ic, recv->r_sge); | ||
196 | sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); | ||
197 | sge->length = sizeof(struct rds_header); | ||
198 | |||
199 | get_page(recv->r_frag->f_page); | ||
200 | |||
201 | if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { | ||
202 | ic->i_frag.f_offset += RDS_FRAG_SIZE; | ||
203 | } else { | ||
204 | put_page(ic->i_frag.f_page); | ||
205 | ic->i_frag.f_page = NULL; | ||
206 | ic->i_frag.f_offset = 0; | ||
207 | } | ||
208 | |||
209 | ret = 0; | ||
210 | out: | ||
211 | return ret; | ||
212 | } | ||
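To make the fragment carving at the end of this function easier to follow, here is a tiny standalone model of how i_frag.f_offset walks across one page in RDS_FRAG_SIZE steps and how the connection drops its own page reference once the last fragment has been handed out. The 16K page is assumed purely so the carving is visible; with 4K pages each fragment gets a whole page.

/*
 * Sketch only: SIM_PAGE_SIZE / SIM_FRAG_SIZE are stand-ins for
 * PAGE_SIZE and RDS_FRAG_SIZE, chosen so one page yields 4 frags.
 */
#include <stdio.h>

#define SIM_PAGE_SIZE 16384u
#define SIM_FRAG_SIZE 4096u
#define SIM_LAST_OFF  (SIM_PAGE_SIZE - SIM_FRAG_SIZE)

int main(void)
{
	unsigned int off = 0;
	int frag;

	for (frag = 0; frag < 4; frag++) {
		printf("frag %d maps page offset %u\n", frag, off);
		if (off < SIM_LAST_OFF) {
			off += SIM_FRAG_SIZE;   /* keep carving this page */
		} else {
			/* like put_page(): the carving reference is dropped */
			printf("last fragment, connection drops its page ref\n");
			off = 0;                /* next refill grabs a new page */
		}
	}
	return 0;
}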
213 | |||
214 | /* | ||
215 | * This tries to allocate and post unused work requests after making sure that | ||
216 | * they have all the allocations they need to queue received fragments into | ||
217 | * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc | ||
218 | * pairs don't go unmatched. | ||
219 | * | ||
220 | * -1 is returned if posting fails due to temporary resource exhaustion. | ||
221 | */ | ||
222 | int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | ||
223 | gfp_t page_gfp, int prefill) | ||
224 | { | ||
225 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
226 | struct rds_ib_recv_work *recv; | ||
227 | struct ib_recv_wr *failed_wr; | ||
228 | unsigned int posted = 0; | ||
229 | int ret = 0; | ||
230 | u32 pos; | ||
231 | |||
232 | while ((prefill || rds_conn_up(conn)) | ||
233 | && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { | ||
234 | if (pos >= ic->i_recv_ring.w_nr) { | ||
235 | printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", | ||
236 | pos); | ||
237 | ret = -EINVAL; | ||
238 | break; | ||
239 | } | ||
240 | |||
241 | recv = &ic->i_recvs[pos]; | ||
242 | ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); | ||
243 | if (ret) { | ||
244 | ret = -1; | ||
245 | break; | ||
246 | } | ||
247 | |||
248 | /* XXX when can this fail? */ | ||
249 | ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); | ||
250 | rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, | ||
251 | recv->r_ibinc, recv->r_frag->f_page, | ||
252 | (long) recv->r_frag->f_mapped, ret); | ||
253 | if (ret) { | ||
254 | rds_ib_conn_error(conn, "recv post on " | ||
255 | "%pI4 returned %d, disconnecting and " | ||
256 | "reconnecting\n", &conn->c_faddr, | ||
257 | ret); | ||
258 | ret = -1; | ||
259 | break; | ||
260 | } | ||
261 | |||
262 | posted++; | ||
263 | } | ||
264 | |||
265 | /* We're doing flow control - update the window. */ | ||
266 | if (ic->i_flowctl && posted) | ||
267 | rds_ib_advertise_credits(conn, posted); | ||
268 | |||
269 | if (ret) | ||
270 | rds_ib_ring_unalloc(&ic->i_recv_ring, 1); | ||
271 | return ret; | ||
272 | } | ||
273 | |||
274 | void rds_ib_inc_purge(struct rds_incoming *inc) | ||
275 | { | ||
276 | struct rds_ib_incoming *ibinc; | ||
277 | struct rds_page_frag *frag; | ||
278 | struct rds_page_frag *pos; | ||
279 | |||
280 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | ||
281 | rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); | ||
282 | |||
283 | list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { | ||
284 | list_del_init(&frag->f_item); | ||
285 | rds_ib_frag_drop_page(frag); | ||
286 | rds_ib_frag_free(frag); | ||
287 | } | ||
288 | } | ||
289 | |||
290 | void rds_ib_inc_free(struct rds_incoming *inc) | ||
291 | { | ||
292 | struct rds_ib_incoming *ibinc; | ||
293 | |||
294 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | ||
295 | |||
296 | rds_ib_inc_purge(inc); | ||
297 | rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); | ||
298 | BUG_ON(!list_empty(&ibinc->ii_frags)); | ||
299 | kmem_cache_free(rds_ib_incoming_slab, ibinc); | ||
300 | atomic_dec(&rds_ib_allocation); | ||
301 | BUG_ON(atomic_read(&rds_ib_allocation) < 0); | ||
302 | } | ||
303 | |||
304 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | ||
305 | size_t size) | ||
306 | { | ||
307 | struct rds_ib_incoming *ibinc; | ||
308 | struct rds_page_frag *frag; | ||
309 | struct iovec *iov = first_iov; | ||
310 | unsigned long to_copy; | ||
311 | unsigned long frag_off = 0; | ||
312 | unsigned long iov_off = 0; | ||
313 | int copied = 0; | ||
314 | int ret; | ||
315 | u32 len; | ||
316 | |||
317 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | ||
318 | frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); | ||
319 | len = be32_to_cpu(inc->i_hdr.h_len); | ||
320 | |||
321 | while (copied < size && copied < len) { | ||
322 | if (frag_off == RDS_FRAG_SIZE) { | ||
323 | frag = list_entry(frag->f_item.next, | ||
324 | struct rds_page_frag, f_item); | ||
325 | frag_off = 0; | ||
326 | } | ||
327 | while (iov_off == iov->iov_len) { | ||
328 | iov_off = 0; | ||
329 | iov++; | ||
330 | } | ||
331 | |||
332 | to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); | ||
333 | to_copy = min_t(size_t, to_copy, size - copied); | ||
334 | to_copy = min_t(unsigned long, to_copy, len - copied); | ||
335 | |||
336 | rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " | ||
337 | "[%p, %lu] + %lu\n", | ||
338 | to_copy, iov->iov_base, iov->iov_len, iov_off, | ||
339 | frag->f_page, frag->f_offset, frag_off); | ||
340 | |||
341 | /* XXX needs + offset for multiple recvs per page */ | ||
342 | ret = rds_page_copy_to_user(frag->f_page, | ||
343 | frag->f_offset + frag_off, | ||
344 | iov->iov_base + iov_off, | ||
345 | to_copy); | ||
346 | if (ret) { | ||
347 | copied = ret; | ||
348 | break; | ||
349 | } | ||
350 | |||
351 | iov_off += to_copy; | ||
352 | frag_off += to_copy; | ||
353 | copied += to_copy; | ||
354 | } | ||
355 | |||
356 | return copied; | ||
357 | } | ||
358 | |||
359 | /* ic starts out kzalloc()ed */ | ||
360 | void rds_ib_recv_init_ack(struct rds_ib_connection *ic) | ||
361 | { | ||
362 | struct ib_send_wr *wr = &ic->i_ack_wr; | ||
363 | struct ib_sge *sge = &ic->i_ack_sge; | ||
364 | |||
365 | sge->addr = ic->i_ack_dma; | ||
366 | sge->length = sizeof(struct rds_header); | ||
367 | sge->lkey = ic->i_mr->lkey; | ||
368 | |||
369 | wr->sg_list = sge; | ||
370 | wr->num_sge = 1; | ||
371 | wr->opcode = IB_WR_SEND; | ||
372 | wr->wr_id = RDS_IB_ACK_WR_ID; | ||
373 | wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
374 | } | ||
375 | |||
376 | /* | ||
377 | * You'd think that with reliable IB connections you wouldn't need to ack | ||
378 | * messages that have been received. The problem is that IB hardware generates | ||
379 | * an ack message before it has DMAed the message into memory. This creates a | ||
380 | * potential message loss if the HCA is disabled for any reason between when it | ||
381 | * sends the ack and before the message is DMAed and processed. This is only a | ||
382 | * potential issue if another HCA is available for fail-over. | ||
383 | * | ||
384 | * When the remote host receives our ack they'll free the sent message from | ||
385 | * their send queue. To decrease the latency of this we always send an ack | ||
386 | * immediately after we've received messages. | ||
387 | * | ||
388 | * For simplicity, we only have one ack in flight at a time. This puts | ||
389 | * pressure on senders to have deep enough send queues to absorb the latency of | ||
390 | * a single ack frame being in flight. This might not be good enough. | ||
391 | * | ||
392 | * This is implemented by having a long-lived send_wr and sge which point to a | ||
393 | * statically allocated ack frame. This ack wr does not fall under the ring | ||
394 | * accounting that the tx and rx wrs do. The QP attribute specifically makes | ||
395 | * room for it beyond the ring size. Send completion notices its special | ||
396 | * wr_id and avoids working with the ring in that case. | ||
397 | */ | ||
398 | static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, | ||
399 | int ack_required) | ||
400 | { | ||
401 | rds_ib_set_64bit(&ic->i_ack_next, seq); | ||
402 | if (ack_required) { | ||
403 | smp_mb__before_clear_bit(); | ||
404 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
405 | } | ||
406 | } | ||
407 | |||
408 | static u64 rds_ib_get_ack(struct rds_ib_connection *ic) | ||
409 | { | ||
410 | clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
411 | smp_mb__after_clear_bit(); | ||
412 | |||
413 | return ic->i_ack_next; | ||
414 | } | ||
415 | |||
416 | static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) | ||
417 | { | ||
418 | struct rds_header *hdr = ic->i_ack; | ||
419 | struct ib_send_wr *failed_wr; | ||
420 | u64 seq; | ||
421 | int ret; | ||
422 | |||
423 | seq = rds_ib_get_ack(ic); | ||
424 | |||
425 | rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); | ||
426 | rds_message_populate_header(hdr, 0, 0, 0); | ||
427 | hdr->h_ack = cpu_to_be64(seq); | ||
428 | hdr->h_credit = adv_credits; | ||
429 | rds_message_make_checksum(hdr); | ||
430 | ic->i_ack_queued = jiffies; | ||
431 | |||
432 | ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); | ||
433 | if (unlikely(ret)) { | ||
434 | /* Failed to send. Release the WR, and | ||
435 | * force another ACK. | ||
436 | */ | ||
437 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
438 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
439 | |||
440 | rds_ib_stats_inc(s_ib_ack_send_failure); | ||
441 | /* Need to finesse this later. */ | ||
442 | BUG(); | ||
443 | } else | ||
444 | rds_ib_stats_inc(s_ib_ack_sent); | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * There are 3 ways of getting acknowledgements to the peer: | ||
449 | * 1. We call rds_ib_attempt_ack from the recv completion handler | ||
450 | * to send an ACK-only frame. | ||
451 | * However, there can be only one such frame in the send queue | ||
452 | * at any time, so we may have to postpone it. | ||
453 | * 2. When another (data) packet is transmitted while there's | ||
454 | * an ACK in the queue, we piggyback the ACK sequence number | ||
455 | * on the data packet. | ||
456 | * 3. If the ACK WR is done sending, we get called from the | ||
457 | * send queue completion handler, and check whether there's | ||
458 | * another ACK pending (postponed because the WR was on the | ||
459 | * queue). If so, we transmit it. | ||
460 | * | ||
461 | * We maintain 2 variables: | ||
462 | * - i_ack_flags, which keeps track of whether the ACK WR | ||
463 | * is currently in the send queue or not (IB_ACK_IN_FLIGHT) | ||
464 | * - i_ack_next, which is the last sequence number we received | ||
465 | * | ||
466 | * Potentially, send queue and receive queue handlers can run concurrently. | ||
467 | * | ||
468 | * Reconnecting complicates this picture just slightly. When we | ||
469 | * reconnect, we may be seeing duplicate packets. The peer | ||
470 | * is retransmitting them, because it hasn't seen an ACK for | ||
471 | * them. It is important that we ACK these. | ||
472 | * | ||
473 | * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with | ||
474 | * this flag set *MUST* be acknowledged immediately. | ||
475 | */ | ||
476 | |||
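A minimal single-threaded model of the two flags described above is sketched below. It leaves out the send-credit check and the atomic bitops the real code depends on, and only shows how IB_ACK_REQUESTED and IB_ACK_IN_FLIGHT interact across the attempt/send-complete path.

/*
 * Sketch only: plain variables stand in for the atomic i_ack_flags
 * bits; nothing here is safe against concurrent CQ handlers.
 */
#include <stdio.h>

#define ACK_REQUESTED (1u << 0)
#define ACK_IN_FLIGHT (1u << 1)

static unsigned int flags;
static unsigned long long ack_next;

static void got_packet(unsigned long long seq, int ack_required)
{
	ack_next = seq;
	if (ack_required)
		flags |= ACK_REQUESTED;
}

static void attempt_ack(void)
{
	if (!(flags & ACK_REQUESTED))
		return;
	if (flags & ACK_IN_FLIGHT) {
		printf("ACK postponed: one is already on the send queue\n");
		return;
	}
	flags |= ACK_IN_FLIGHT;
	flags &= ~ACK_REQUESTED;
	printf("posting ACK-only frame for seq %llu\n", ack_next);
}

static void ack_send_complete(void)
{
	flags &= ~ACK_IN_FLIGHT;
	attempt_ack();          /* a postponed ACK may go out now */
}

int main(void)
{
	got_packet(1, 1);
	attempt_ack();          /* sends an ACK for seq 1 */
	got_packet(2, 1);
	attempt_ack();          /* postponed, seq 1 ACK still in flight */
	ack_send_complete();    /* now the ACK for seq 2 is posted */
	return 0;
}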
477 | /* | ||
478 | * When we get here, we're called from the recv queue handler. | ||
479 | * Check whether we ought to transmit an ACK. | ||
480 | */ | ||
481 | void rds_ib_attempt_ack(struct rds_ib_connection *ic) | ||
482 | { | ||
483 | unsigned int adv_credits; | ||
484 | |||
485 | if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) | ||
486 | return; | ||
487 | |||
488 | if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { | ||
489 | rds_ib_stats_inc(s_ib_ack_send_delayed); | ||
490 | return; | ||
491 | } | ||
492 | |||
493 | /* Can we get a send credit? */ | ||
494 | if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) { | ||
495 | rds_ib_stats_inc(s_ib_tx_throttle); | ||
496 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
497 | return; | ||
498 | } | ||
499 | |||
500 | clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
501 | rds_ib_send_ack(ic, adv_credits); | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * We get here from the send completion handler, when the | ||
506 | * adapter tells us the ACK frame was sent. | ||
507 | */ | ||
508 | void rds_ib_ack_send_complete(struct rds_ib_connection *ic) | ||
509 | { | ||
510 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
511 | rds_ib_attempt_ack(ic); | ||
512 | } | ||
513 | |||
514 | /* | ||
515 | * This is called by the regular xmit code when it wants to piggyback | ||
516 | * an ACK on an outgoing frame. | ||
517 | */ | ||
518 | u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) | ||
519 | { | ||
520 | if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) | ||
521 | rds_ib_stats_inc(s_ib_ack_send_piggybacked); | ||
522 | return rds_ib_get_ack(ic); | ||
523 | } | ||
524 | |||
525 | /* | ||
526 | * It's kind of lame that we're copying from the posted receive pages into | ||
527 | * long-lived bitmaps. We could have posted the bitmaps and rdma written into | ||
528 | * them. But receiving new congestion bitmaps should be a *rare* event, so | ||
529 | * hopefully we won't need to invest that complexity in making it more | ||
530 | * efficient. By copying we can share a simpler core with TCP which has to | ||
531 | * copy. | ||
532 | */ | ||
533 | static void rds_ib_cong_recv(struct rds_connection *conn, | ||
534 | struct rds_ib_incoming *ibinc) | ||
535 | { | ||
536 | struct rds_cong_map *map; | ||
537 | unsigned int map_off; | ||
538 | unsigned int map_page; | ||
539 | struct rds_page_frag *frag; | ||
540 | unsigned long frag_off; | ||
541 | unsigned long to_copy; | ||
542 | unsigned long copied; | ||
543 | uint64_t uncongested = 0; | ||
544 | void *addr; | ||
545 | |||
546 | /* catch completely corrupt packets */ | ||
547 | if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) | ||
548 | return; | ||
549 | |||
550 | map = conn->c_fcong; | ||
551 | map_page = 0; | ||
552 | map_off = 0; | ||
553 | |||
554 | frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); | ||
555 | frag_off = 0; | ||
556 | |||
557 | copied = 0; | ||
558 | |||
559 | while (copied < RDS_CONG_MAP_BYTES) { | ||
560 | uint64_t *src, *dst; | ||
561 | unsigned int k; | ||
562 | |||
563 | to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); | ||
564 | BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ | ||
565 | |||
566 | addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); | ||
567 | |||
568 | src = addr + frag_off; | ||
569 | dst = (void *)map->m_page_addrs[map_page] + map_off; | ||
570 | for (k = 0; k < to_copy; k += 8) { | ||
571 | /* Record ports that became uncongested, i.e. | ||
572 | * bits set in our map but clear in the update. */ | ||
573 | uncongested |= ~(*src) & *dst; | ||
574 | *dst++ = *src++; | ||
575 | } | ||
576 | kunmap_atomic(addr, KM_SOFTIRQ0); | ||
577 | |||
578 | copied += to_copy; | ||
579 | |||
580 | map_off += to_copy; | ||
581 | if (map_off == PAGE_SIZE) { | ||
582 | map_off = 0; | ||
583 | map_page++; | ||
584 | } | ||
585 | |||
586 | frag_off += to_copy; | ||
587 | if (frag_off == RDS_FRAG_SIZE) { | ||
588 | frag = list_entry(frag->f_item.next, | ||
589 | struct rds_page_frag, f_item); | ||
590 | frag_off = 0; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | /* the congestion map is in little endian order */ | ||
595 | uncongested = le64_to_cpu(uncongested); | ||
596 | |||
597 | rds_cong_map_updated(map, uncongested); | ||
598 | } | ||
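The heart of the copy loop above is a single bitwise expression. The standalone example below uses two arbitrary 64-bit words (not a real congestion map) to show how a bit that is set in the stored map word but clear in the incoming word ends up in the uncongested mask before the word is overwritten.

/*
 * Sketch only: the two words are arbitrary example values.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t map_word = 0xF0F0F0F0F0F0F0F0ULL;  /* stored map (old) */
	uint64_t incoming = 0xFF00FF00FF00FF00ULL;  /* received update (new) */
	uint64_t uncongested = 0;

	/* same core as the loop: was set (old), now clear (new) */
	uncongested |= ~incoming & map_word;
	map_word = incoming;                        /* *dst++ = *src++ */

	printf("newly uncongested bits: 0x%016llx\n",
	       (unsigned long long)uncongested);
	printf("stored map word is now: 0x%016llx\n",
	       (unsigned long long)map_word);
	return 0;
}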
599 | |||
600 | /* | ||
601 | * Rings are posted with all the allocations they'll need to queue the | ||
602 | * incoming message to the receiving socket so this can't fail. | ||
603 | * All fragments start with a header, so we can make sure we're not receiving | ||
604 | * garbage, and we can tell a small 8 byte fragment from an ACK frame. | ||
605 | */ | ||
606 | struct rds_ib_ack_state { | ||
607 | u64 ack_next; | ||
608 | u64 ack_recv; | ||
609 | unsigned int ack_required:1; | ||
610 | unsigned int ack_next_valid:1; | ||
611 | unsigned int ack_recv_valid:1; | ||
612 | }; | ||
613 | |||
614 | static void rds_ib_process_recv(struct rds_connection *conn, | ||
615 | struct rds_ib_recv_work *recv, u32 byte_len, | ||
616 | struct rds_ib_ack_state *state) | ||
617 | { | ||
618 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
619 | struct rds_ib_incoming *ibinc = ic->i_ibinc; | ||
620 | struct rds_header *ihdr, *hdr; | ||
621 | |||
622 | /* XXX shut down the connection if port 0,0 are seen? */ | ||
623 | |||
624 | rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, | ||
625 | byte_len); | ||
626 | |||
627 | if (byte_len < sizeof(struct rds_header)) { | ||
628 | rds_ib_conn_error(conn, "incoming message " | ||
629 | "from %pI4 didn't inclue a " | ||
630 | "header, disconnecting and " | ||
631 | "reconnecting\n", | ||
632 | &conn->c_faddr); | ||
633 | return; | ||
634 | } | ||
635 | byte_len -= sizeof(struct rds_header); | ||
636 | |||
637 | ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; | ||
638 | |||
639 | /* Validate the checksum. */ | ||
640 | if (!rds_message_verify_checksum(ihdr)) { | ||
641 | rds_ib_conn_error(conn, "incoming message " | ||
642 | "from %pI4 has corrupted header - " | ||
643 | "forcing a reconnect\n", | ||
644 | &conn->c_faddr); | ||
645 | rds_stats_inc(s_recv_drop_bad_checksum); | ||
646 | return; | ||
647 | } | ||
648 | |||
649 | /* Process the ACK sequence which comes with every packet */ | ||
650 | state->ack_recv = be64_to_cpu(ihdr->h_ack); | ||
651 | state->ack_recv_valid = 1; | ||
652 | |||
653 | /* Process the credits update if there was one */ | ||
654 | if (ihdr->h_credit) | ||
655 | rds_ib_send_add_credits(conn, ihdr->h_credit); | ||
656 | |||
657 | if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { | ||
658 | /* This is an ACK-only packet. It gets special | ||
659 | * treatment here because historically, ACKs | ||
660 | * were rather special beasts. | ||
661 | */ | ||
662 | rds_ib_stats_inc(s_ib_ack_received); | ||
663 | |||
664 | /* | ||
665 | * Usually the frags make their way on to incs and are then freed as | ||
666 | * the inc is freed. We don't go that route, so we have to drop the | ||
667 | * page ref ourselves. We can't just leave the page on the recv | ||
668 | * because that confuses the dma mapping of pages and each recv's use | ||
669 | * of a partial page. We can leave the frag, though, it will be | ||
670 | * reused. | ||
671 | * | ||
672 | * FIXME: Fold this into the code path below. | ||
673 | */ | ||
674 | rds_ib_frag_drop_page(recv->r_frag); | ||
675 | return; | ||
676 | } | ||
677 | |||
678 | /* | ||
679 | * If we don't already have an inc on the connection then this | ||
680 | * fragment has a header and starts a message. Copy its header | ||
681 | * into the inc and save the inc so we can hang upcoming fragments | ||
682 | * off its list. | ||
683 | */ | ||
684 | if (ibinc == NULL) { | ||
685 | ibinc = recv->r_ibinc; | ||
686 | recv->r_ibinc = NULL; | ||
687 | ic->i_ibinc = ibinc; | ||
688 | |||
689 | hdr = &ibinc->ii_inc.i_hdr; | ||
690 | memcpy(hdr, ihdr, sizeof(*hdr)); | ||
691 | ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); | ||
692 | |||
693 | rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, | ||
694 | ic->i_recv_data_rem, hdr->h_flags); | ||
695 | } else { | ||
696 | hdr = &ibinc->ii_inc.i_hdr; | ||
697 | /* We can't just use memcmp here; fragments of a | ||
698 | * single message may carry different ACKs */ | ||
699 | if (hdr->h_sequence != ihdr->h_sequence | ||
700 | || hdr->h_len != ihdr->h_len | ||
701 | || hdr->h_sport != ihdr->h_sport | ||
702 | || hdr->h_dport != ihdr->h_dport) { | ||
703 | rds_ib_conn_error(conn, | ||
704 | "fragment header mismatch; forcing reconnect\n"); | ||
705 | return; | ||
706 | } | ||
707 | } | ||
708 | |||
709 | list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); | ||
710 | recv->r_frag = NULL; | ||
711 | |||
712 | if (ic->i_recv_data_rem > RDS_FRAG_SIZE) | ||
713 | ic->i_recv_data_rem -= RDS_FRAG_SIZE; | ||
714 | else { | ||
715 | ic->i_recv_data_rem = 0; | ||
716 | ic->i_ibinc = NULL; | ||
717 | |||
718 | if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) | ||
719 | rds_ib_cong_recv(conn, ibinc); | ||
720 | else { | ||
721 | rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, | ||
722 | &ibinc->ii_inc, GFP_ATOMIC, | ||
723 | KM_SOFTIRQ0); | ||
724 | state->ack_next = be64_to_cpu(hdr->h_sequence); | ||
725 | state->ack_next_valid = 1; | ||
726 | } | ||
727 | |||
728 | /* Evaluate the ACK_REQUIRED flag *after* we received | ||
729 | * the complete frame, and after bumping the next_rx | ||
730 | * sequence. */ | ||
731 | if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { | ||
732 | rds_stats_inc(s_recv_ack_required); | ||
733 | state->ack_required = 1; | ||
734 | } | ||
735 | |||
736 | rds_inc_put(&ibinc->ii_inc); | ||
737 | } | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Plucking the oldest entry from the ring can be done concurrently with | ||
742 | * the thread refilling the ring. Each ring operation is protected by | ||
743 | * spinlocks and the transient state of refilling doesn't change the | ||
744 | * recording of which entry is oldest. | ||
745 | * | ||
746 | * This relies on IB only calling one cq comp_handler for each cq so that | ||
747 | * there will only be one caller of rds_recv_incoming() per RDS connection. | ||
748 | */ | ||
749 | void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) | ||
750 | { | ||
751 | struct rds_connection *conn = context; | ||
752 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
753 | struct ib_wc wc; | ||
754 | struct rds_ib_ack_state state = { 0, }; | ||
755 | struct rds_ib_recv_work *recv; | ||
756 | |||
757 | rdsdebug("conn %p cq %p\n", conn, cq); | ||
758 | |||
759 | rds_ib_stats_inc(s_ib_rx_cq_call); | ||
760 | |||
761 | ib_req_notify_cq(cq, IB_CQ_SOLICITED); | ||
762 | |||
763 | while (ib_poll_cq(cq, 1, &wc) > 0) { | ||
764 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | ||
765 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | ||
766 | be32_to_cpu(wc.ex.imm_data)); | ||
767 | rds_ib_stats_inc(s_ib_rx_cq_event); | ||
768 | |||
769 | recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; | ||
770 | |||
771 | rds_ib_recv_unmap_page(ic, recv); | ||
772 | |||
773 | /* | ||
774 | * Also process recvs in connecting state because it is possible | ||
775 | * to get a recv completion _before_ the rdmacm ESTABLISHED | ||
776 | * event is processed. | ||
777 | */ | ||
778 | if (rds_conn_up(conn) || rds_conn_connecting(conn)) { | ||
779 | /* We expect errors as the qp is drained during shutdown */ | ||
780 | if (wc.status == IB_WC_SUCCESS) { | ||
781 | rds_ib_process_recv(conn, recv, wc.byte_len, &state); | ||
782 | } else { | ||
783 | rds_ib_conn_error(conn, "recv completion on " | ||
784 | "%pI4 had status %u, disconnecting and " | ||
785 | "reconnecting\n", &conn->c_faddr, | ||
786 | wc.status); | ||
787 | } | ||
788 | } | ||
789 | |||
790 | rds_ib_ring_free(&ic->i_recv_ring, 1); | ||
791 | } | ||
792 | |||
793 | if (state.ack_next_valid) | ||
794 | rds_ib_set_ack(ic, state.ack_next, state.ack_required); | ||
795 | if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { | ||
796 | rds_send_drop_acked(conn, state.ack_recv, NULL); | ||
797 | ic->i_ack_recv = state.ack_recv; | ||
798 | } | ||
799 | if (rds_conn_up(conn)) | ||
800 | rds_ib_attempt_ack(ic); | ||
801 | |||
802 | /* If we ever end up with a really empty receive ring, we're | ||
803 | * in deep trouble, as the sender will definitely see RNR | ||
804 | * timeouts. */ | ||
805 | if (rds_ib_ring_empty(&ic->i_recv_ring)) | ||
806 | rds_ib_stats_inc(s_ib_rx_ring_empty); | ||
807 | |||
808 | /* | ||
809 | * If the ring is running low, then schedule the thread to refill. | ||
810 | */ | ||
811 | if (rds_ib_ring_low(&ic->i_recv_ring)) | ||
812 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | ||
813 | } | ||
814 | |||
815 | int rds_ib_recv(struct rds_connection *conn) | ||
816 | { | ||
817 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
818 | int ret = 0; | ||
819 | |||
820 | rdsdebug("conn %p\n", conn); | ||
821 | |||
822 | /* | ||
823 | * If we get a temporary posting failure in this context then | ||
824 | * we're really low and we want the caller to back off for a bit. | ||
825 | */ | ||
826 | mutex_lock(&ic->i_recv_mutex); | ||
827 | if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) | ||
828 | ret = -ENOMEM; | ||
829 | else | ||
830 | rds_ib_stats_inc(s_ib_rx_refill_from_thread); | ||
831 | mutex_unlock(&ic->i_recv_mutex); | ||
832 | |||
833 | if (rds_conn_up(conn)) | ||
834 | rds_ib_attempt_ack(ic); | ||
835 | |||
836 | return ret; | ||
837 | } | ||
838 | |||
839 | int __init rds_ib_recv_init(void) | ||
840 | { | ||
841 | struct sysinfo si; | ||
842 | int ret = -ENOMEM; | ||
843 | |||
844 | /* Default to roughly a third of all available RAM for recv memory */ | ||
845 | si_meminfo(&si); | ||
846 | rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; | ||
847 | |||
848 | rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", | ||
849 | sizeof(struct rds_ib_incoming), | ||
850 | 0, 0, NULL); | ||
851 | if (rds_ib_incoming_slab == NULL) | ||
852 | goto out; | ||
853 | |||
854 | rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", | ||
855 | sizeof(struct rds_page_frag), | ||
856 | 0, 0, NULL); | ||
857 | if (rds_ib_frag_slab == NULL) | ||
858 | kmem_cache_destroy(rds_ib_incoming_slab); | ||
859 | else | ||
860 | ret = 0; | ||
861 | out: | ||
862 | return ret; | ||
863 | } | ||
864 | |||
865 | void rds_ib_recv_exit(void) | ||
866 | { | ||
867 | kmem_cache_destroy(rds_ib_incoming_slab); | ||
868 | kmem_cache_destroy(rds_ib_frag_slab); | ||
869 | } | ||
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c new file mode 100644 index 000000000000..99a6ccae964c --- /dev/null +++ b/net/rds/ib_ring.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | #include "ib.h" | ||
37 | |||
38 | /* | ||
39 | * Locking for IB rings. | ||
40 | * We assume that allocation is always protected by a mutex | ||
41 | * in the caller (this is a valid assumption for the current | ||
42 | * implementation). | ||
43 | * | ||
44 | * Freeing always happens in an interrupt, and hence only | ||
45 | * races with allocations, but not with other free()s. | ||
46 | * | ||
47 | * The interaction between allocation and freeing is that | ||
48 | * the alloc code has to determine the number of free entries. | ||
49 | * To this end, we maintain two counters; an allocation counter | ||
50 | * and a free counter. Both are allowed to run freely, and wrap | ||
51 | * around. | ||
52 | * The number of used entries is always (alloc_ctr - free_ctr) % NR. | ||
53 | * | ||
54 | * The current implementation makes free_ctr atomic. When the | ||
55 | * caller finds an allocation fails, it should set an "alloc fail" | ||
56 | * bit and retry the allocation. The "alloc fail" bit essentially tells | ||
57 | * the CQ completion handlers to wake it up after freeing some | ||
58 | * more entries. | ||
59 | */ | ||
60 | |||
61 | /* | ||
62 | * This only happens on shutdown. | ||
63 | */ | ||
64 | DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait); | ||
65 | |||
66 | void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr) | ||
67 | { | ||
68 | memset(ring, 0, sizeof(*ring)); | ||
69 | ring->w_nr = nr; | ||
70 | rdsdebug("ring %p nr %u\n", ring, ring->w_nr); | ||
71 | } | ||
72 | |||
73 | static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring) | ||
74 | { | ||
75 | u32 diff; | ||
76 | |||
77 | /* This assumes that atomic_t has at least as many bits as u32 */ | ||
78 | diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); | ||
79 | BUG_ON(diff > ring->w_nr); | ||
80 | |||
81 | return diff; | ||
82 | } | ||
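The wrap-around accounting described in the comment above can be checked in isolation. The following is a standalone userspace sketch, not part of this patch; it only assumes 32-bit unsigned counters, as used by struct rds_ib_work_ring.

/*
 * Standalone sketch (not kernel code): the "used = alloc_ctr - free_ctr"
 * arithmetic stays correct even when both u32 counters wrap, as long as
 * no more than ring->w_nr entries are ever outstanding at once.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t alloc_ctr = UINT32_MAX - 2;	/* about to wrap */
	uint32_t free_ctr  = UINT32_MAX - 5;	/* three entries still in flight */
	uint32_t used = alloc_ctr - free_ctr;	/* unsigned subtraction handles wrap */

	assert(used == 3);

	alloc_ctr += 10;			/* wraps past zero */
	used = alloc_ctr - free_ctr;
	printf("used after wrap: %u\n", used);	/* still 13 */
	return 0;
}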
83 | |||
84 | void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr) | ||
85 | { | ||
86 | /* We only ever get called from the connection setup code, | ||
87 | * prior to creating the QP. */ | ||
88 | BUG_ON(__rds_ib_ring_used(ring)); | ||
89 | ring->w_nr = nr; | ||
90 | } | ||
91 | |||
92 | static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring) | ||
93 | { | ||
94 | return __rds_ib_ring_used(ring) == 0; | ||
95 | } | ||
96 | |||
97 | u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos) | ||
98 | { | ||
99 | u32 ret = 0, avail; | ||
100 | |||
101 | avail = ring->w_nr - __rds_ib_ring_used(ring); | ||
102 | |||
103 | rdsdebug("ring %p val %u next %u free %u\n", ring, val, | ||
104 | ring->w_alloc_ptr, avail); | ||
105 | |||
106 | if (val && avail) { | ||
107 | ret = min(val, avail); | ||
108 | *pos = ring->w_alloc_ptr; | ||
109 | |||
110 | ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; | ||
111 | ring->w_alloc_ctr += ret; | ||
112 | } | ||
113 | |||
114 | return ret; | ||
115 | } | ||
116 | |||
117 | void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val) | ||
118 | { | ||
119 | ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; | ||
120 | atomic_add(val, &ring->w_free_ctr); | ||
121 | |||
122 | if (__rds_ib_ring_empty(ring) && | ||
123 | waitqueue_active(&rds_ib_ring_empty_wait)) | ||
124 | wake_up(&rds_ib_ring_empty_wait); | ||
125 | } | ||
126 | |||
127 | void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val) | ||
128 | { | ||
129 | ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; | ||
130 | ring->w_alloc_ctr -= val; | ||
131 | } | ||
132 | |||
133 | int rds_ib_ring_empty(struct rds_ib_work_ring *ring) | ||
134 | { | ||
135 | return __rds_ib_ring_empty(ring); | ||
136 | } | ||
137 | |||
138 | int rds_ib_ring_low(struct rds_ib_work_ring *ring) | ||
139 | { | ||
140 | return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2); | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * returns the oldest alloced ring entry. This will be the next one | ||
145 | * freed. This can't be called if there are none allocated. | ||
146 | */ | ||
147 | u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) | ||
148 | { | ||
149 | return ring->w_free_ptr; | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * returns the number of completed work requests. | ||
154 | */ | ||
155 | |||
156 | u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest) | ||
157 | { | ||
158 | u32 ret; | ||
159 | |||
160 | if (oldest <= (unsigned long long)wr_id) | ||
161 | ret = (unsigned long long)wr_id - oldest + 1; | ||
162 | else | ||
163 | ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; | ||
164 | |||
165 | rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, | ||
166 | wr_id, oldest); | ||
167 | return ret; | ||
168 | } | ||
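As a quick sanity check of the wrap-around case handled by rds_ib_ring_completed(), here is a standalone sketch (not part of the patch) restating the rule for 0-based ring positions, with two worked cases.

/*
 * Sketch only: number of completed entries between 'oldest' and 'wr_id'
 * (inclusive) in a ring of nr entries, matching the rule above.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t completed(uint32_t nr, uint32_t wr_id, uint32_t oldest)
{
	if (oldest <= wr_id)
		return wr_id - oldest + 1;	/* no wrap between oldest and wr_id */
	return nr - oldest + wr_id + 1;		/* wr_id wrapped past the end */
}

int main(void)
{
	assert(completed(8, 5, 3) == 3);	/* entries 3, 4, 5 completed */
	assert(completed(8, 1, 6) == 4);	/* entries 6, 7, 0, 1 completed */
	return 0;
}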
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c new file mode 100644 index 000000000000..cb6c52cb1c4c --- /dev/null +++ b/net/rds/ib_send.c | |||
@@ -0,0 +1,874 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <linux/device.h> | ||
36 | #include <linux/dmapool.h> | ||
37 | |||
38 | #include "rds.h" | ||
39 | #include "rdma.h" | ||
40 | #include "ib.h" | ||
41 | |||
42 | static void rds_ib_send_rdma_complete(struct rds_message *rm, | ||
43 | int wc_status) | ||
44 | { | ||
45 | int notify_status; | ||
46 | |||
47 | switch (wc_status) { | ||
48 | case IB_WC_WR_FLUSH_ERR: | ||
49 | return; | ||
50 | |||
51 | case IB_WC_SUCCESS: | ||
52 | notify_status = RDS_RDMA_SUCCESS; | ||
53 | break; | ||
54 | |||
55 | case IB_WC_REM_ACCESS_ERR: | ||
56 | notify_status = RDS_RDMA_REMOTE_ERROR; | ||
57 | break; | ||
58 | |||
59 | default: | ||
60 | notify_status = RDS_RDMA_OTHER_ERROR; | ||
61 | break; | ||
62 | } | ||
63 | rds_rdma_send_complete(rm, notify_status); | ||
64 | } | ||
65 | |||
66 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, | ||
67 | struct rds_rdma_op *op) | ||
68 | { | ||
69 | if (op->r_mapped) { | ||
70 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
71 | op->r_sg, op->r_nents, | ||
72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | ||
73 | op->r_mapped = 0; | ||
74 | } | ||
75 | } | ||
76 | |||
77 | static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, | ||
78 | struct rds_ib_send_work *send, | ||
79 | int wc_status) | ||
80 | { | ||
81 | struct rds_message *rm = send->s_rm; | ||
82 | |||
83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | ||
84 | |||
85 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
86 | rm->m_sg, rm->m_nents, | ||
87 | DMA_TO_DEVICE); | ||
88 | |||
89 | if (rm->m_rdma_op != NULL) { | ||
90 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
91 | |||
92 | /* If the user asked for a completion notification on this | ||
93 | * message, we can implement three different semantics: | ||
94 | * 1. Notify when we received the ACK on the RDS message | ||
95 | * that was queued with the RDMA. This provides reliable | ||
96 | * notification of RDMA status at the expense of a one-way | ||
97 | * packet delay. | ||
98 | * 2. Notify when the IB stack gives us the completion event for | ||
99 | * the RDMA operation. | ||
100 | * 3. Notify when the IB stack gives us the completion event for | ||
101 | * the accompanying RDS messages. | ||
102 | * Here, we implement approach #3. To implement approach #2, | ||
103 | * call rds_rdma_send_complete from the cq_handler. To implement #1, | ||
104 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
105 | * handling in the ACK processing code. | ||
106 | * | ||
107 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
108 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
109 | * operation itself unmapped the RDMA buffers, which takes care | ||
110 | * of synching. | ||
111 | */ | ||
112 | rds_ib_send_rdma_complete(rm, wc_status); | ||
113 | |||
114 | if (rm->m_rdma_op->r_write) | ||
115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | ||
116 | else | ||
117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | ||
118 | } | ||
119 | |||
120 | /* If anyone waited for this message to get flushed out, wake | ||
121 | * them up now */ | ||
122 | rds_message_unmapped(rm); | ||
123 | |||
124 | rds_message_put(rm); | ||
125 | send->s_rm = NULL; | ||
126 | } | ||
127 | |||
128 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) | ||
129 | { | ||
130 | struct rds_ib_send_work *send; | ||
131 | u32 i; | ||
132 | |||
133 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | ||
134 | struct ib_sge *sge; | ||
135 | |||
136 | send->s_rm = NULL; | ||
137 | send->s_op = NULL; | ||
138 | |||
139 | send->s_wr.wr_id = i; | ||
140 | send->s_wr.sg_list = send->s_sge; | ||
141 | send->s_wr.num_sge = 1; | ||
142 | send->s_wr.opcode = IB_WR_SEND; | ||
143 | send->s_wr.send_flags = 0; | ||
144 | send->s_wr.ex.imm_data = 0; | ||
145 | |||
146 | sge = rds_ib_data_sge(ic, send->s_sge); | ||
147 | sge->lkey = ic->i_mr->lkey; | ||
148 | |||
149 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
150 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); | ||
151 | sge->length = sizeof(struct rds_header); | ||
152 | sge->lkey = ic->i_mr->lkey; | ||
153 | } | ||
154 | } | ||
155 | |||
156 | void rds_ib_send_clear_ring(struct rds_ib_connection *ic) | ||
157 | { | ||
158 | struct rds_ib_send_work *send; | ||
159 | u32 i; | ||
160 | |||
161 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | ||
162 | if (send->s_wr.opcode == 0xdead) | ||
163 | continue; | ||
164 | if (send->s_rm) | ||
165 | rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); | ||
166 | if (send->s_op) | ||
167 | rds_ib_send_unmap_rdma(ic, send->s_op); | ||
168 | } | ||
169 | } | ||
170 | |||
171 | /* | ||
172 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc | ||
173 | * operations performed in the send path. As the sender allocs and potentially | ||
174 | * unallocs the next free entry in the ring it doesn't alter which is | ||
175 | * the next to be freed, which is what this is concerned with. | ||
176 | */ | ||
177 | void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | ||
178 | { | ||
179 | struct rds_connection *conn = context; | ||
180 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
181 | struct ib_wc wc; | ||
182 | struct rds_ib_send_work *send; | ||
183 | u32 completed; | ||
184 | u32 oldest; | ||
185 | u32 i = 0; | ||
186 | int ret; | ||
187 | |||
188 | rdsdebug("cq %p conn %p\n", cq, conn); | ||
189 | rds_ib_stats_inc(s_ib_tx_cq_call); | ||
190 | ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); | ||
191 | if (ret) | ||
192 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | ||
193 | |||
194 | while (ib_poll_cq(cq, 1, &wc) > 0) { | ||
195 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | ||
196 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | ||
197 | be32_to_cpu(wc.ex.imm_data)); | ||
198 | rds_ib_stats_inc(s_ib_tx_cq_event); | ||
199 | |||
200 | if (wc.wr_id == RDS_IB_ACK_WR_ID) { | ||
201 | if (ic->i_ack_queued + HZ/2 < jiffies) | ||
202 | rds_ib_stats_inc(s_ib_tx_stalled); | ||
203 | rds_ib_ack_send_complete(ic); | ||
204 | continue; | ||
205 | } | ||
206 | |||
207 | oldest = rds_ib_ring_oldest(&ic->i_send_ring); | ||
208 | |||
209 | completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); | ||
210 | |||
211 | for (i = 0; i < completed; i++) { | ||
212 | send = &ic->i_sends[oldest]; | ||
213 | |||
214 | /* In the error case, wc.opcode sometimes contains garbage */ | ||
215 | switch (send->s_wr.opcode) { | ||
216 | case IB_WR_SEND: | ||
217 | if (send->s_rm) | ||
218 | rds_ib_send_unmap_rm(ic, send, wc.status); | ||
219 | break; | ||
220 | case IB_WR_RDMA_WRITE: | ||
221 | case IB_WR_RDMA_READ: | ||
222 | /* Nothing to be done - the SG list will be unmapped | ||
223 | * when the SEND completes. */ | ||
224 | break; | ||
225 | default: | ||
226 | if (printk_ratelimit()) | ||
227 | printk(KERN_NOTICE | ||
228 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
229 | __func__, send->s_wr.opcode); | ||
230 | break; | ||
231 | } | ||
232 | |||
233 | send->s_wr.opcode = 0xdead; | ||
234 | send->s_wr.num_sge = 1; | ||
235 | if (send->s_queued + HZ/2 < jiffies) | ||
236 | rds_ib_stats_inc(s_ib_tx_stalled); | ||
237 | |||
238 | /* If an RDMA operation produced an error, signal this right | ||
239 | * away. If we don't, the subsequent SEND that goes with this | ||
240 | * RDMA will be canceled with ERR_WFLUSH, and the application | ||
241 | * will never learn that the RDMA failed. */ | ||
242 | if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { | ||
243 | struct rds_message *rm; | ||
244 | |||
245 | rm = rds_send_get_message(conn, send->s_op); | ||
246 | if (rm) | ||
247 | rds_ib_send_rdma_complete(rm, wc.status); | ||
248 | } | ||
249 | |||
250 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; | ||
251 | } | ||
252 | |||
253 | rds_ib_ring_free(&ic->i_send_ring, completed); | ||
254 | |||
255 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) | ||
256 | || test_bit(0, &conn->c_map_queued)) | ||
257 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
258 | |||
259 | /* We expect errors as the qp is drained during shutdown */ | ||
260 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { | ||
261 | rds_ib_conn_error(conn, | ||
262 | "send completion on %pI4 " | ||
263 | "had status %u, disconnecting and reconnecting\n", | ||
264 | &conn->c_faddr, wc.status); | ||
265 | } | ||
266 | } | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * This is the main function for allocating credits when sending | ||
271 | * messages. | ||
272 | * | ||
273 | * Conceptually, we have two counters: | ||
274 | * - send credits: this tells us how many WRs we're allowed | ||
275 | * to submit without overrunning the receiver's queue. For | ||
276 | * each SEND WR we post, we decrement this by one. | ||
277 | * | ||
278 | * - posted credits: this tells us how many WRs we recently | ||
279 | * posted to the receive queue. This value is transferred | ||
280 | * to the peer as a "credit update" in an RDS header field. | ||
281 | * Every time we transmit credits to the peer, we subtract | ||
282 | * the amount of transferred credits from this counter. | ||
283 | * | ||
284 | * It is essential that we avoid situations where both sides have | ||
285 | * exhausted their send credits, and are unable to send new credits | ||
286 | * to the peer. We achieve this by requiring that we send at least | ||
287 | * one credit update to the peer before exhausting our credits. | ||
288 | * When new credits arrive, we subtract one credit that is withheld | ||
289 | * until we've posted new buffers and are ready to transmit these | ||
290 | * credits (see rds_ib_send_add_credits below). | ||
291 | * | ||
292 | * The RDS send code is essentially single-threaded; rds_send_xmit | ||
293 | * grabs c_send_lock to ensure exclusive access to the send ring. | ||
294 | * However, the ACK sending code is independent and can race with | ||
295 | * message SENDs. | ||
296 | * | ||
297 | * In the send path, we need to update the counters for send credits | ||
298 | * and the counter of posted buffers atomically - when we use the | ||
299 | * last available credit, we cannot allow another thread to race us | ||
300 | * and grab the posted credits counter. Hence, we have to use a | ||
301 | * spinlock to protect the credit counter, or use atomics. | ||
302 | * | ||
303 | * Spinlocks shared between the send and the receive path are bad, | ||
304 | * because they create unnecessary delays. An early implementation | ||
305 | * using a spinlock showed a 5% degradation in throughput at some | ||
306 | * loads. | ||
307 | * | ||
308 | * This implementation avoids spinlocks completely, putting both | ||
309 | * counters into a single atomic, and updating that atomic using | ||
310 | * atomic_add (in the receive path, when receiving fresh credits), | ||
311 | * and using atomic_cmpxchg when updating the two counters. | ||
312 | */ | ||
313 | int rds_ib_send_grab_credits(struct rds_ib_connection *ic, | ||
314 | u32 wanted, u32 *adv_credits, int need_posted) | ||
315 | { | ||
316 | unsigned int avail, posted, got = 0, advertise; | ||
317 | long oldval, newval; | ||
318 | |||
319 | *adv_credits = 0; | ||
320 | if (!ic->i_flowctl) | ||
321 | return wanted; | ||
322 | |||
323 | try_again: | ||
324 | advertise = 0; | ||
325 | oldval = newval = atomic_read(&ic->i_credits); | ||
326 | posted = IB_GET_POST_CREDITS(oldval); | ||
327 | avail = IB_GET_SEND_CREDITS(oldval); | ||
328 | |||
329 | rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n", | ||
330 | wanted, avail, posted); | ||
331 | |||
332 | /* The last credit must be used to send a credit update. */ | ||
333 | if (avail && !posted) | ||
334 | avail--; | ||
335 | |||
336 | if (avail < wanted) { | ||
337 | struct rds_connection *conn = ic->i_cm_id->context; | ||
338 | |||
339 | /* Oops, there aren't that many credits left! */ | ||
340 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | ||
341 | got = avail; | ||
342 | } else { | ||
343 | /* Sometimes you get what you want, lalala. */ | ||
344 | got = wanted; | ||
345 | } | ||
346 | newval -= IB_SET_SEND_CREDITS(got); | ||
347 | |||
348 | /* | ||
349 | * If need_posted is non-zero, then the caller wants the | ||
350 | * posted credits advertised regardless of whether any send | ||
351 | * credits are available. | ||
352 | */ | ||
353 | if (posted && (got || need_posted)) { | ||
354 | advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); | ||
355 | newval -= IB_SET_POST_CREDITS(advertise); | ||
356 | } | ||
357 | |||
358 | /* Finally bill everything */ | ||
359 | if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) | ||
360 | goto try_again; | ||
361 | |||
362 | *adv_credits = advertise; | ||
363 | return got; | ||
364 | } | ||
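The IB_GET_*/IB_SET_* credit macros used above are defined in net/rds/ib.h and do not appear in this hunk. The sketch below is a simplified userspace illustration, assuming a hypothetical 16/16-bit split and omitting the reserved-last-credit rule, of how both counters can live in one word and be updated with a single compare-and-swap, mirroring the try_again loop.

/*
 * Standalone sketch, not kernel code.  The 16/16 bit split and the
 * helper names are assumptions for illustration only.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define POST_SHIFT	16
#define FIELD_MASK	0xffffu

static uint32_t get_send(uint32_t v) { return v & FIELD_MASK; }
static uint32_t get_post(uint32_t v) { return (v >> POST_SHIFT) & FIELD_MASK; }

/* Consume up to 'wanted' send credits and all posted credits in one CAS. */
static uint32_t grab_credits(atomic_uint *credits, uint32_t wanted,
			     uint32_t *advertise)
{
	uint32_t oldval, newval, got;

	do {
		oldval = atomic_load(credits);
		got = get_send(oldval) < wanted ? get_send(oldval) : wanted;
		*advertise = get_post(oldval);
		newval = oldval - got - (*advertise << POST_SHIFT);
	} while (!atomic_compare_exchange_weak(credits, &oldval, newval));

	return got;
}

int main(void)
{
	atomic_uint credits = (3u << POST_SHIFT) | 10u;	/* 10 send, 3 posted */
	uint32_t adv = 0;

	assert(grab_credits(&credits, 4, &adv) == 4);
	assert(adv == 3);
	assert(atomic_load(&credits) == 6u);	/* 6 send credits, 0 posted */
	return 0;
}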
365 | |||
366 | void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) | ||
367 | { | ||
368 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
369 | |||
370 | if (credits == 0) | ||
371 | return; | ||
372 | |||
373 | rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n", | ||
374 | credits, | ||
375 | IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), | ||
376 | test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); | ||
377 | |||
378 | atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); | ||
379 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) | ||
380 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
381 | |||
382 | WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); | ||
383 | |||
384 | rds_ib_stats_inc(s_ib_rx_credit_updates); | ||
385 | } | ||
386 | |||
387 | void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) | ||
388 | { | ||
389 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
390 | |||
391 | if (posted == 0) | ||
392 | return; | ||
393 | |||
394 | atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); | ||
395 | |||
396 | /* Decide whether to send an update to the peer now. | ||
397 | * If we would send a credit update for every single buffer we | ||
398 | * post, we would end up with an ACK storm (ACK arrives, | ||
399 | * consumes buffer, we refill the ring, send ACK to remote | ||
400 | * advertising the newly posted buffer... ad inf) | ||
401 | * | ||
402 | * Performance pretty much depends on how often we send | ||
403 | * credit updates - too frequent updates mean lots of ACKs. | ||
404 | * Too infrequent updates, and the peer will run out of | ||
405 | * credits and have to throttle. | ||
406 | * For the time being, 16 seems to be a good compromise. | ||
407 | */ | ||
408 | if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) | ||
409 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
410 | } | ||
411 | |||
412 | static inline void | ||
413 | rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, | ||
414 | struct rds_ib_send_work *send, unsigned int pos, | ||
415 | unsigned long buffer, unsigned int length, | ||
416 | int send_flags) | ||
417 | { | ||
418 | struct ib_sge *sge; | ||
419 | |||
420 | WARN_ON(pos != send - ic->i_sends); | ||
421 | |||
422 | send->s_wr.send_flags = send_flags; | ||
423 | send->s_wr.opcode = IB_WR_SEND; | ||
424 | send->s_wr.num_sge = 2; | ||
425 | send->s_wr.next = NULL; | ||
426 | send->s_queued = jiffies; | ||
427 | send->s_op = NULL; | ||
428 | |||
429 | if (length != 0) { | ||
430 | sge = rds_ib_data_sge(ic, send->s_sge); | ||
431 | sge->addr = buffer; | ||
432 | sge->length = length; | ||
433 | sge->lkey = ic->i_mr->lkey; | ||
434 | |||
435 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
436 | } else { | ||
437 | /* We're sending a packet with no payload. There is only | ||
438 | * one SGE */ | ||
439 | send->s_wr.num_sge = 1; | ||
440 | sge = &send->s_sge[0]; | ||
441 | } | ||
442 | |||
443 | sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); | ||
444 | sge->length = sizeof(struct rds_header); | ||
445 | sge->lkey = ic->i_mr->lkey; | ||
446 | } | ||
447 | |||
448 | /* | ||
449 | * This can be called multiple times for a given message. The first time | ||
450 | * we see a message we map its scatterlist into the IB device so that | ||
451 | * we can provide that mapped address to the IB scatter gather entries | ||
452 | * in the IB work requests. We translate the scatterlist into a series | ||
453 | * of work requests that fragment the message. These work requests complete | ||
454 | * in order so we pass ownership of the message to the completion handler | ||
455 | * once we send the final fragment. | ||
456 | * | ||
457 | * The RDS core uses the c_send_lock to only enter this function once | ||
458 | * per connection. This makes sure that the tx ring alloc/unalloc pairs | ||
459 | * don't get out of sync and confuse the ring. | ||
460 | */ | ||
461 | int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
462 | unsigned int hdr_off, unsigned int sg, unsigned int off) | ||
463 | { | ||
464 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
465 | struct ib_device *dev = ic->i_cm_id->device; | ||
466 | struct rds_ib_send_work *send = NULL; | ||
467 | struct rds_ib_send_work *first; | ||
468 | struct rds_ib_send_work *prev; | ||
469 | struct ib_send_wr *failed_wr; | ||
470 | struct scatterlist *scat; | ||
471 | u32 pos; | ||
472 | u32 i; | ||
473 | u32 work_alloc; | ||
474 | u32 credit_alloc; | ||
475 | u32 posted; | ||
476 | u32 adv_credits = 0; | ||
477 | int send_flags = 0; | ||
478 | int sent; | ||
479 | int ret; | ||
480 | int flow_controlled = 0; | ||
481 | |||
482 | BUG_ON(off % RDS_FRAG_SIZE); | ||
483 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); | ||
484 | |||
485 | /* FIXME we may overallocate here */ | ||
486 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) | ||
487 | i = 1; | ||
488 | else | ||
489 | i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); | ||
490 | |||
491 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); | ||
492 | if (work_alloc == 0) { | ||
493 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | ||
494 | rds_ib_stats_inc(s_ib_tx_ring_full); | ||
495 | ret = -ENOMEM; | ||
496 | goto out; | ||
497 | } | ||
498 | |||
499 | credit_alloc = work_alloc; | ||
500 | if (ic->i_flowctl) { | ||
501 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0); | ||
502 | adv_credits += posted; | ||
503 | if (credit_alloc < work_alloc) { | ||
504 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); | ||
505 | work_alloc = credit_alloc; | ||
506 | flow_controlled++; | ||
507 | } | ||
508 | if (work_alloc == 0) { | ||
509 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
510 | rds_ib_stats_inc(s_ib_tx_throttle); | ||
511 | ret = -ENOMEM; | ||
512 | goto out; | ||
513 | } | ||
514 | } | ||
515 | |||
516 | /* map the message the first time we see it */ | ||
517 | if (ic->i_rm == NULL) { | ||
518 | /* | ||
519 | printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", | ||
520 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | ||
521 | rm->m_inc.i_hdr.h_flags, | ||
522 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | ||
523 | */ | ||
524 | if (rm->m_nents) { | ||
525 | rm->m_count = ib_dma_map_sg(dev, | ||
526 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | ||
527 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | ||
528 | if (rm->m_count == 0) { | ||
529 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | ||
530 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
531 | ret = -ENOMEM; /* XXX ? */ | ||
532 | goto out; | ||
533 | } | ||
534 | } else { | ||
535 | rm->m_count = 0; | ||
536 | } | ||
537 | |||
538 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
539 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | ||
540 | rds_message_addref(rm); | ||
541 | ic->i_rm = rm; | ||
542 | |||
543 | /* Finalize the header */ | ||
544 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) | ||
545 | rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; | ||
546 | if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) | ||
547 | rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; | ||
548 | |||
549 | /* If it has a RDMA op, tell the peer we did it. This is | ||
550 | * used by the peer to release use-once RDMA MRs. */ | ||
551 | if (rm->m_rdma_op) { | ||
552 | struct rds_ext_header_rdma ext_hdr; | ||
553 | |||
554 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | ||
555 | rds_message_add_extension(&rm->m_inc.i_hdr, | ||
556 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | ||
557 | } | ||
558 | if (rm->m_rdma_cookie) { | ||
559 | rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, | ||
560 | rds_rdma_cookie_key(rm->m_rdma_cookie), | ||
561 | rds_rdma_cookie_offset(rm->m_rdma_cookie)); | ||
562 | } | ||
563 | |||
564 | /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so | ||
565 | * we should not do this unless we have a chance of at least | ||
566 | * sticking the header into the send ring. Which is why we | ||
567 | * should call rds_ib_ring_alloc first. */ | ||
568 | rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic)); | ||
569 | rds_message_make_checksum(&rm->m_inc.i_hdr); | ||
570 | |||
571 | /* | ||
572 | * Update adv_credits since we reset the ACK_REQUIRED bit. | ||
573 | */ | ||
574 | rds_ib_send_grab_credits(ic, 0, &posted, 1); | ||
575 | adv_credits += posted; | ||
576 | BUG_ON(adv_credits > 255); | ||
577 | } else if (ic->i_rm != rm) | ||
578 | BUG(); | ||
579 | |||
580 | send = &ic->i_sends[pos]; | ||
581 | first = send; | ||
582 | prev = NULL; | ||
583 | scat = &rm->m_sg[sg]; | ||
584 | sent = 0; | ||
585 | i = 0; | ||
586 | |||
587 | /* Sometimes you want to put a fence between an RDMA | ||
588 | * READ and the following SEND. | ||
589 | * We could either do this all the time | ||
590 | * or when requested by the user. Right now, we let | ||
591 | * the application choose. | ||
592 | */ | ||
593 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | ||
594 | send_flags = IB_SEND_FENCE; | ||
595 | |||
596 | /* | ||
597 | * We could be copying the header into the unused tail of the page. | ||
598 | * That would need to be changed in the future when those pages might | ||
599 | * be mapped userspace pages or page cache pages. So instead we always | ||
600 | * use a second sge and our long-lived ring of mapped headers. We send | ||
601 | * the header after the data so that the data payload can be aligned on | ||
602 | * the receiver. | ||
603 | */ | ||
604 | |||
605 | /* handle a 0-len message */ | ||
606 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { | ||
607 | rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); | ||
608 | goto add_header; | ||
609 | } | ||
610 | |||
611 | /* if there's data reference it with a chain of work reqs */ | ||
612 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | ||
613 | unsigned int len; | ||
614 | |||
615 | send = &ic->i_sends[pos]; | ||
616 | |||
617 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); | ||
618 | rds_ib_xmit_populate_wr(ic, send, pos, | ||
619 | ib_sg_dma_address(dev, scat) + off, len, | ||
620 | send_flags); | ||
621 | |||
622 | /* | ||
623 | * We want to delay signaling completions just enough to get | ||
624 | * the batching benefits but not so much that we create dead time | ||
625 | * on the wire. | ||
626 | */ | ||
627 | if (ic->i_unsignaled_wrs-- == 0) { | ||
628 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
629 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
630 | } | ||
631 | |||
632 | ic->i_unsignaled_bytes -= len; | ||
633 | if (ic->i_unsignaled_bytes <= 0) { | ||
634 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | ||
635 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
636 | } | ||
637 | |||
638 | /* | ||
639 | * Always signal the last one if we're stopping due to flow control. | ||
640 | */ | ||
641 | if (flow_controlled && i == (work_alloc-1)) | ||
642 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
643 | |||
644 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | ||
645 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | ||
646 | |||
647 | sent += len; | ||
648 | off += len; | ||
649 | if (off == ib_sg_dma_len(dev, scat)) { | ||
650 | scat++; | ||
651 | off = 0; | ||
652 | } | ||
653 | |||
654 | add_header: | ||
655 | /* Tack on the header after the data. The header SGE should already | ||
656 | * have been set up to point to the right header buffer. */ | ||
657 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); | ||
658 | |||
659 | if (0) { | ||
660 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
661 | |||
662 | printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", | ||
663 | be16_to_cpu(hdr->h_dport), | ||
664 | hdr->h_flags, | ||
665 | be32_to_cpu(hdr->h_len)); | ||
666 | } | ||
667 | if (adv_credits) { | ||
668 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
669 | |||
670 | /* add credit and redo the header checksum */ | ||
671 | hdr->h_credit = adv_credits; | ||
672 | rds_message_make_checksum(hdr); | ||
673 | adv_credits = 0; | ||
674 | rds_ib_stats_inc(s_ib_tx_credit_updates); | ||
675 | } | ||
676 | |||
677 | if (prev) | ||
678 | prev->s_wr.next = &send->s_wr; | ||
679 | prev = send; | ||
680 | |||
681 | pos = (pos + 1) % ic->i_send_ring.w_nr; | ||
682 | } | ||
683 | |||
684 | /* Account the RDS header in the number of bytes we sent, but just once. | ||
685 | * The caller has no concept of fragmentation. */ | ||
686 | if (hdr_off == 0) | ||
687 | sent += sizeof(struct rds_header); | ||
688 | |||
689 | /* if we finished the message then send completion owns it */ | ||
690 | if (scat == &rm->m_sg[rm->m_count]) { | ||
691 | prev->s_rm = ic->i_rm; | ||
692 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
693 | ic->i_rm = NULL; | ||
694 | } | ||
695 | |||
696 | if (i < work_alloc) { | ||
697 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | ||
698 | work_alloc = i; | ||
699 | } | ||
700 | if (ic->i_flowctl && i < credit_alloc) | ||
701 | rds_ib_send_add_credits(conn, credit_alloc - i); | ||
702 | |||
703 | /* XXX need to worry about failed_wr and partial sends. */ | ||
704 | failed_wr = &first->s_wr; | ||
705 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | ||
706 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | ||
707 | first, &first->s_wr, ret, failed_wr); | ||
708 | BUG_ON(failed_wr != &first->s_wr); | ||
709 | if (ret) { | ||
710 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " | ||
711 | "returned %d\n", &conn->c_faddr, ret); | ||
712 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
713 | if (prev->s_rm) { | ||
714 | ic->i_rm = prev->s_rm; | ||
715 | prev->s_rm = NULL; | ||
716 | } | ||
717 | /* Finesse this later */ | ||
718 | BUG(); | ||
719 | goto out; | ||
720 | } | ||
721 | |||
722 | ret = sent; | ||
723 | out: | ||
724 | BUG_ON(adv_credits); | ||
725 | return ret; | ||
726 | } | ||
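For reference, the fragment-count math at the top of rds_ib_xmit() works out as in the standalone sketch below; the 4096-byte fragment size is an assumption for the example, not taken from this hunk.

/*
 * Sketch of the work-request count: a message of len payload bytes needs
 * ceil(len / FRAG_SIZE) fragments, with a minimum of one for a
 * zero-length (header-only) message.
 */
#include <assert.h>

#define FRAG_SIZE 4096u		/* assumed fragment size for the example */

static unsigned int frags_needed(unsigned int len)
{
	return len ? (len + FRAG_SIZE - 1) / FRAG_SIZE : 1;
}

int main(void)
{
	assert(frags_needed(0) == 1);		/* header-only message */
	assert(frags_needed(4096) == 1);
	assert(frags_needed(4097) == 2);
	return 0;
}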
727 | |||
728 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | ||
729 | { | ||
730 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
731 | struct rds_ib_send_work *send = NULL; | ||
732 | struct rds_ib_send_work *first; | ||
733 | struct rds_ib_send_work *prev; | ||
734 | struct ib_send_wr *failed_wr; | ||
735 | struct rds_ib_device *rds_ibdev; | ||
736 | struct scatterlist *scat; | ||
737 | unsigned long len; | ||
738 | u64 remote_addr = op->r_remote_addr; | ||
739 | u32 pos; | ||
740 | u32 work_alloc; | ||
741 | u32 i; | ||
742 | u32 j; | ||
743 | int sent; | ||
744 | int ret; | ||
745 | int num_sge; | ||
746 | |||
747 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||
748 | |||
749 | /* map the message the first time we see it */ | ||
750 | if (!op->r_mapped) { | ||
751 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | ||
752 | op->r_sg, op->r_nents, (op->r_write) ? | ||
753 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | ||
754 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | ||
755 | if (op->r_count == 0) { | ||
756 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | ||
757 | ret = -ENOMEM; /* XXX ? */ | ||
758 | goto out; | ||
759 | } | ||
760 | |||
761 | op->r_mapped = 1; | ||
762 | } | ||
763 | |||
764 | /* | ||
765 | * Instead of knowing how to return a partial rdma read/write we insist that there | ||
766 | * be enough work requests to send the entire message. | ||
767 | */ | ||
768 | i = ceil(op->r_count, rds_ibdev->max_sge); | ||
769 | |||
770 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); | ||
771 | if (work_alloc != i) { | ||
772 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
773 | rds_ib_stats_inc(s_ib_tx_ring_full); | ||
774 | ret = -ENOMEM; | ||
775 | goto out; | ||
776 | } | ||
777 | |||
778 | send = &ic->i_sends[pos]; | ||
779 | first = send; | ||
780 | prev = NULL; | ||
781 | scat = &op->r_sg[0]; | ||
782 | sent = 0; | ||
783 | num_sge = op->r_count; | ||
784 | |||
785 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | ||
786 | send->s_wr.send_flags = 0; | ||
787 | send->s_queued = jiffies; | ||
788 | /* | ||
789 | * We want to delay signaling completions just enough to get | ||
790 | * the batching benefits but not so much that we create dead time on the wire. | ||
791 | */ | ||
792 | if (ic->i_unsignaled_wrs-- == 0) { | ||
793 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
794 | send->s_wr.send_flags = IB_SEND_SIGNALED; | ||
795 | } | ||
796 | |||
797 | send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; | ||
798 | send->s_wr.wr.rdma.remote_addr = remote_addr; | ||
799 | send->s_wr.wr.rdma.rkey = op->r_key; | ||
800 | send->s_op = op; | ||
801 | |||
802 | if (num_sge > rds_ibdev->max_sge) { | ||
803 | send->s_wr.num_sge = rds_ibdev->max_sge; | ||
804 | num_sge -= rds_ibdev->max_sge; | ||
805 | } else { | ||
806 | send->s_wr.num_sge = num_sge; | ||
807 | } | ||
808 | |||
809 | send->s_wr.next = NULL; | ||
810 | |||
811 | if (prev) | ||
812 | prev->s_wr.next = &send->s_wr; | ||
813 | |||
814 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | ||
815 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | ||
816 | send->s_sge[j].addr = | ||
817 | ib_sg_dma_address(ic->i_cm_id->device, scat); | ||
818 | send->s_sge[j].length = len; | ||
819 | send->s_sge[j].lkey = ic->i_mr->lkey; | ||
820 | |||
821 | sent += len; | ||
822 | rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); | ||
823 | |||
824 | remote_addr += len; | ||
825 | scat++; | ||
826 | } | ||
827 | |||
828 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | ||
829 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | ||
830 | |||
831 | prev = send; | ||
832 | if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) | ||
833 | send = ic->i_sends; | ||
834 | } | ||
835 | |||
836 | /* if we finished the message then send completion owns it */ | ||
837 | if (scat == &op->r_sg[op->r_count]) | ||
838 | prev->s_wr.send_flags = IB_SEND_SIGNALED; | ||
839 | |||
840 | if (i < work_alloc) { | ||
841 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | ||
842 | work_alloc = i; | ||
843 | } | ||
844 | |||
845 | failed_wr = &first->s_wr; | ||
846 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | ||
847 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | ||
848 | first, &first->s_wr, ret, failed_wr); | ||
849 | BUG_ON(failed_wr != &first->s_wr); | ||
850 | if (ret) { | ||
851 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " | ||
852 | "returned %d\n", &conn->c_faddr, ret); | ||
853 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
854 | goto out; | ||
855 | } | ||
856 | |||
857 | if (unlikely(failed_wr != &first->s_wr)) { | ||
858 | printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); | ||
859 | BUG_ON(failed_wr != &first->s_wr); | ||
860 | } | ||
861 | |||
862 | |||
863 | out: | ||
864 | return ret; | ||
865 | } | ||
866 | |||
867 | void rds_ib_xmit_complete(struct rds_connection *conn) | ||
868 | { | ||
869 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
870 | |||
871 | /* We may have a pending ACK or window update we were unable | ||
872 | * to send previously (due to flow control). Try again. */ | ||
873 | rds_ib_attempt_ack(ic); | ||
874 | } | ||
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c new file mode 100644 index 000000000000..02e3e3d50d4a --- /dev/null +++ b/net/rds/ib_stats.c | |||
@@ -0,0 +1,95 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/percpu.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "ib.h" | ||
39 | |||
40 | DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; | ||
41 | |||
42 | static char *rds_ib_stat_names[] = { | ||
43 | "ib_connect_raced", | ||
44 | "ib_listen_closed_stale", | ||
45 | "ib_tx_cq_call", | ||
46 | "ib_tx_cq_event", | ||
47 | "ib_tx_ring_full", | ||
48 | "ib_tx_throttle", | ||
49 | "ib_tx_sg_mapping_failure", | ||
50 | "ib_tx_stalled", | ||
51 | "ib_tx_credit_updates", | ||
52 | "ib_rx_cq_call", | ||
53 | "ib_rx_cq_event", | ||
54 | "ib_rx_ring_empty", | ||
55 | "ib_rx_refill_from_cq", | ||
56 | "ib_rx_refill_from_thread", | ||
57 | "ib_rx_alloc_limit", | ||
58 | "ib_rx_credit_updates", | ||
59 | "ib_ack_sent", | ||
60 | "ib_ack_send_failure", | ||
61 | "ib_ack_send_delayed", | ||
62 | "ib_ack_send_piggybacked", | ||
63 | "ib_ack_received", | ||
64 | "ib_rdma_mr_alloc", | ||
65 | "ib_rdma_mr_free", | ||
66 | "ib_rdma_mr_used", | ||
67 | "ib_rdma_mr_pool_flush", | ||
68 | "ib_rdma_mr_pool_wait", | ||
69 | "ib_rdma_mr_pool_depleted", | ||
70 | }; | ||
71 | |||
72 | unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | ||
73 | unsigned int avail) | ||
74 | { | ||
75 | struct rds_ib_statistics stats = {0, }; | ||
76 | uint64_t *src; | ||
77 | uint64_t *sum; | ||
78 | size_t i; | ||
79 | int cpu; | ||
80 | |||
81 | if (avail < ARRAY_SIZE(rds_ib_stat_names)) | ||
82 | goto out; | ||
83 | |||
84 | for_each_online_cpu(cpu) { | ||
85 | src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); | ||
86 | sum = (uint64_t *)&stats; | ||
87 | for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) | ||
88 | *(sum++) += *(src++); | ||
89 | } | ||
90 | |||
91 | rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, | ||
92 | ARRAY_SIZE(rds_ib_stat_names)); | ||
93 | out: | ||
94 | return ARRAY_SIZE(rds_ib_stat_names); | ||
95 | } | ||
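The summation loop above relies on the statistics struct containing nothing but u64 counters, so it can be walked as a flat array. A standalone sketch of the same pattern, with a made-up three-counter struct, follows.

/*
 * Userspace sketch: fold per-"CPU" copies of a counters-only struct into
 * a total by treating each copy as an array of uint64_t.  The struct
 * layout is invented for illustration.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

struct demo_stats {
	uint64_t tx_packets;
	uint64_t rx_packets;
	uint64_t errors;
};

int main(void)
{
	struct demo_stats percpu[2] = {
		{ .tx_packets = 1, .rx_packets = 2, .errors = 0 },
		{ .tx_packets = 3, .rx_packets = 4, .errors = 1 },
	};
	struct demo_stats total;
	size_t i, cpu;

	memset(&total, 0, sizeof(total));
	for (cpu = 0; cpu < 2; cpu++) {
		uint64_t *src = (uint64_t *)&percpu[cpu];
		uint64_t *sum = (uint64_t *)&total;

		for (i = 0; i < sizeof(total) / sizeof(uint64_t); i++)
			sum[i] += src[i];
	}

	assert(total.tx_packets == 4 && total.rx_packets == 6 && total.errors == 1);
	return 0;
}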
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c new file mode 100644 index 000000000000..d87830db93a0 --- /dev/null +++ b/net/rds/ib_sysctl.c | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/sysctl.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "ib.h" | ||
38 | |||
39 | static struct ctl_table_header *rds_ib_sysctl_hdr; | ||
40 | |||
41 | unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR; | ||
42 | unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR; | ||
43 | unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; | ||
44 | static unsigned long rds_ib_sysctl_max_wr_min = 1; | ||
45 | /* hardware will fail CQ creation long before this */ | ||
46 | static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0; | ||
47 | |||
48 | unsigned long rds_ib_sysctl_max_unsig_wrs = 16; | ||
49 | static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; | ||
50 | static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; | ||
51 | |||
52 | unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); | ||
53 | static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; | ||
54 | static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; | ||
55 | |||
56 | unsigned int rds_ib_sysctl_flow_control = 1; | ||
57 | |||
58 | ctl_table rds_ib_sysctl_table[] = { | ||
59 | { | ||
60 | .ctl_name = CTL_UNNUMBERED, | ||
61 | .procname = "max_send_wr", | ||
62 | .data = &rds_ib_sysctl_max_send_wr, | ||
63 | .maxlen = sizeof(unsigned long), | ||
64 | .mode = 0644, | ||
65 | .proc_handler = &proc_doulongvec_minmax, | ||
66 | .extra1 = &rds_ib_sysctl_max_wr_min, | ||
67 | .extra2 = &rds_ib_sysctl_max_wr_max, | ||
68 | }, | ||
69 | { | ||
70 | .ctl_name = CTL_UNNUMBERED, | ||
71 | .procname = "max_recv_wr", | ||
72 | .data = &rds_ib_sysctl_max_recv_wr, | ||
73 | .maxlen = sizeof(unsigned long), | ||
74 | .mode = 0644, | ||
75 | .proc_handler = &proc_doulongvec_minmax, | ||
76 | .extra1 = &rds_ib_sysctl_max_wr_min, | ||
77 | .extra2 = &rds_ib_sysctl_max_wr_max, | ||
78 | }, | ||
79 | { | ||
80 | .ctl_name = CTL_UNNUMBERED, | ||
81 | .procname = "max_unsignaled_wr", | ||
82 | .data = &rds_ib_sysctl_max_unsig_wrs, | ||
83 | .maxlen = sizeof(unsigned long), | ||
84 | .mode = 0644, | ||
85 | .proc_handler = &proc_doulongvec_minmax, | ||
86 | .extra1 = &rds_ib_sysctl_max_unsig_wr_min, | ||
87 | .extra2 = &rds_ib_sysctl_max_unsig_wr_max, | ||
88 | }, | ||
89 | { | ||
90 | .ctl_name = CTL_UNNUMBERED, | ||
91 | .procname = "max_unsignaled_bytes", | ||
92 | .data = &rds_ib_sysctl_max_unsig_bytes, | ||
93 | .maxlen = sizeof(unsigned long), | ||
94 | .mode = 0644, | ||
95 | .proc_handler = &proc_doulongvec_minmax, | ||
96 | .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, | ||
97 | .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, | ||
98 | }, | ||
99 | { | ||
100 | .ctl_name = CTL_UNNUMBERED, | ||
101 | .procname = "max_recv_allocation", | ||
102 | .data = &rds_ib_sysctl_max_recv_allocation, | ||
103 | .maxlen = sizeof(unsigned long), | ||
104 | .mode = 0644, | ||
105 | .proc_handler = &proc_doulongvec_minmax, | ||
106 | }, | ||
107 | { | ||
108 | .ctl_name = CTL_UNNUMBERED, | ||
109 | .procname = "flow_control", | ||
110 | .data = &rds_ib_sysctl_flow_control, | ||
111 | .maxlen = sizeof(rds_ib_sysctl_flow_control), | ||
112 | .mode = 0644, | ||
113 | .proc_handler = &proc_dointvec, | ||
114 | }, | ||
115 | { .ctl_name = 0} | ||
116 | }; | ||
117 | |||
118 | static struct ctl_path rds_ib_sysctl_path[] = { | ||
119 | { .procname = "net", .ctl_name = CTL_NET, }, | ||
120 | { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, | ||
121 | { .procname = "ib", .ctl_name = CTL_UNNUMBERED, }, | ||
122 | { } | ||
123 | }; | ||
124 | |||
125 | void rds_ib_sysctl_exit(void) | ||
126 | { | ||
127 | if (rds_ib_sysctl_hdr) | ||
128 | unregister_sysctl_table(rds_ib_sysctl_hdr); | ||
129 | } | ||
130 | |||
131 | int __init rds_ib_sysctl_init(void) | ||
132 | { | ||
133 | rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); | ||
134 | if (rds_ib_sysctl_hdr == NULL) | ||
135 | return -ENOMEM; | ||
136 | return 0; | ||
137 | } | ||
diff --git a/net/rds/info.c b/net/rds/info.c new file mode 100644 index 000000000000..1d885535214d --- /dev/null +++ b/net/rds/info.c | |||
@@ -0,0 +1,241 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/percpu.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | |||
39 | /* | ||
40 | * This file implements a getsockopt() call which copies a set of fixed | ||
41 | * sized structs into a user-specified buffer as a means of providing | ||
42 | * read-only information about RDS. | ||
43 | * | ||
44 | * For a given information source there are a given number of fixed sized | ||
45 | * structs at a given time. The structs are only copied if the user-specified | ||
46 | * buffer is big enough. The destination pages that make up the buffer | ||
47 | * are pinned for the duration of the copy. | ||
48 | * | ||
49 | * This gives us the following benefits: | ||
50 | * | ||
51 | * - simple implementation, no copy "position" across multiple calls | ||
52 | * - consistent snapshot of an info source | ||
53 | * - atomic copy works well with whatever locking info source has | ||
54 | * - one portable tool to get rds info across implementations | ||
55 | * - long-lived tool can get info without allocating | ||
56 | * | ||
57 | * at the following costs: | ||
58 | * | ||
59 | * - info source copy must be pinned, may be "large" | ||
60 | */ | ||
61 | |||
62 | struct rds_info_iterator { | ||
63 | struct page **pages; | ||
64 | void *addr; | ||
65 | unsigned long offset; | ||
66 | }; | ||
67 | |||
68 | static DEFINE_SPINLOCK(rds_info_lock); | ||
69 | static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; | ||
70 | |||
71 | void rds_info_register_func(int optname, rds_info_func func) | ||
72 | { | ||
73 | int offset = optname - RDS_INFO_FIRST; | ||
74 | |||
75 | BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); | ||
76 | |||
77 | spin_lock(&rds_info_lock); | ||
78 | BUG_ON(rds_info_funcs[offset] != NULL); | ||
79 | rds_info_funcs[offset] = func; | ||
80 | spin_unlock(&rds_info_lock); | ||
81 | } | ||
82 | |||
83 | void rds_info_deregister_func(int optname, rds_info_func func) | ||
84 | { | ||
85 | int offset = optname - RDS_INFO_FIRST; | ||
86 | |||
87 | BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); | ||
88 | |||
89 | spin_lock(&rds_info_lock); | ||
90 | BUG_ON(rds_info_funcs[offset] != func); | ||
91 | rds_info_funcs[offset] = NULL; | ||
92 | spin_unlock(&rds_info_lock); | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * Typically we hold an atomic kmap across multiple rds_info_copy() calls | ||
97 | * because the kmap is so expensive. This must be called before using blocking | ||
98 | * operations while holding the mapping and as the iterator is torn down. | ||
99 | */ | ||
100 | void rds_info_iter_unmap(struct rds_info_iterator *iter) | ||
101 | { | ||
102 | if (iter->addr != NULL) { | ||
103 | kunmap_atomic(iter->addr, KM_USER0); | ||
104 | iter->addr = NULL; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * get_user_pages() called flush_dcache_page() on the pages for us. | ||
110 | */ | ||
111 | void rds_info_copy(struct rds_info_iterator *iter, void *data, | ||
112 | unsigned long bytes) | ||
113 | { | ||
114 | unsigned long this; | ||
115 | |||
116 | while (bytes) { | ||
117 | if (iter->addr == NULL) | ||
118 | iter->addr = kmap_atomic(*iter->pages, KM_USER0); | ||
119 | |||
120 | this = min(bytes, PAGE_SIZE - iter->offset); | ||
121 | |||
122 | rdsdebug("page %p addr %p offset %lu this %lu data %p " | ||
123 | "bytes %lu\n", *iter->pages, iter->addr, | ||
124 | iter->offset, this, data, bytes); | ||
125 | |||
126 | memcpy(iter->addr + iter->offset, data, this); | ||
127 | |||
128 | data += this; | ||
129 | bytes -= this; | ||
130 | iter->offset += this; | ||
131 | |||
132 | if (iter->offset == PAGE_SIZE) { | ||
133 | kunmap_atomic(iter->addr, KM_USER0); | ||
134 | iter->addr = NULL; | ||
135 | iter->offset = 0; | ||
136 | iter->pages++; | ||
137 | } | ||
138 | } | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * @optval points to the userspace buffer that the information snapshot | ||
143 | * will be copied into. | ||
144 | * | ||
145 | * @optlen on input is the size of the buffer in userspace. @optlen | ||
146 | * on output is the size of the requested snapshot in bytes. | ||
147 | * | ||
148 | * This function returns -errno if there is a failure, particularly -ENOSPC | ||
149 | * if the given userspace buffer was not large enough to fit the snapshot. | ||
150 | * On success it returns the positive number of bytes of each array element | ||
151 | * in the snapshot. | ||
152 | */ | ||
153 | int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, | ||
154 | int __user *optlen) | ||
155 | { | ||
156 | struct rds_info_iterator iter; | ||
157 | struct rds_info_lengths lens; | ||
158 | unsigned long nr_pages = 0; | ||
159 | unsigned long start; | ||
160 | unsigned long i; | ||
161 | rds_info_func func; | ||
162 | struct page **pages = NULL; | ||
163 | int ret; | ||
164 | int len; | ||
165 | int total; | ||
166 | |||
167 | if (get_user(len, optlen)) { | ||
168 | ret = -EFAULT; | ||
169 | goto out; | ||
170 | } | ||
171 | |||
172 | /* check for all kinds of wrapping and the like */ | ||
173 | start = (unsigned long)optval; | ||
174 | if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { | ||
175 | ret = -EINVAL; | ||
176 | goto out; | ||
177 | } | ||
178 | |||
179 | /* a 0 len call is just probing for the required length */ | ||
180 | if (len == 0) | ||
181 | goto call_func; | ||
182 | |||
183 | nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) | ||
184 | >> PAGE_SHIFT; | ||
185 | |||
186 | pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); | ||
187 | if (pages == NULL) { | ||
188 | ret = -ENOMEM; | ||
189 | goto out; | ||
190 | } | ||
191 | down_read(¤t->mm->mmap_sem); | ||
192 | ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0, | ||
193 | pages, NULL); | ||
194 | up_read(¤t->mm->mmap_sem); | ||
195 | if (ret != nr_pages) { | ||
196 | if (ret > 0) | ||
197 | nr_pages = ret; | ||
198 | else | ||
199 | nr_pages = 0; | ||
200 | ret = -EAGAIN; /* XXX ? */ | ||
201 | goto out; | ||
202 | } | ||
203 | |||
204 | rdsdebug("len %d nr_pages %lu\n", len, nr_pages); | ||
205 | |||
206 | call_func: | ||
207 | func = rds_info_funcs[optname - RDS_INFO_FIRST]; | ||
208 | if (func == NULL) { | ||
209 | ret = -ENOPROTOOPT; | ||
210 | goto out; | ||
211 | } | ||
212 | |||
213 | iter.pages = pages; | ||
214 | iter.addr = NULL; | ||
215 | iter.offset = start & (PAGE_SIZE - 1); | ||
216 | |||
217 | func(sock, len, &iter, &lens); | ||
218 | BUG_ON(lens.each == 0); | ||
219 | |||
220 | total = lens.nr * lens.each; | ||
221 | |||
222 | rds_info_iter_unmap(&iter); | ||
223 | |||
224 | if (total > len) { | ||
225 | len = total; | ||
226 | ret = -ENOSPC; | ||
227 | } else { | ||
228 | len = total; | ||
229 | ret = lens.each; | ||
230 | } | ||
231 | |||
232 | if (put_user(len, optlen)) | ||
233 | ret = -EFAULT; | ||
234 | |||
235 | out: | ||
236 | for (i = 0; pages != NULL && i < nr_pages; i++) | ||
237 | put_page(pages[i]); | ||
238 | kfree(pages); | ||
239 | |||
240 | return ret; | ||
241 | } | ||
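For reference, the consumer side of this interface is a plain getsockopt() call on an RDS socket: probe with a zero-length buffer, grow it when -ENOSPC comes back, and treat the positive return value as the per-element size, as documented in the comment above. The sketch below is a hypothetical userspace helper, not part of this patch; it assumes SOL_RDS and an RDS_INFO_* optname from the RDS headers.

/* Hypothetical userspace sketch: probe the snapshot size, then fetch it. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>

static int dump_rds_info(int rds_fd, int optname)
{
	socklen_t len = 0;
	void *buf = NULL, *nbuf;
	int each;

	for (;;) {
		/* A zero-length call just reports the required length. */
		each = getsockopt(rds_fd, SOL_RDS, optname, buf, &len);
		if (each >= 0)
			break;			/* len now holds the bytes used */
		if (errno != ENOSPC) {
			free(buf);
			return -1;
		}
		nbuf = realloc(buf, len);	/* kernel updated len for us */
		if (!nbuf) {
			free(buf);
			return -1;
		}
		buf = nbuf;
	}
	printf("%u elements of %d bytes each\n",
	       each ? (unsigned int)(len / each) : 0, each);
	free(buf);
	return 0;
}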
diff --git a/net/rds/info.h b/net/rds/info.h new file mode 100644 index 000000000000..b6c052ca7d22 --- /dev/null +++ b/net/rds/info.h | |||
@@ -0,0 +1,30 @@ | |||
1 | #ifndef _RDS_INFO_H | ||
2 | #define _RDS_INFO_H | ||
3 | |||
4 | struct rds_info_lengths { | ||
5 | unsigned int nr; | ||
6 | unsigned int each; | ||
7 | }; | ||
8 | |||
9 | struct rds_info_iterator; | ||
10 | |||
11 | /* | ||
12 | * These functions must fill in the fields of @lens to reflect the size | ||
13 | * of the available info source. If the snapshot fits in @len then it | ||
14 | * should be copied using @iter. The caller will deduce if it was copied | ||
15 | * or not by comparing the lengths. | ||
16 | */ | ||
17 | typedef void (*rds_info_func)(struct socket *sock, unsigned int len, | ||
18 | struct rds_info_iterator *iter, | ||
19 | struct rds_info_lengths *lens); | ||
20 | |||
21 | void rds_info_register_func(int optname, rds_info_func func); | ||
22 | void rds_info_deregister_func(int optname, rds_info_func func); | ||
23 | int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, | ||
24 | int __user *optlen); | ||
25 | void rds_info_copy(struct rds_info_iterator *iter, void *data, | ||
26 | unsigned long bytes); | ||
27 | void rds_info_iter_unmap(struct rds_info_iterator *iter); | ||
28 | |||
29 | |||
30 | #endif | ||
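To make the callback contract above concrete, here is a minimal, hypothetical info source; the struct, the item count and RDS_INFO_EXAMPLE are invented for illustration. It counts every item so the caller can size its buffer, but only copies through rds_info_copy() while the user-supplied @len still has room, which is the pattern the real providers follow.

/* Hypothetical info source: one fixed-size struct per tracked item. */
struct rds_info_example {
	unsigned int	ex_id;
	unsigned int	ex_value;
};

static void rds_example_info(struct socket *sock, unsigned int len,
			     struct rds_info_iterator *iter,
			     struct rds_info_lengths *lens)
{
	struct rds_info_example ex;
	unsigned int nr = 0;
	unsigned int i;

	/* Pretend we track 16 items; a real source walks its own list
	 * under whatever lock it already holds. */
	for (i = 0; i < 16; i++) {
		ex.ex_id = i;
		ex.ex_value = 42;
		if (len >= (nr + 1) * sizeof(ex))
			rds_info_copy(iter, &ex, sizeof(ex));
		nr++;
	}

	lens->nr = nr;
	lens->each = sizeof(ex);
}

/* Registered once at init time, e.g.:
 *	rds_info_register_func(RDS_INFO_EXAMPLE, rds_example_info);
 * and removed with rds_info_deregister_func() on exit.
 */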
diff --git a/net/rds/iw.c b/net/rds/iw.c new file mode 100644 index 000000000000..1b56905c4c08 --- /dev/null +++ b/net/rds/iw.c | |||
@@ -0,0 +1,333 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <linux/if.h> | ||
36 | #include <linux/netdevice.h> | ||
37 | #include <linux/inetdevice.h> | ||
38 | #include <linux/if_arp.h> | ||
39 | #include <linux/delay.h> | ||
40 | |||
41 | #include "rds.h" | ||
42 | #include "iw.h" | ||
43 | |||
44 | unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; | ||
45 | unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ | ||
46 | |||
47 | module_param(fastreg_pool_size, int, 0444); | ||
48 | MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); | ||
49 | module_param(fastreg_message_size, int, 0444); | ||
50 | MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); | ||
51 | |||
52 | struct list_head rds_iw_devices; | ||
53 | |||
54 | DEFINE_SPINLOCK(iw_nodev_conns_lock); | ||
55 | LIST_HEAD(iw_nodev_conns); | ||
56 | |||
57 | void rds_iw_add_one(struct ib_device *device) | ||
58 | { | ||
59 | struct rds_iw_device *rds_iwdev; | ||
60 | struct ib_device_attr *dev_attr; | ||
61 | |||
62 | /* Only handle iwarp devices */ | ||
63 | if (device->node_type != RDMA_NODE_RNIC) | ||
64 | return; | ||
65 | |||
66 | dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); | ||
67 | if (!dev_attr) | ||
68 | return; | ||
69 | |||
70 | if (ib_query_device(device, dev_attr)) { | ||
71 | rdsdebug("Query device failed for %s\n", device->name); | ||
72 | goto free_attr; | ||
73 | } | ||
74 | |||
75 | rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); | ||
76 | if (!rds_iwdev) | ||
77 | goto free_attr; | ||
78 | |||
79 | spin_lock_init(&rds_iwdev->spinlock); | ||
80 | |||
81 | rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); | ||
82 | rds_iwdev->max_wrs = dev_attr->max_qp_wr; | ||
83 | rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); | ||
84 | |||
85 | rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1); | ||
86 | |||
87 | rds_iwdev->dev = device; | ||
88 | rds_iwdev->pd = ib_alloc_pd(device); | ||
89 | if (IS_ERR(rds_iwdev->pd)) | ||
90 | goto free_dev; | ||
91 | |||
92 | if (!rds_iwdev->dma_local_lkey) { | ||
93 | if (device->node_type != RDMA_NODE_RNIC) { | ||
94 | rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, | ||
95 | IB_ACCESS_LOCAL_WRITE); | ||
96 | } else { | ||
97 | rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, | ||
98 | IB_ACCESS_REMOTE_READ | | ||
99 | IB_ACCESS_REMOTE_WRITE | | ||
100 | IB_ACCESS_LOCAL_WRITE); | ||
101 | } | ||
102 | if (IS_ERR(rds_iwdev->mr)) | ||
103 | goto err_pd; | ||
104 | } else | ||
105 | rds_iwdev->mr = NULL; | ||
106 | |||
107 | rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); | ||
108 | if (IS_ERR(rds_iwdev->mr_pool)) { | ||
109 | rds_iwdev->mr_pool = NULL; | ||
110 | goto err_mr; | ||
111 | } | ||
112 | |||
113 | INIT_LIST_HEAD(&rds_iwdev->cm_id_list); | ||
114 | INIT_LIST_HEAD(&rds_iwdev->conn_list); | ||
115 | list_add_tail(&rds_iwdev->list, &rds_iw_devices); | ||
116 | |||
117 | ib_set_client_data(device, &rds_iw_client, rds_iwdev); | ||
118 | |||
119 | goto free_attr; | ||
120 | |||
121 | err_mr: | ||
122 | if (rds_iwdev->mr) | ||
123 | ib_dereg_mr(rds_iwdev->mr); | ||
124 | err_pd: | ||
125 | ib_dealloc_pd(rds_iwdev->pd); | ||
126 | free_dev: | ||
127 | kfree(rds_iwdev); | ||
128 | free_attr: | ||
129 | kfree(dev_attr); | ||
130 | } | ||
131 | |||
132 | void rds_iw_remove_one(struct ib_device *device) | ||
133 | { | ||
134 | struct rds_iw_device *rds_iwdev; | ||
135 | struct rds_iw_cm_id *i_cm_id, *next; | ||
136 | |||
137 | rds_iwdev = ib_get_client_data(device, &rds_iw_client); | ||
138 | if (!rds_iwdev) | ||
139 | return; | ||
140 | |||
141 | spin_lock_irq(&rds_iwdev->spinlock); | ||
142 | list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { | ||
143 | list_del(&i_cm_id->list); | ||
144 | kfree(i_cm_id); | ||
145 | } | ||
146 | spin_unlock_irq(&rds_iwdev->spinlock); | ||
147 | |||
148 | rds_iw_remove_conns(rds_iwdev); | ||
149 | |||
150 | if (rds_iwdev->mr_pool) | ||
151 | rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); | ||
152 | |||
153 | if (rds_iwdev->mr) | ||
154 | ib_dereg_mr(rds_iwdev->mr); | ||
155 | |||
156 | while (ib_dealloc_pd(rds_iwdev->pd)) { | ||
157 | rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); | ||
158 | msleep(1); | ||
159 | } | ||
160 | |||
161 | list_del(&rds_iwdev->list); | ||
162 | kfree(rds_iwdev); | ||
163 | } | ||
164 | |||
165 | struct ib_client rds_iw_client = { | ||
166 | .name = "rds_iw", | ||
167 | .add = rds_iw_add_one, | ||
168 | .remove = rds_iw_remove_one | ||
169 | }; | ||
170 | |||
171 | static int rds_iw_conn_info_visitor(struct rds_connection *conn, | ||
172 | void *buffer) | ||
173 | { | ||
174 | struct rds_info_rdma_connection *iinfo = buffer; | ||
175 | struct rds_iw_connection *ic; | ||
176 | |||
177 | /* We will only ever look at iWARP transports */ | ||
178 | if (conn->c_trans != &rds_iw_transport) | ||
179 | return 0; | ||
180 | |||
181 | iinfo->src_addr = conn->c_laddr; | ||
182 | iinfo->dst_addr = conn->c_faddr; | ||
183 | |||
184 | memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); | ||
185 | memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); | ||
186 | if (rds_conn_state(conn) == RDS_CONN_UP) { | ||
187 | struct rds_iw_device *rds_iwdev; | ||
188 | struct rdma_dev_addr *dev_addr; | ||
189 | |||
190 | ic = conn->c_transport_data; | ||
191 | dev_addr = &ic->i_cm_id->route.addr.dev_addr; | ||
192 | |||
193 | ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); | ||
194 | ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); | ||
195 | |||
196 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); | ||
197 | iinfo->max_send_wr = ic->i_send_ring.w_nr; | ||
198 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; | ||
199 | iinfo->max_send_sge = rds_iwdev->max_sge; | ||
200 | rds_iw_get_mr_info(rds_iwdev, iinfo); | ||
201 | } | ||
202 | return 1; | ||
203 | } | ||
204 | |||
205 | static void rds_iw_ic_info(struct socket *sock, unsigned int len, | ||
206 | struct rds_info_iterator *iter, | ||
207 | struct rds_info_lengths *lens) | ||
208 | { | ||
209 | rds_for_each_conn_info(sock, len, iter, lens, | ||
210 | rds_iw_conn_info_visitor, | ||
211 | sizeof(struct rds_info_rdma_connection)); | ||
212 | } | ||
213 | |||
214 | |||
215 | /* | ||
216 | * Early RDS/IB was built to only bind to an address if there is an IPoIB | ||
217 | * device with that address set. | ||
218 | * | ||
219 | * If it were me, I'd advocate for something more flexible. Sending and | ||
220 | * receiving should be device-agnostic. Transports would try and maintain | ||
221 | * connections between peers who have messages queued. Userspace would be | ||
222 | * allowed to influence which paths have priority. We could call userspace | ||
223 | * asserting this policy "routing". | ||
224 | */ | ||
225 | static int rds_iw_laddr_check(__be32 addr) | ||
226 | { | ||
227 | int ret; | ||
228 | struct rdma_cm_id *cm_id; | ||
229 | struct sockaddr_in sin; | ||
230 | |||
231 | /* Create a CMA ID and try to bind it. This catches both | ||
232 | * IB and iWARP capable NICs. | ||
233 | */ | ||
234 | cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); | ||
235 | if (!cm_id) | ||
236 | return -EADDRNOTAVAIL; | ||
237 | |||
238 | memset(&sin, 0, sizeof(sin)); | ||
239 | sin.sin_family = AF_INET; | ||
240 | sin.sin_addr.s_addr = addr; | ||
241 | |||
242 | /* rdma_bind_addr will only succeed for IB & iWARP devices */ | ||
243 | ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); | ||
244 | /* due to this, we will claim to support IB devices unless we | ||
245 | check node_type. */ | ||
246 | if (ret || cm_id->device->node_type != RDMA_NODE_RNIC) | ||
247 | ret = -EADDRNOTAVAIL; | ||
248 | |||
249 | rdsdebug("addr %pI4 ret %d node type %d\n", | ||
250 | &addr, ret, | ||
251 | cm_id->device ? cm_id->device->node_type : -1); | ||
252 | |||
253 | rdma_destroy_id(cm_id); | ||
254 | |||
255 | return ret; | ||
256 | } | ||
257 | |||
258 | void rds_iw_exit(void) | ||
259 | { | ||
260 | rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); | ||
261 | rds_iw_remove_nodev_conns(); | ||
262 | ib_unregister_client(&rds_iw_client); | ||
263 | rds_iw_sysctl_exit(); | ||
264 | rds_iw_recv_exit(); | ||
265 | rds_trans_unregister(&rds_iw_transport); | ||
266 | } | ||
267 | |||
268 | struct rds_transport rds_iw_transport = { | ||
269 | .laddr_check = rds_iw_laddr_check, | ||
270 | .xmit_complete = rds_iw_xmit_complete, | ||
271 | .xmit = rds_iw_xmit, | ||
272 | .xmit_cong_map = NULL, | ||
273 | .xmit_rdma = rds_iw_xmit_rdma, | ||
274 | .recv = rds_iw_recv, | ||
275 | .conn_alloc = rds_iw_conn_alloc, | ||
276 | .conn_free = rds_iw_conn_free, | ||
277 | .conn_connect = rds_iw_conn_connect, | ||
278 | .conn_shutdown = rds_iw_conn_shutdown, | ||
279 | .inc_copy_to_user = rds_iw_inc_copy_to_user, | ||
280 | .inc_purge = rds_iw_inc_purge, | ||
281 | .inc_free = rds_iw_inc_free, | ||
282 | .cm_initiate_connect = rds_iw_cm_initiate_connect, | ||
283 | .cm_handle_connect = rds_iw_cm_handle_connect, | ||
284 | .cm_connect_complete = rds_iw_cm_connect_complete, | ||
285 | .stats_info_copy = rds_iw_stats_info_copy, | ||
286 | .exit = rds_iw_exit, | ||
287 | .get_mr = rds_iw_get_mr, | ||
288 | .sync_mr = rds_iw_sync_mr, | ||
289 | .free_mr = rds_iw_free_mr, | ||
290 | .flush_mrs = rds_iw_flush_mrs, | ||
291 | .t_owner = THIS_MODULE, | ||
292 | .t_name = "iwarp", | ||
293 | .t_prefer_loopback = 1, | ||
294 | }; | ||
295 | |||
296 | int __init rds_iw_init(void) | ||
297 | { | ||
298 | int ret; | ||
299 | |||
300 | INIT_LIST_HEAD(&rds_iw_devices); | ||
301 | |||
302 | ret = ib_register_client(&rds_iw_client); | ||
303 | if (ret) | ||
304 | goto out; | ||
305 | |||
306 | ret = rds_iw_sysctl_init(); | ||
307 | if (ret) | ||
308 | goto out_ibreg; | ||
309 | |||
310 | ret = rds_iw_recv_init(); | ||
311 | if (ret) | ||
312 | goto out_sysctl; | ||
313 | |||
314 | ret = rds_trans_register(&rds_iw_transport); | ||
315 | if (ret) | ||
316 | goto out_recv; | ||
317 | |||
318 | rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); | ||
319 | |||
320 | goto out; | ||
321 | |||
322 | out_recv: | ||
323 | rds_iw_recv_exit(); | ||
324 | out_sysctl: | ||
325 | rds_iw_sysctl_exit(); | ||
326 | out_ibreg: | ||
327 | ib_unregister_client(&rds_iw_client); | ||
328 | out: | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | MODULE_LICENSE("GPL"); | ||
333 | |||
diff --git a/net/rds/iw.h b/net/rds/iw.h new file mode 100644 index 000000000000..0ddda34f2a1c --- /dev/null +++ b/net/rds/iw.h | |||
@@ -0,0 +1,395 @@ | |||
1 | #ifndef _RDS_IW_H | ||
2 | #define _RDS_IW_H | ||
3 | |||
4 | #include <rdma/ib_verbs.h> | ||
5 | #include <rdma/rdma_cm.h> | ||
6 | #include "rds.h" | ||
7 | #include "rdma_transport.h" | ||
8 | |||
9 | #define RDS_FASTREG_SIZE 20 | ||
10 | #define RDS_FASTREG_POOL_SIZE 2048 | ||
11 | |||
12 | #define RDS_IW_MAX_SGE 8 | ||
13 | #define RDS_IW_RECV_SGE 2 | ||
14 | |||
15 | #define RDS_IW_DEFAULT_RECV_WR 1024 | ||
16 | #define RDS_IW_DEFAULT_SEND_WR 256 | ||
17 | |||
18 | #define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ | ||
19 | |||
20 | extern struct list_head rds_iw_devices; | ||
21 | |||
22 | /* | ||
23 | * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to | ||
24 | * try to minimize the amount of memory tied up in both the device and | ||
25 | * socket receive queues. | ||
26 | */ | ||
27 | /* page offset of the final full frag that fits in the page */ | ||
28 | #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) | ||
29 | struct rds_page_frag { | ||
30 | struct list_head f_item; | ||
31 | struct page *f_page; | ||
32 | unsigned long f_offset; | ||
33 | dma_addr_t f_mapped; | ||
34 | }; | ||
35 | |||
36 | struct rds_iw_incoming { | ||
37 | struct list_head ii_frags; | ||
38 | struct rds_incoming ii_inc; | ||
39 | }; | ||
40 | |||
41 | struct rds_iw_connect_private { | ||
42 | /* Add new fields at the end, and don't permute existing fields. */ | ||
43 | __be32 dp_saddr; | ||
44 | __be32 dp_daddr; | ||
45 | u8 dp_protocol_major; | ||
46 | u8 dp_protocol_minor; | ||
47 | __be16 dp_protocol_minor_mask; /* bitmask */ | ||
48 | __be32 dp_reserved1; | ||
49 | __be64 dp_ack_seq; | ||
50 | __be32 dp_credit; /* non-zero enables flow ctl */ | ||
51 | }; | ||
52 | |||
53 | struct rds_iw_scatterlist { | ||
54 | struct scatterlist *list; | ||
55 | unsigned int len; | ||
56 | int dma_len; | ||
57 | unsigned int dma_npages; | ||
58 | unsigned int bytes; | ||
59 | }; | ||
60 | |||
61 | struct rds_iw_mapping { | ||
62 | spinlock_t m_lock; /* protect the mapping struct */ | ||
63 | struct list_head m_list; | ||
64 | struct rds_iw_mr *m_mr; | ||
65 | uint32_t m_rkey; | ||
66 | struct rds_iw_scatterlist m_sg; | ||
67 | }; | ||
68 | |||
69 | struct rds_iw_send_work { | ||
70 | struct rds_message *s_rm; | ||
71 | |||
72 | /* We should really put these into a union: */ | ||
73 | struct rds_rdma_op *s_op; | ||
74 | struct rds_iw_mapping *s_mapping; | ||
75 | struct ib_mr *s_mr; | ||
76 | struct ib_fast_reg_page_list *s_page_list; | ||
77 | unsigned char s_remap_count; | ||
78 | |||
79 | struct ib_send_wr s_wr; | ||
80 | struct ib_sge s_sge[RDS_IW_MAX_SGE]; | ||
81 | unsigned long s_queued; | ||
82 | }; | ||
83 | |||
84 | struct rds_iw_recv_work { | ||
85 | struct rds_iw_incoming *r_iwinc; | ||
86 | struct rds_page_frag *r_frag; | ||
87 | struct ib_recv_wr r_wr; | ||
88 | struct ib_sge r_sge[2]; | ||
89 | }; | ||
90 | |||
91 | struct rds_iw_work_ring { | ||
92 | u32 w_nr; | ||
93 | u32 w_alloc_ptr; | ||
94 | u32 w_alloc_ctr; | ||
95 | u32 w_free_ptr; | ||
96 | atomic_t w_free_ctr; | ||
97 | }; | ||
98 | |||
99 | struct rds_iw_device; | ||
100 | |||
101 | struct rds_iw_connection { | ||
102 | |||
103 | struct list_head iw_node; | ||
104 | struct rds_iw_device *rds_iwdev; | ||
105 | struct rds_connection *conn; | ||
106 | |||
107 | /* alphabet soup, IBTA style */ | ||
108 | struct rdma_cm_id *i_cm_id; | ||
109 | struct ib_pd *i_pd; | ||
110 | struct ib_mr *i_mr; | ||
111 | struct ib_cq *i_send_cq; | ||
112 | struct ib_cq *i_recv_cq; | ||
113 | |||
114 | /* tx */ | ||
115 | struct rds_iw_work_ring i_send_ring; | ||
116 | struct rds_message *i_rm; | ||
117 | struct rds_header *i_send_hdrs; | ||
118 | u64 i_send_hdrs_dma; | ||
119 | struct rds_iw_send_work *i_sends; | ||
120 | |||
121 | /* rx */ | ||
122 | struct mutex i_recv_mutex; | ||
123 | struct rds_iw_work_ring i_recv_ring; | ||
124 | struct rds_iw_incoming *i_iwinc; | ||
125 | u32 i_recv_data_rem; | ||
126 | struct rds_header *i_recv_hdrs; | ||
127 | u64 i_recv_hdrs_dma; | ||
128 | struct rds_iw_recv_work *i_recvs; | ||
129 | struct rds_page_frag i_frag; | ||
130 | u64 i_ack_recv; /* last ACK received */ | ||
131 | |||
132 | /* sending acks */ | ||
133 | unsigned long i_ack_flags; | ||
134 | u64 i_ack_next; /* next ACK to send */ | ||
135 | struct rds_header *i_ack; | ||
136 | struct ib_send_wr i_ack_wr; | ||
137 | struct ib_sge i_ack_sge; | ||
138 | u64 i_ack_dma; | ||
139 | unsigned long i_ack_queued; | ||
140 | |||
141 | /* Flow control related information | ||
142 | * | ||
143 | * Our algorithm uses a pair of variables that we need to access | ||
144 | * atomically - one for the send credits, and one for the posted | ||
145 | * recv credits we need to transfer to the remote. | ||
146 | * Rather than protect them with a slow spinlock, we put both into | ||
147 | * a single atomic_t and update it using cmpxchg. | ||
148 | */ | ||
149 | atomic_t i_credits; | ||
150 | |||
151 | /* Protocol version specific information */ | ||
152 | unsigned int i_flowctl:1; /* enable/disable flow ctl */ | ||
153 | unsigned int i_dma_local_lkey:1; | ||
154 | unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ | ||
155 | /* Batched completions */ | ||
156 | unsigned int i_unsignaled_wrs; | ||
157 | long i_unsignaled_bytes; | ||
158 | }; | ||
159 | |||
160 | /* This assumes that atomic_t is at least 32 bits */ | ||
161 | #define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) | ||
162 | #define IB_GET_POST_CREDITS(v) ((v) >> 16) | ||
163 | #define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) | ||
164 | #define IB_SET_POST_CREDITS(v) ((v) << 16) | ||
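A worked illustration of the credit packing described above: send credits live in the low 16 bits and posted-receive credits in the high 16 bits of i_credits, so both counters can be read and updated together. The helper below is a hypothetical sketch of such a cmpxchg update, not code from this patch.

/* Hypothetical sketch: bump both halves of the packed credit word atomically. */
static inline void rds_iw_credits_sketch_add(atomic_t *credits,
					     unsigned int new_send,
					     unsigned int newly_posted)
{
	unsigned int oldval, newval;

	do {
		oldval = atomic_read(credits);
		newval = IB_SET_SEND_CREDITS(IB_GET_SEND_CREDITS(oldval) + new_send) |
			 IB_SET_POST_CREDITS(IB_GET_POST_CREDITS(oldval) + newly_posted);
	} while (atomic_cmpxchg(credits, oldval, newval) != oldval);
}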
165 | |||
166 | struct rds_iw_cm_id { | ||
167 | struct list_head list; | ||
168 | struct rdma_cm_id *cm_id; | ||
169 | }; | ||
170 | |||
171 | struct rds_iw_device { | ||
172 | struct list_head list; | ||
173 | struct list_head cm_id_list; | ||
174 | struct list_head conn_list; | ||
175 | struct ib_device *dev; | ||
176 | struct ib_pd *pd; | ||
177 | struct ib_mr *mr; | ||
178 | struct rds_iw_mr_pool *mr_pool; | ||
179 | int page_shift; | ||
180 | int max_sge; | ||
181 | unsigned int max_wrs; | ||
182 | unsigned int dma_local_lkey:1; | ||
183 | spinlock_t spinlock; /* protect the above */ | ||
184 | }; | ||
185 | |||
186 | /* bits for i_ack_flags */ | ||
187 | #define IB_ACK_IN_FLIGHT 0 | ||
188 | #define IB_ACK_REQUESTED 1 | ||
189 | |||
190 | /* Magic WR_ID for ACKs */ | ||
191 | #define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) | ||
192 | #define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL) | ||
193 | #define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) | ||
194 | |||
195 | struct rds_iw_statistics { | ||
196 | uint64_t s_iw_connect_raced; | ||
197 | uint64_t s_iw_listen_closed_stale; | ||
198 | uint64_t s_iw_tx_cq_call; | ||
199 | uint64_t s_iw_tx_cq_event; | ||
200 | uint64_t s_iw_tx_ring_full; | ||
201 | uint64_t s_iw_tx_throttle; | ||
202 | uint64_t s_iw_tx_sg_mapping_failure; | ||
203 | uint64_t s_iw_tx_stalled; | ||
204 | uint64_t s_iw_tx_credit_updates; | ||
205 | uint64_t s_iw_rx_cq_call; | ||
206 | uint64_t s_iw_rx_cq_event; | ||
207 | uint64_t s_iw_rx_ring_empty; | ||
208 | uint64_t s_iw_rx_refill_from_cq; | ||
209 | uint64_t s_iw_rx_refill_from_thread; | ||
210 | uint64_t s_iw_rx_alloc_limit; | ||
211 | uint64_t s_iw_rx_credit_updates; | ||
212 | uint64_t s_iw_ack_sent; | ||
213 | uint64_t s_iw_ack_send_failure; | ||
214 | uint64_t s_iw_ack_send_delayed; | ||
215 | uint64_t s_iw_ack_send_piggybacked; | ||
216 | uint64_t s_iw_ack_received; | ||
217 | uint64_t s_iw_rdma_mr_alloc; | ||
218 | uint64_t s_iw_rdma_mr_free; | ||
219 | uint64_t s_iw_rdma_mr_used; | ||
220 | uint64_t s_iw_rdma_mr_pool_flush; | ||
221 | uint64_t s_iw_rdma_mr_pool_wait; | ||
222 | uint64_t s_iw_rdma_mr_pool_depleted; | ||
223 | }; | ||
224 | |||
225 | extern struct workqueue_struct *rds_iw_wq; | ||
226 | |||
227 | /* | ||
228 | * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h | ||
229 | * doesn't define it. | ||
230 | */ | ||
231 | static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, | ||
232 | struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||
233 | { | ||
234 | unsigned int i; | ||
235 | |||
236 | for (i = 0; i < sg_dma_len; ++i) { | ||
237 | ib_dma_sync_single_for_cpu(dev, | ||
238 | ib_sg_dma_address(dev, &sg[i]), | ||
239 | ib_sg_dma_len(dev, &sg[i]), | ||
240 | direction); | ||
241 | } | ||
242 | } | ||
243 | #define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu | ||
244 | |||
245 | static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, | ||
246 | struct scatterlist *sg, unsigned int sg_dma_len, int direction) | ||
247 | { | ||
248 | unsigned int i; | ||
249 | |||
250 | for (i = 0; i < sg_dma_len; ++i) { | ||
251 | ib_dma_sync_single_for_device(dev, | ||
252 | ib_sg_dma_address(dev, &sg[i]), | ||
253 | ib_sg_dma_len(dev, &sg[i]), | ||
254 | direction); | ||
255 | } | ||
256 | } | ||
257 | #define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device | ||
258 | |||
259 | static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) | ||
260 | { | ||
261 | return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; | ||
262 | } | ||
263 | |||
264 | /* ib.c */ | ||
265 | extern struct rds_transport rds_iw_transport; | ||
266 | extern void rds_iw_add_one(struct ib_device *device); | ||
267 | extern void rds_iw_remove_one(struct ib_device *device); | ||
268 | extern struct ib_client rds_iw_client; | ||
269 | |||
270 | extern unsigned int fastreg_pool_size; | ||
271 | extern unsigned int fastreg_message_size; | ||
272 | |||
273 | extern spinlock_t iw_nodev_conns_lock; | ||
274 | extern struct list_head iw_nodev_conns; | ||
275 | |||
276 | /* ib_cm.c */ | ||
277 | int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); | ||
278 | void rds_iw_conn_free(void *arg); | ||
279 | int rds_iw_conn_connect(struct rds_connection *conn); | ||
280 | void rds_iw_conn_shutdown(struct rds_connection *conn); | ||
281 | void rds_iw_state_change(struct sock *sk); | ||
282 | int __init rds_iw_listen_init(void); | ||
283 | void rds_iw_listen_stop(void); | ||
284 | void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); | ||
285 | int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, | ||
286 | struct rdma_cm_event *event); | ||
287 | int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); | ||
288 | void rds_iw_cm_connect_complete(struct rds_connection *conn, | ||
289 | struct rdma_cm_event *event); | ||
290 | |||
291 | |||
292 | #define rds_iw_conn_error(conn, fmt...) \ | ||
293 | __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) | ||
294 | |||
295 | /* ib_rdma.c */ | ||
296 | int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); | ||
297 | int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); | ||
298 | void rds_iw_remove_nodev_conns(void); | ||
299 | void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev); | ||
300 | struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); | ||
301 | void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); | ||
302 | void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); | ||
303 | void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, | ||
304 | struct rds_sock *rs, u32 *key_ret); | ||
305 | void rds_iw_sync_mr(void *trans_private, int dir); | ||
306 | void rds_iw_free_mr(void *trans_private, int invalidate); | ||
307 | void rds_iw_flush_mrs(void); | ||
308 | void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); | ||
309 | |||
310 | /* ib_recv.c */ | ||
311 | int __init rds_iw_recv_init(void); | ||
312 | void rds_iw_recv_exit(void); | ||
313 | int rds_iw_recv(struct rds_connection *conn); | ||
314 | int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | ||
315 | gfp_t page_gfp, int prefill); | ||
316 | void rds_iw_inc_purge(struct rds_incoming *inc); | ||
317 | void rds_iw_inc_free(struct rds_incoming *inc); | ||
318 | int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | ||
319 | size_t size); | ||
320 | void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); | ||
321 | void rds_iw_recv_init_ring(struct rds_iw_connection *ic); | ||
322 | void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); | ||
323 | void rds_iw_recv_init_ack(struct rds_iw_connection *ic); | ||
324 | void rds_iw_attempt_ack(struct rds_iw_connection *ic); | ||
325 | void rds_iw_ack_send_complete(struct rds_iw_connection *ic); | ||
326 | u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); | ||
327 | |||
328 | /* ib_ring.c */ | ||
329 | void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); | ||
330 | void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); | ||
331 | u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); | ||
332 | void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); | ||
333 | void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); | ||
334 | int rds_iw_ring_empty(struct rds_iw_work_ring *ring); | ||
335 | int rds_iw_ring_low(struct rds_iw_work_ring *ring); | ||
336 | u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); | ||
337 | u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); | ||
338 | extern wait_queue_head_t rds_iw_ring_empty_wait; | ||
339 | |||
340 | /* ib_send.c */ | ||
341 | void rds_iw_xmit_complete(struct rds_connection *conn); | ||
342 | int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
343 | unsigned int hdr_off, unsigned int sg, unsigned int off); | ||
344 | void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); | ||
345 | void rds_iw_send_init_ring(struct rds_iw_connection *ic); | ||
346 | void rds_iw_send_clear_ring(struct rds_iw_connection *ic); | ||
347 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | ||
348 | void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); | ||
349 | void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); | ||
350 | int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, | ||
351 | u32 *adv_credits, int need_posted); | ||
352 | |||
353 | /* ib_stats.c */ | ||
354 | DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); | ||
355 | #define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) | ||
356 | unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, | ||
357 | unsigned int avail); | ||
358 | |||
359 | /* ib_sysctl.c */ | ||
360 | int __init rds_iw_sysctl_init(void); | ||
361 | void rds_iw_sysctl_exit(void); | ||
362 | extern unsigned long rds_iw_sysctl_max_send_wr; | ||
363 | extern unsigned long rds_iw_sysctl_max_recv_wr; | ||
364 | extern unsigned long rds_iw_sysctl_max_unsig_wrs; | ||
365 | extern unsigned long rds_iw_sysctl_max_unsig_bytes; | ||
366 | extern unsigned long rds_iw_sysctl_max_recv_allocation; | ||
367 | extern unsigned int rds_iw_sysctl_flow_control; | ||
368 | extern ctl_table rds_iw_sysctl_table[]; | ||
369 | |||
370 | /* | ||
371 | * Helper functions for getting/setting the header and data SGEs in | ||
372 | * RDS packets (not RDMA) | ||
373 | */ | ||
374 | static inline struct ib_sge * | ||
375 | rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) | ||
376 | { | ||
377 | return &sge[0]; | ||
378 | } | ||
379 | |||
380 | static inline struct ib_sge * | ||
381 | rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) | ||
382 | { | ||
383 | return &sge[1]; | ||
384 | } | ||
385 | |||
386 | static inline void rds_iw_set_64bit(u64 *ptr, u64 val) | ||
387 | { | ||
388 | #if BITS_PER_LONG == 64 | ||
389 | *ptr = val; | ||
390 | #else | ||
391 | set_64bit(ptr, val); | ||
392 | #endif | ||
393 | } | ||
394 | |||
395 | #endif | ||
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c new file mode 100644 index 000000000000..57ecb3d4b8a5 --- /dev/null +++ b/net/rds/iw_cm.c | |||
@@ -0,0 +1,750 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <linux/vmalloc.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "iw.h" | ||
39 | |||
40 | /* | ||
41 | * Set the selected protocol version | ||
42 | */ | ||
43 | static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) | ||
44 | { | ||
45 | conn->c_version = version; | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * Set up flow control | ||
50 | */ | ||
51 | static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) | ||
52 | { | ||
53 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
54 | |||
55 | if (rds_iw_sysctl_flow_control && credits != 0) { | ||
56 | /* We're doing flow control */ | ||
57 | ic->i_flowctl = 1; | ||
58 | rds_iw_send_add_credits(conn, credits); | ||
59 | } else { | ||
60 | ic->i_flowctl = 0; | ||
61 | } | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * Connection established. | ||
66 | * We get here for both outgoing and incoming connections. | ||
67 | */ | ||
68 | void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) | ||
69 | { | ||
70 | const struct rds_iw_connect_private *dp = NULL; | ||
71 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
72 | struct rds_iw_device *rds_iwdev; | ||
73 | int err; | ||
74 | |||
75 | if (event->param.conn.private_data_len) { | ||
76 | dp = event->param.conn.private_data; | ||
77 | |||
78 | rds_iw_set_protocol(conn, | ||
79 | RDS_PROTOCOL(dp->dp_protocol_major, | ||
80 | dp->dp_protocol_minor)); | ||
81 | rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); | ||
82 | } | ||
83 | |||
84 | /* update ib_device with this local ipaddr & conn */ | ||
85 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); | ||
86 | err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); | ||
87 | if (err) | ||
88 | printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); | ||
89 | err = rds_iw_add_conn(rds_iwdev, conn); | ||
90 | if (err) | ||
91 | printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err); | ||
92 | |||
93 | /* If the peer gave us the last packet it saw, process this as if | ||
94 | * we had received a regular ACK. */ | ||
95 | if (dp && dp->dp_ack_seq) | ||
96 | rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); | ||
97 | |||
98 | printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", | ||
99 | &conn->c_laddr, &conn->c_faddr, | ||
100 | RDS_PROTOCOL_MAJOR(conn->c_version), | ||
101 | RDS_PROTOCOL_MINOR(conn->c_version), | ||
102 | ic->i_flowctl ? ", flow control" : ""); | ||
103 | |||
104 | rds_connect_complete(conn); | ||
105 | } | ||
106 | |||
107 | static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, | ||
108 | struct rdma_conn_param *conn_param, | ||
109 | struct rds_iw_connect_private *dp, | ||
110 | u32 protocol_version) | ||
111 | { | ||
112 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
113 | |||
114 | memset(conn_param, 0, sizeof(struct rdma_conn_param)); | ||
115 | /* XXX tune these? */ | ||
116 | conn_param->responder_resources = 1; | ||
117 | conn_param->initiator_depth = 1; | ||
118 | |||
119 | if (dp) { | ||
120 | memset(dp, 0, sizeof(*dp)); | ||
121 | dp->dp_saddr = conn->c_laddr; | ||
122 | dp->dp_daddr = conn->c_faddr; | ||
123 | dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); | ||
124 | dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); | ||
125 | dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); | ||
126 | dp->dp_ack_seq = rds_iw_piggyb_ack(ic); | ||
127 | |||
128 | /* Advertise flow control */ | ||
129 | if (ic->i_flowctl) { | ||
130 | unsigned int credits; | ||
131 | |||
132 | credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); | ||
133 | dp->dp_credit = cpu_to_be32(credits); | ||
134 | atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); | ||
135 | } | ||
136 | |||
137 | conn_param->private_data = dp; | ||
138 | conn_param->private_data_len = sizeof(*dp); | ||
139 | } | ||
140 | } | ||
141 | |||
142 | static void rds_iw_cq_event_handler(struct ib_event *event, void *data) | ||
143 | { | ||
144 | rdsdebug("event %u data %p\n", event->event, data); | ||
145 | } | ||
146 | |||
147 | static void rds_iw_qp_event_handler(struct ib_event *event, void *data) | ||
148 | { | ||
149 | struct rds_connection *conn = data; | ||
150 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
151 | |||
152 | rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); | ||
153 | |||
154 | switch (event->event) { | ||
155 | case IB_EVENT_COMM_EST: | ||
156 | rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); | ||
157 | break; | ||
158 | case IB_EVENT_QP_REQ_ERR: | ||
159 | case IB_EVENT_QP_FATAL: | ||
160 | default: | ||
161 | rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n", | ||
162 | event->event, &conn->c_laddr, | ||
163 | &conn->c_faddr); | ||
164 | break; | ||
165 | } | ||
166 | } | ||
167 | |||
168 | /* | ||
169 | * Create a QP | ||
170 | */ | ||
171 | static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, | ||
172 | struct rds_iw_device *rds_iwdev, | ||
173 | struct rds_iw_work_ring *send_ring, | ||
174 | void (*send_cq_handler)(struct ib_cq *, void *), | ||
175 | struct rds_iw_work_ring *recv_ring, | ||
176 | void (*recv_cq_handler)(struct ib_cq *, void *), | ||
177 | void *context) | ||
178 | { | ||
179 | struct ib_device *dev = rds_iwdev->dev; | ||
180 | unsigned int send_size, recv_size; | ||
181 | int ret; | ||
182 | |||
183 | /* The offset of 1 is to accommodate the additional ACK WR. */ | ||
184 | send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); | ||
185 | recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); | ||
186 | rds_iw_ring_resize(send_ring, send_size - 1); | ||
187 | rds_iw_ring_resize(recv_ring, recv_size - 1); | ||
188 | |||
189 | memset(attr, 0, sizeof(*attr)); | ||
190 | attr->event_handler = rds_iw_qp_event_handler; | ||
191 | attr->qp_context = context; | ||
192 | attr->cap.max_send_wr = send_size; | ||
193 | attr->cap.max_recv_wr = recv_size; | ||
194 | attr->cap.max_send_sge = rds_iwdev->max_sge; | ||
195 | attr->cap.max_recv_sge = RDS_IW_RECV_SGE; | ||
196 | attr->sq_sig_type = IB_SIGNAL_REQ_WR; | ||
197 | attr->qp_type = IB_QPT_RC; | ||
198 | |||
199 | attr->send_cq = ib_create_cq(dev, send_cq_handler, | ||
200 | rds_iw_cq_event_handler, | ||
201 | context, send_size, 0); | ||
202 | if (IS_ERR(attr->send_cq)) { | ||
203 | ret = PTR_ERR(attr->send_cq); | ||
204 | attr->send_cq = NULL; | ||
205 | rdsdebug("ib_create_cq send failed: %d\n", ret); | ||
206 | goto out; | ||
207 | } | ||
208 | |||
209 | attr->recv_cq = ib_create_cq(dev, recv_cq_handler, | ||
210 | rds_iw_cq_event_handler, | ||
211 | context, recv_size, 0); | ||
212 | if (IS_ERR(attr->recv_cq)) { | ||
213 | ret = PTR_ERR(attr->recv_cq); | ||
214 | attr->recv_cq = NULL; | ||
215 | rdsdebug("ib_create_cq send failed: %d\n", ret); | ||
216 | goto out; | ||
217 | } | ||
218 | |||
219 | ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); | ||
220 | if (ret) { | ||
221 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | ||
222 | goto out; | ||
223 | } | ||
224 | |||
225 | ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); | ||
226 | if (ret) { | ||
227 | rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); | ||
228 | goto out; | ||
229 | } | ||
230 | |||
231 | out: | ||
232 | if (ret) { | ||
233 | if (attr->send_cq) | ||
234 | ib_destroy_cq(attr->send_cq); | ||
235 | if (attr->recv_cq) | ||
236 | ib_destroy_cq(attr->recv_cq); | ||
237 | } | ||
238 | return ret; | ||
239 | } | ||
240 | |||
241 | /* | ||
242 | * This needs to be very careful to not leave IS_ERR pointers around for | ||
243 | * cleanup to trip over. | ||
244 | */ | ||
245 | static int rds_iw_setup_qp(struct rds_connection *conn) | ||
246 | { | ||
247 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
248 | struct ib_device *dev = ic->i_cm_id->device; | ||
249 | struct ib_qp_init_attr attr; | ||
250 | struct rds_iw_device *rds_iwdev; | ||
251 | int ret; | ||
252 | |||
253 | /* rds_iw_add_one creates a rds_iw_device object per IB device, | ||
254 | * and allocates a protection domain, memory range and MR pool | ||
255 | * for each. If that fails for any reason, it will not register | ||
256 | * the rds_iwdev at all. | ||
257 | */ | ||
258 | rds_iwdev = ib_get_client_data(dev, &rds_iw_client); | ||
259 | if (rds_iwdev == NULL) { | ||
260 | if (printk_ratelimit()) | ||
261 | printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", | ||
262 | dev->name); | ||
263 | return -EOPNOTSUPP; | ||
264 | } | ||
265 | |||
266 | /* Protection domain and memory range */ | ||
267 | ic->i_pd = rds_iwdev->pd; | ||
268 | ic->i_mr = rds_iwdev->mr; | ||
269 | |||
270 | ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, | ||
271 | &ic->i_send_ring, rds_iw_send_cq_comp_handler, | ||
272 | &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, | ||
273 | conn); | ||
274 | if (ret < 0) | ||
275 | goto out; | ||
276 | |||
277 | ic->i_send_cq = attr.send_cq; | ||
278 | ic->i_recv_cq = attr.recv_cq; | ||
279 | |||
280 | /* | ||
281 | * XXX this can fail if max_*_wr is too large? Are we supposed | ||
282 | * to back off until we get a value that the hardware can support? | ||
283 | */ | ||
284 | ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); | ||
285 | if (ret) { | ||
286 | rdsdebug("rdma_create_qp failed: %d\n", ret); | ||
287 | goto out; | ||
288 | } | ||
289 | |||
290 | ic->i_send_hdrs = ib_dma_alloc_coherent(dev, | ||
291 | ic->i_send_ring.w_nr * | ||
292 | sizeof(struct rds_header), | ||
293 | &ic->i_send_hdrs_dma, GFP_KERNEL); | ||
294 | if (ic->i_send_hdrs == NULL) { | ||
295 | ret = -ENOMEM; | ||
296 | rdsdebug("ib_dma_alloc_coherent send failed\n"); | ||
297 | goto out; | ||
298 | } | ||
299 | |||
300 | ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, | ||
301 | ic->i_recv_ring.w_nr * | ||
302 | sizeof(struct rds_header), | ||
303 | &ic->i_recv_hdrs_dma, GFP_KERNEL); | ||
304 | if (ic->i_recv_hdrs == NULL) { | ||
305 | ret = -ENOMEM; | ||
306 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); | ||
307 | goto out; | ||
308 | } | ||
309 | |||
310 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), | ||
311 | &ic->i_ack_dma, GFP_KERNEL); | ||
312 | if (ic->i_ack == NULL) { | ||
313 | ret = -ENOMEM; | ||
314 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); | ||
315 | goto out; | ||
316 | } | ||
317 | |||
318 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); | ||
319 | if (ic->i_sends == NULL) { | ||
320 | ret = -ENOMEM; | ||
321 | rdsdebug("send allocation failed\n"); | ||
322 | goto out; | ||
323 | } | ||
324 | rds_iw_send_init_ring(ic); | ||
325 | |||
326 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); | ||
327 | if (ic->i_recvs == NULL) { | ||
328 | ret = -ENOMEM; | ||
329 | rdsdebug("recv allocation failed\n"); | ||
330 | goto out; | ||
331 | } | ||
332 | |||
333 | rds_iw_recv_init_ring(ic); | ||
334 | rds_iw_recv_init_ack(ic); | ||
335 | |||
336 | /* Post receive buffers - as a side effect, this will update | ||
337 | * the posted credit count. */ | ||
338 | rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); | ||
339 | |||
340 | rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, | ||
341 | ic->i_send_cq, ic->i_recv_cq); | ||
342 | |||
343 | out: | ||
344 | return ret; | ||
345 | } | ||
346 | |||
347 | static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) | ||
348 | { | ||
349 | u16 common; | ||
350 | u32 version = 0; | ||
351 | |||
352 | /* rdma_cm private data is odd - when there is any private data in the | ||
353 | * request, we will be given a pretty large buffer without being told the | ||
354 | * original size. The only way to tell the difference is by looking at | ||
355 | * the contents, which are initialized to zero. | ||
356 | * If the protocol version fields aren't set, this is a connection attempt | ||
357 | * from an older version. This could be 3.0 or 2.0 - we can't tell. | ||
358 | * We really should have changed this for OFED 1.3 :-( */ | ||
359 | if (dp->dp_protocol_major == 0) | ||
360 | return RDS_PROTOCOL_3_0; | ||
361 | |||
362 | common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; | ||
363 | if (dp->dp_protocol_major == 3 && common) { | ||
364 | version = RDS_PROTOCOL_3_0; | ||
365 | while ((common >>= 1) != 0) | ||
366 | version++; | ||
367 | } else if (printk_ratelimit()) { | ||
368 | printk(KERN_NOTICE "RDS: Connection from %pI4 using " | ||
369 | "incompatible protocol version %u.%u\n", | ||
370 | &dp->dp_saddr, | ||
371 | dp->dp_protocol_major, | ||
372 | dp->dp_protocol_minor); | ||
373 | } | ||
374 | return version; | ||
375 | } | ||
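As a worked example of the negotiation above (the peer's mask value is assumed for illustration; RDS_IW_SUPPORTED_PROTOCOLS is 0x3 as defined in iw.h):

/* Worked example of the version negotiation above:
 *	peer advertises dp_protocol_minor_mask = 0x0003	(minors 0 and 1)
 *	common  = 0x0003 & RDS_IW_SUPPORTED_PROTOCOLS = 0x3
 *	version = RDS_PROTOCOL_3_0			(start at minor 0)
 *	common >>= 1 -> 0x1, nonzero -> version++	(now minor 1)
 *	common >>= 1 -> 0x0 -> loop ends		-> negotiate RDS 3.1
 */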
376 | |||
377 | int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, | ||
378 | struct rdma_cm_event *event) | ||
379 | { | ||
380 | const struct rds_iw_connect_private *dp = event->param.conn.private_data; | ||
381 | struct rds_iw_connect_private dp_rep; | ||
382 | struct rds_connection *conn = NULL; | ||
383 | struct rds_iw_connection *ic = NULL; | ||
384 | struct rdma_conn_param conn_param; | ||
385 | struct rds_iw_device *rds_iwdev; | ||
386 | u32 version; | ||
387 | int err, destroy = 1; | ||
388 | |||
389 | /* Check whether the remote protocol version matches ours. */ | ||
390 | version = rds_iw_protocol_compatible(dp); | ||
391 | if (!version) | ||
392 | goto out; | ||
393 | |||
394 | rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", | ||
395 | &dp->dp_saddr, &dp->dp_daddr, | ||
396 | RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); | ||
397 | |||
398 | conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, | ||
399 | GFP_KERNEL); | ||
400 | if (IS_ERR(conn)) { | ||
401 | rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); | ||
402 | conn = NULL; | ||
403 | goto out; | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * The connection request may occur while the | ||
408 | * previous connection exists, e.g. in the case of failover. | ||
409 | * But as connections may be initiated simultaneously | ||
410 | * by both hosts, we have a random backoff mechanism - | ||
411 | * see the comment above rds_queue_reconnect() | ||
412 | */ | ||
413 | mutex_lock(&conn->c_cm_lock); | ||
414 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { | ||
415 | if (rds_conn_state(conn) == RDS_CONN_UP) { | ||
416 | rdsdebug("incoming connect while connecting\n"); | ||
417 | rds_conn_drop(conn); | ||
418 | rds_iw_stats_inc(s_iw_listen_closed_stale); | ||
419 | } else | ||
420 | if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { | ||
421 | /* Wait and see - our connect may still be succeeding */ | ||
422 | rds_iw_stats_inc(s_iw_connect_raced); | ||
423 | } | ||
424 | mutex_unlock(&conn->c_cm_lock); | ||
425 | goto out; | ||
426 | } | ||
427 | |||
428 | ic = conn->c_transport_data; | ||
429 | |||
430 | rds_iw_set_protocol(conn, version); | ||
431 | rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); | ||
432 | |||
433 | /* If the peer gave us the last packet it saw, process this as if | ||
434 | * we had received a regular ACK. */ | ||
435 | if (dp->dp_ack_seq) | ||
436 | rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); | ||
437 | |||
438 | BUG_ON(cm_id->context); | ||
439 | BUG_ON(ic->i_cm_id); | ||
440 | |||
441 | ic->i_cm_id = cm_id; | ||
442 | cm_id->context = conn; | ||
443 | |||
444 | rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); | ||
445 | ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; | ||
446 | |||
447 | /* We got halfway through setting up the ib_connection; if we | ||
448 | * fail now, we have to take the long route out of this mess. */ | ||
449 | destroy = 0; | ||
450 | |||
451 | err = rds_iw_setup_qp(conn); | ||
452 | if (err) { | ||
453 | rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); | ||
454 | goto out; | ||
455 | } | ||
456 | |||
457 | rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); | ||
458 | |||
459 | /* rdma_accept() calls rdma_reject() internally if it fails */ | ||
460 | err = rdma_accept(cm_id, &conn_param); | ||
461 | mutex_unlock(&conn->c_cm_lock); | ||
462 | if (err) { | ||
463 | rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); | ||
464 | goto out; | ||
465 | } | ||
466 | |||
467 | return 0; | ||
468 | |||
469 | out: | ||
470 | rdma_reject(cm_id, NULL, 0); | ||
471 | return destroy; | ||
472 | } | ||
473 | |||
474 | |||
475 | int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) | ||
476 | { | ||
477 | struct rds_connection *conn = cm_id->context; | ||
478 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
479 | struct rdma_conn_param conn_param; | ||
480 | struct rds_iw_connect_private dp; | ||
481 | int ret; | ||
482 | |||
483 | /* If the peer doesn't do protocol negotiation, we must | ||
484 | * default to RDSv3.0 */ | ||
485 | rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); | ||
486 | ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ | ||
487 | |||
488 | ret = rds_iw_setup_qp(conn); | ||
489 | if (ret) { | ||
490 | rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); | ||
491 | goto out; | ||
492 | } | ||
493 | |||
494 | rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); | ||
495 | |||
496 | ret = rdma_connect(cm_id, &conn_param); | ||
497 | if (ret) | ||
498 | rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); | ||
499 | |||
500 | out: | ||
501 | /* Beware - returning non-zero tells the rdma_cm to destroy | ||
502 | * the cm_id. We should certainly not do it as long as we still | ||
503 | * "own" the cm_id. */ | ||
504 | if (ret) { | ||
505 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
506 | |||
507 | if (ic->i_cm_id == cm_id) | ||
508 | ret = 0; | ||
509 | } | ||
510 | return ret; | ||
511 | } | ||
512 | |||
513 | int rds_iw_conn_connect(struct rds_connection *conn) | ||
514 | { | ||
515 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
516 | struct rds_iw_device *rds_iwdev; | ||
517 | struct sockaddr_in src, dest; | ||
518 | int ret; | ||
519 | |||
520 | /* XXX I wonder what effect the port space has */ | ||
521 | /* delegate cm event handler to rdma_transport */ | ||
522 | ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, | ||
523 | RDMA_PS_TCP); | ||
524 | if (IS_ERR(ic->i_cm_id)) { | ||
525 | ret = PTR_ERR(ic->i_cm_id); | ||
526 | ic->i_cm_id = NULL; | ||
527 | rdsdebug("rdma_create_id() failed: %d\n", ret); | ||
528 | goto out; | ||
529 | } | ||
530 | |||
531 | rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); | ||
532 | |||
533 | src.sin_family = AF_INET; | ||
534 | src.sin_addr.s_addr = (__force u32)conn->c_laddr; | ||
535 | src.sin_port = (__force u16)htons(0); | ||
536 | |||
537 | /* First, bind to the local address and device. */ | ||
538 | ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); | ||
539 | if (ret) { | ||
540 | rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", | ||
541 | &conn->c_laddr, ret); | ||
542 | rdma_destroy_id(ic->i_cm_id); | ||
543 | ic->i_cm_id = NULL; | ||
544 | goto out; | ||
545 | } | ||
546 | |||
547 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); | ||
548 | ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; | ||
549 | |||
550 | dest.sin_family = AF_INET; | ||
551 | dest.sin_addr.s_addr = (__force u32)conn->c_faddr; | ||
552 | dest.sin_port = (__force u16)htons(RDS_PORT); | ||
553 | |||
554 | ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, | ||
555 | (struct sockaddr *)&dest, | ||
556 | RDS_RDMA_RESOLVE_TIMEOUT_MS); | ||
557 | if (ret) { | ||
558 | rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, | ||
559 | ret); | ||
560 | rdma_destroy_id(ic->i_cm_id); | ||
561 | ic->i_cm_id = NULL; | ||
562 | } | ||
563 | |||
564 | out: | ||
565 | return ret; | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * This is careful to clean up only the resources that were actually built | ||
570 | * up, so that it can be called at any point during startup. In fact it | ||
571 | * can be called multiple times for a given connection. | ||
572 | */ | ||
573 | void rds_iw_conn_shutdown(struct rds_connection *conn) | ||
574 | { | ||
575 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
576 | int err = 0; | ||
577 | struct ib_qp_attr qp_attr; | ||
578 | |||
579 | rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, | ||
580 | ic->i_pd, ic->i_send_cq, ic->i_recv_cq, | ||
581 | ic->i_cm_id ? ic->i_cm_id->qp : NULL); | ||
582 | |||
583 | if (ic->i_cm_id) { | ||
584 | struct ib_device *dev = ic->i_cm_id->device; | ||
585 | |||
586 | rdsdebug("disconnecting cm %p\n", ic->i_cm_id); | ||
587 | err = rdma_disconnect(ic->i_cm_id); | ||
588 | if (err) { | ||
589 | /* Actually this may happen quite frequently, when | ||
590 | * an outgoing connect raced with an incoming connect. | ||
591 | */ | ||
592 | rdsdebug("rds_iw_conn_shutdown: failed to disconnect," | ||
593 | " cm: %p err %d\n", ic->i_cm_id, err); | ||
594 | } | ||
595 | |||
596 | if (ic->i_cm_id->qp) { | ||
597 | qp_attr.qp_state = IB_QPS_ERR; | ||
598 | ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); | ||
599 | } | ||
600 | |||
601 | wait_event(rds_iw_ring_empty_wait, | ||
602 | rds_iw_ring_empty(&ic->i_send_ring) && | ||
603 | rds_iw_ring_empty(&ic->i_recv_ring)); | ||
604 | |||
605 | if (ic->i_send_hdrs) | ||
606 | ib_dma_free_coherent(dev, | ||
607 | ic->i_send_ring.w_nr * | ||
608 | sizeof(struct rds_header), | ||
609 | ic->i_send_hdrs, | ||
610 | ic->i_send_hdrs_dma); | ||
611 | |||
612 | if (ic->i_recv_hdrs) | ||
613 | ib_dma_free_coherent(dev, | ||
614 | ic->i_recv_ring.w_nr * | ||
615 | sizeof(struct rds_header), | ||
616 | ic->i_recv_hdrs, | ||
617 | ic->i_recv_hdrs_dma); | ||
618 | |||
619 | if (ic->i_ack) | ||
620 | ib_dma_free_coherent(dev, sizeof(struct rds_header), | ||
621 | ic->i_ack, ic->i_ack_dma); | ||
622 | |||
623 | if (ic->i_sends) | ||
624 | rds_iw_send_clear_ring(ic); | ||
625 | if (ic->i_recvs) | ||
626 | rds_iw_recv_clear_ring(ic); | ||
627 | |||
628 | if (ic->i_cm_id->qp) | ||
629 | rdma_destroy_qp(ic->i_cm_id); | ||
630 | if (ic->i_send_cq) | ||
631 | ib_destroy_cq(ic->i_send_cq); | ||
632 | if (ic->i_recv_cq) | ||
633 | ib_destroy_cq(ic->i_recv_cq); | ||
634 | |||
635 | /* | ||
636 | * If associated with an rds_iw_device: | ||
637 | * Move connection back to the nodev list. | ||
638 | * Remove cm_id from the device cm_id list. | ||
639 | */ | ||
640 | if (ic->rds_iwdev) { | ||
641 | |||
642 | spin_lock_irq(&ic->rds_iwdev->spinlock); | ||
643 | BUG_ON(list_empty(&ic->iw_node)); | ||
644 | list_del(&ic->iw_node); | ||
645 | spin_unlock_irq(&ic->rds_iwdev->spinlock); | ||
646 | |||
647 | spin_lock_irq(&iw_nodev_conns_lock); | ||
648 | list_add_tail(&ic->iw_node, &iw_nodev_conns); | ||
649 | spin_unlock_irq(&iw_nodev_conns_lock); | ||
650 | rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); | ||
651 | ic->rds_iwdev = NULL; | ||
652 | } | ||
653 | |||
654 | rdma_destroy_id(ic->i_cm_id); | ||
655 | |||
656 | ic->i_cm_id = NULL; | ||
657 | ic->i_pd = NULL; | ||
658 | ic->i_mr = NULL; | ||
659 | ic->i_send_cq = NULL; | ||
660 | ic->i_recv_cq = NULL; | ||
661 | ic->i_send_hdrs = NULL; | ||
662 | ic->i_recv_hdrs = NULL; | ||
663 | ic->i_ack = NULL; | ||
664 | } | ||
665 | BUG_ON(ic->rds_iwdev); | ||
666 | |||
667 | /* Clear pending transmit */ | ||
668 | if (ic->i_rm) { | ||
669 | rds_message_put(ic->i_rm); | ||
670 | ic->i_rm = NULL; | ||
671 | } | ||
672 | |||
673 | /* Clear the ACK state */ | ||
674 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
675 | rds_iw_set_64bit(&ic->i_ack_next, 0); | ||
676 | ic->i_ack_recv = 0; | ||
677 | |||
678 | /* Clear flow control state */ | ||
679 | ic->i_flowctl = 0; | ||
680 | atomic_set(&ic->i_credits, 0); | ||
681 | |||
682 | rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); | ||
683 | rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); | ||
684 | |||
685 | if (ic->i_iwinc) { | ||
686 | rds_inc_put(&ic->i_iwinc->ii_inc); | ||
687 | ic->i_iwinc = NULL; | ||
688 | } | ||
689 | |||
690 | vfree(ic->i_sends); | ||
691 | ic->i_sends = NULL; | ||
692 | vfree(ic->i_recvs); | ||
693 | ic->i_recvs = NULL; | ||
694 | rdsdebug("shutdown complete\n"); | ||
695 | } | ||
696 | |||
697 | int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) | ||
698 | { | ||
699 | struct rds_iw_connection *ic; | ||
700 | unsigned long flags; | ||
701 | |||
702 | /* XXX too lazy? */ | ||
703 | ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); | ||
704 | if (ic == NULL) | ||
705 | return -ENOMEM; | ||
706 | |||
707 | INIT_LIST_HEAD(&ic->iw_node); | ||
708 | mutex_init(&ic->i_recv_mutex); | ||
709 | |||
710 | /* | ||
711 | * rds_iw_conn_shutdown() waits for these to be emptied so they | ||
712 | * must be initialized before it can be called. | ||
713 | */ | ||
714 | rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); | ||
715 | rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); | ||
716 | |||
717 | ic->conn = conn; | ||
718 | conn->c_transport_data = ic; | ||
719 | |||
720 | spin_lock_irqsave(&iw_nodev_conns_lock, flags); | ||
721 | list_add_tail(&ic->iw_node, &iw_nodev_conns); | ||
722 | spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); | ||
723 | |||
724 | |||
725 | rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); | ||
726 | return 0; | ||
727 | } | ||
728 | |||
729 | void rds_iw_conn_free(void *arg) | ||
730 | { | ||
731 | struct rds_iw_connection *ic = arg; | ||
732 | rdsdebug("ic %p\n", ic); | ||
733 | list_del(&ic->iw_node); | ||
734 | kfree(ic); | ||
735 | } | ||
736 | |||
737 | /* | ||
738 | * An error occurred on the connection | ||
739 | */ | ||
740 | void | ||
741 | __rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) | ||
742 | { | ||
743 | va_list ap; | ||
744 | |||
745 | rds_conn_drop(conn); | ||
746 | |||
747 | va_start(ap, fmt); | ||
748 | vprintk(fmt, ap); | ||
749 | va_end(ap); | ||
750 | } | ||
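The handlers above report errors through rds_iw_conn_error() while only the double-underscore helper is defined here; the wrapper is presumably a macro in iw.h that tacks on the severity and subsystem prefix, roughly along these lines (an assumption, mirroring the usual RDS convention):

/* Assumed shape of the wrapper macro (lives in iw.h, not in this file). */
#define rds_iw_conn_error(conn, fmt...) \
	__rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)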
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c new file mode 100644 index 000000000000..1c02a8f952d0 --- /dev/null +++ b/net/rds/iw_rdma.c | |||
@@ -0,0 +1,888 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | #include "rdma.h" | ||
37 | #include "iw.h" | ||
38 | |||
39 | |||
40 | /* | ||
41 | * This is stored as mr->r_trans_private. | ||
42 | */ | ||
43 | struct rds_iw_mr { | ||
44 | struct rds_iw_device *device; | ||
45 | struct rds_iw_mr_pool *pool; | ||
46 | struct rdma_cm_id *cm_id; | ||
47 | |||
48 | struct ib_mr *mr; | ||
49 | struct ib_fast_reg_page_list *page_list; | ||
50 | |||
51 | struct rds_iw_mapping mapping; | ||
52 | unsigned char remap_count; | ||
53 | }; | ||
54 | |||
55 | /* | ||
56 | * Our own little MR pool | ||
57 | */ | ||
58 | struct rds_iw_mr_pool { | ||
59 | struct rds_iw_device *device; /* back ptr to the device that owns us */ | ||
60 | |||
61 | struct mutex flush_lock; /* serialize fmr invalidate */ | ||
62 | struct work_struct flush_worker; /* flush worker */ | ||
63 | |||
64 | spinlock_t list_lock; /* protect variables below */ | ||
65 | atomic_t item_count; /* total # of MRs */ | ||
66 | atomic_t dirty_count; /* # dirty of MRs */ | ||
67 | struct list_head dirty_list; /* dirty mappings */ | ||
68 | struct list_head clean_list; /* unused & unmapped MRs */ | ||
69 | atomic_t free_pinned; /* memory pinned by free MRs */ | ||
70 | unsigned long max_message_size; /* in pages */ | ||
71 | unsigned long max_items; | ||
72 | unsigned long max_items_soft; | ||
73 | unsigned long max_free_pinned; | ||
74 | int max_pages; | ||
75 | }; | ||
76 | |||
77 | static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); | ||
78 | static void rds_iw_mr_pool_flush_worker(struct work_struct *work); | ||
79 | static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); | ||
80 | static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, | ||
81 | struct rds_iw_mr *ibmr, | ||
82 | struct scatterlist *sg, unsigned int nents); | ||
83 | static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); | ||
84 | static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, | ||
85 | struct list_head *unmap_list, | ||
86 | struct list_head *kill_list); | ||
87 | static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); | ||
88 | |||
89 | static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) | ||
90 | { | ||
91 | struct rds_iw_device *iwdev; | ||
92 | struct rds_iw_cm_id *i_cm_id; | ||
93 | |||
94 | *rds_iwdev = NULL; | ||
95 | *cm_id = NULL; | ||
96 | |||
97 | list_for_each_entry(iwdev, &rds_iw_devices, list) { | ||
98 | spin_lock_irq(&iwdev->spinlock); | ||
99 | list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) { | ||
100 | struct sockaddr_in *src_addr, *dst_addr; | ||
101 | |||
102 | src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr; | ||
103 | dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr; | ||
104 | |||
105 | rdsdebug("local ipaddr = %x port %d, " | ||
106 | "remote ipaddr = %x port %d" | ||
107 | "..looking for %x port %d, " | ||
108 | "remote ipaddr = %x port %d\n", | ||
109 | src_addr->sin_addr.s_addr, | ||
110 | src_addr->sin_port, | ||
111 | dst_addr->sin_addr.s_addr, | ||
112 | dst_addr->sin_port, | ||
113 | rs->rs_bound_addr, | ||
114 | rs->rs_bound_port, | ||
115 | rs->rs_conn_addr, | ||
116 | rs->rs_conn_port); | ||
117 | #ifdef WORKING_TUPLE_DETECTION | ||
118 | if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && | ||
119 | src_addr->sin_port == rs->rs_bound_port && | ||
120 | dst_addr->sin_addr.s_addr == rs->rs_conn_addr && | ||
121 | dst_addr->sin_port == rs->rs_conn_port) { | ||
122 | #else | ||
123 | /* FIXME - needs to compare the local and remote | ||
124 | * ipaddr/port tuple, but the ipaddr is the only | ||
125 | * available information in the rds_sock (the rest are | ||
126 | * zeroed); it doesn't appear to be properly populated | ||
127 | * during connection setup... | ||
128 | */ | ||
129 | if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { | ||
130 | #endif | ||
131 | spin_unlock_irq(&iwdev->spinlock); | ||
132 | *rds_iwdev = iwdev; | ||
133 | *cm_id = i_cm_id->cm_id; | ||
134 | return 0; | ||
135 | } | ||
136 | } | ||
137 | spin_unlock_irq(&iwdev->spinlock); | ||
138 | } | ||
139 | |||
140 | return 1; | ||
141 | } | ||
142 | |||
143 | static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) | ||
144 | { | ||
145 | struct rds_iw_cm_id *i_cm_id; | ||
146 | |||
147 | i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL); | ||
148 | if (!i_cm_id) | ||
149 | return -ENOMEM; | ||
150 | |||
151 | i_cm_id->cm_id = cm_id; | ||
152 | |||
153 | spin_lock_irq(&rds_iwdev->spinlock); | ||
154 | list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list); | ||
155 | spin_unlock_irq(&rds_iwdev->spinlock); | ||
156 | |||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) | ||
161 | { | ||
162 | struct rds_iw_cm_id *i_cm_id; | ||
163 | |||
164 | spin_lock_irq(&rds_iwdev->spinlock); | ||
165 | list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) { | ||
166 | if (i_cm_id->cm_id == cm_id) { | ||
167 | list_del(&i_cm_id->list); | ||
168 | kfree(i_cm_id); | ||
169 | break; | ||
170 | } | ||
171 | } | ||
172 | spin_unlock_irq(&rds_iwdev->spinlock); | ||
173 | } | ||
174 | |||
175 | |||
176 | int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) | ||
177 | { | ||
178 | struct sockaddr_in *src_addr, *dst_addr; | ||
179 | struct rds_iw_device *rds_iwdev_old; | ||
180 | struct rds_sock rs; | ||
181 | struct rdma_cm_id *pcm_id; | ||
182 | int rc; | ||
183 | |||
184 | src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; | ||
185 | dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; | ||
186 | |||
187 | rs.rs_bound_addr = src_addr->sin_addr.s_addr; | ||
188 | rs.rs_bound_port = src_addr->sin_port; | ||
189 | rs.rs_conn_addr = dst_addr->sin_addr.s_addr; | ||
190 | rs.rs_conn_port = dst_addr->sin_port; | ||
191 | |||
192 | rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); | ||
193 | if (rc) | ||
194 | rds_iw_remove_cm_id(rds_iwdev, cm_id); | ||
195 | |||
196 | return rds_iw_add_cm_id(rds_iwdev, cm_id); | ||
197 | } | ||
198 | |||
199 | int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) | ||
200 | { | ||
201 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
202 | |||
203 | /* conn was previously on the nodev_conns_list */ | ||
204 | spin_lock_irq(&iw_nodev_conns_lock); | ||
205 | BUG_ON(list_empty(&iw_nodev_conns)); | ||
206 | BUG_ON(list_empty(&ic->iw_node)); | ||
207 | list_del(&ic->iw_node); | ||
208 | spin_unlock_irq(&iw_nodev_conns_lock); | ||
209 | |||
210 | spin_lock_irq(&rds_iwdev->spinlock); | ||
211 | list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); | ||
212 | spin_unlock_irq(&rds_iwdev->spinlock); | ||
213 | |||
214 | ic->rds_iwdev = rds_iwdev; | ||
215 | |||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | void rds_iw_remove_nodev_conns(void) | ||
220 | { | ||
221 | struct rds_iw_connection *ic, *_ic; | ||
222 | LIST_HEAD(tmp_list); | ||
223 | |||
224 | /* avoid calling conn_destroy with irqs off */ | ||
225 | spin_lock_irq(&iw_nodev_conns_lock); | ||
226 | list_splice(&iw_nodev_conns, &tmp_list); | ||
227 | INIT_LIST_HEAD(&iw_nodev_conns); | ||
228 | spin_unlock_irq(&iw_nodev_conns_lock); | ||
229 | |||
230 | list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) { | ||
231 | if (ic->conn->c_passive) | ||
232 | rds_conn_destroy(ic->conn->c_passive); | ||
233 | rds_conn_destroy(ic->conn); | ||
234 | } | ||
235 | } | ||
236 | |||
237 | void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev) | ||
238 | { | ||
239 | struct rds_iw_connection *ic, *_ic; | ||
240 | LIST_HEAD(tmp_list); | ||
241 | |||
242 | /* avoid calling conn_destroy with irqs off */ | ||
243 | spin_lock_irq(&rds_iwdev->spinlock); | ||
244 | list_splice(&rds_iwdev->conn_list, &tmp_list); | ||
245 | INIT_LIST_HEAD(&rds_iwdev->conn_list); | ||
246 | spin_unlock_irq(&rds_iwdev->spinlock); | ||
247 | |||
248 | list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) { | ||
249 | if (ic->conn->c_passive) | ||
250 | rds_conn_destroy(ic->conn->c_passive); | ||
251 | rds_conn_destroy(ic->conn); | ||
252 | } | ||
253 | } | ||
254 | |||
255 | static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, | ||
256 | struct scatterlist *list, unsigned int sg_len) | ||
257 | { | ||
258 | sg->list = list; | ||
259 | sg->len = sg_len; | ||
260 | sg->dma_len = 0; | ||
261 | sg->dma_npages = 0; | ||
262 | sg->bytes = 0; | ||
263 | } | ||
264 | |||
265 | static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, | ||
266 | struct rds_iw_scatterlist *sg, | ||
267 | unsigned int dma_page_shift) | ||
268 | { | ||
269 | struct ib_device *dev = rds_iwdev->dev; | ||
270 | u64 *dma_pages = NULL; | ||
271 | u64 dma_mask; | ||
272 | unsigned int dma_page_size; | ||
273 | int i, j, ret; | ||
274 | |||
275 | dma_page_size = 1 << dma_page_shift; | ||
276 | dma_mask = dma_page_size - 1; | ||
277 | |||
278 | WARN_ON(sg->dma_len); | ||
279 | |||
280 | sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); | ||
281 | if (unlikely(!sg->dma_len)) { | ||
282 | printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n"); | ||
283 | return ERR_PTR(-EBUSY); | ||
284 | } | ||
285 | |||
286 | sg->bytes = 0; | ||
287 | sg->dma_npages = 0; | ||
288 | |||
289 | ret = -EINVAL; | ||
290 | for (i = 0; i < sg->dma_len; ++i) { | ||
291 | unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); | ||
292 | u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); | ||
293 | u64 end_addr; | ||
294 | |||
295 | sg->bytes += dma_len; | ||
296 | |||
297 | end_addr = dma_addr + dma_len; | ||
298 | if (dma_addr & dma_mask) { | ||
299 | if (i > 0) | ||
300 | goto out_unmap; | ||
301 | dma_addr &= ~dma_mask; | ||
302 | } | ||
303 | if (end_addr & dma_mask) { | ||
304 | if (i < sg->dma_len - 1) | ||
305 | goto out_unmap; | ||
306 | end_addr = (end_addr + dma_mask) & ~dma_mask; | ||
307 | } | ||
308 | |||
309 | sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift; | ||
310 | } | ||
311 | |||
312 | /* Now gather the dma addrs into one list */ | ||
313 | if (sg->dma_npages > fastreg_message_size) | ||
314 | goto out_unmap; | ||
315 | |||
316 | dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC); | ||
317 | if (!dma_pages) { | ||
318 | ret = -ENOMEM; | ||
319 | goto out_unmap; | ||
320 | } | ||
321 | |||
322 | for (i = j = 0; i < sg->dma_len; ++i) { | ||
323 | unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); | ||
324 | u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); | ||
325 | u64 end_addr; | ||
326 | |||
327 | end_addr = dma_addr + dma_len; | ||
328 | dma_addr &= ~dma_mask; | ||
329 | for (; dma_addr < end_addr; dma_addr += dma_page_size) | ||
330 | dma_pages[j++] = dma_addr; | ||
331 | BUG_ON(j > sg->dma_npages); | ||
332 | } | ||
333 | |||
334 | return dma_pages; | ||
335 | |||
336 | out_unmap: | ||
337 | ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); | ||
338 | sg->dma_len = 0; | ||
339 | kfree(dma_pages); | ||
340 | return ERR_PTR(ret); | ||
341 | } | ||
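The alignment rules above are easiest to see with concrete numbers. The stand-alone sketch below (an illustration, not part of this file) reproduces the page-counting logic on plain address/length pairs: with 4 KB pages, only the first segment may start mid-page and only the last may end mid-page, and a two-segment list laid out that way maps to three DMA pages.

/*
 * Worked illustration: count DMA pages the way rds_iw_map_scatterlist()
 * does, but on plain (addr, len) pairs so it can be compiled and run
 * stand-alone.  Interior segments must be exactly page-aligned.
 */
#include <stdio.h>
#include <stdint.h>

struct seg { uint64_t addr; uint64_t len; };

static int count_dma_pages(const struct seg *sg, int n, unsigned shift)
{
	uint64_t mask = (1ULL << shift) - 1;
	uint64_t npages = 0;
	int i;

	for (i = 0; i < n; i++) {
		uint64_t start = sg[i].addr;
		uint64_t end = start + sg[i].len;

		if (start & mask) {
			if (i > 0)
				return -1;	/* interior segment starts unaligned */
			start &= ~mask;
		}
		if (end & mask) {
			if (i < n - 1)
				return -1;	/* interior segment ends unaligned */
			end = (end + mask) & ~mask;
		}
		npages += (end - start) >> shift;
	}
	return (int)npages;
}

int main(void)
{
	/* 4 KB pages: the first segment starts mid-page, the last ends mid-page. */
	struct seg sg[] = {
		{ 0x10200, 0x0e00 },	/* starts mid-page, rounds down to 0x10000: 1 page  */
		{ 0x11000, 0x1400 },	/* ends at 0x12400, rounds up to 0x13000: 2 pages   */
	};

	printf("dma_npages = %d\n", count_dma_pages(sg, 2, 12));	/* prints 3 */
	return 0;
}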
342 | |||
343 | |||
344 | struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev) | ||
345 | { | ||
346 | struct rds_iw_mr_pool *pool; | ||
347 | |||
348 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); | ||
349 | if (!pool) { | ||
350 | printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n"); | ||
351 | return ERR_PTR(-ENOMEM); | ||
352 | } | ||
353 | |||
354 | pool->device = rds_iwdev; | ||
355 | INIT_LIST_HEAD(&pool->dirty_list); | ||
356 | INIT_LIST_HEAD(&pool->clean_list); | ||
357 | mutex_init(&pool->flush_lock); | ||
358 | spin_lock_init(&pool->list_lock); | ||
359 | INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker); | ||
360 | |||
361 | pool->max_message_size = fastreg_message_size; | ||
362 | pool->max_items = fastreg_pool_size; | ||
363 | pool->max_free_pinned = pool->max_items * pool->max_message_size / 4; | ||
364 | pool->max_pages = fastreg_message_size; | ||
365 | |||
366 | /* We never allow more than max_items MRs to be allocated. | ||
367 | * Once we exceed max_items_soft, we start freeing | ||
368 | * items more aggressively. | ||
369 | * Make sure that max_items > max_items_soft > max_items / 2 | ||
370 | */ | ||
371 | pool->max_items_soft = pool->max_items * 3 / 4; | ||
372 | |||
373 | return pool; | ||
374 | } | ||
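To make the sizing policy concrete, here is the arithmetic for a hypothetical pool; the numbers are assumptions for illustration only, not the driver's actual defaults from iw.h.

/* Hypothetical pool limits, for illustration only. */
unsigned long max_items       = 2048;             /* hard cap on allocated MRs        */
unsigned long max_items_soft  = 2048 * 3 / 4;     /* 1536: reclaim more aggressively  */
unsigned long max_free_pinned = 2048 * 256 / 4;   /* 131072 pages, with 256-page MRs  */
/* 2048 > 1536 > 1024, so max_items > max_items_soft > max_items / 2 holds. */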
375 | |||
376 | void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo) | ||
377 | { | ||
378 | struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; | ||
379 | |||
380 | iinfo->rdma_mr_max = pool->max_items; | ||
381 | iinfo->rdma_mr_size = pool->max_pages; | ||
382 | } | ||
383 | |||
384 | void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool) | ||
385 | { | ||
386 | flush_workqueue(rds_wq); | ||
387 | rds_iw_flush_mr_pool(pool, 1); | ||
388 | BUG_ON(atomic_read(&pool->item_count)); | ||
389 | BUG_ON(atomic_read(&pool->free_pinned)); | ||
390 | kfree(pool); | ||
391 | } | ||
392 | |||
393 | static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool) | ||
394 | { | ||
395 | struct rds_iw_mr *ibmr = NULL; | ||
396 | unsigned long flags; | ||
397 | |||
398 | spin_lock_irqsave(&pool->list_lock, flags); | ||
399 | if (!list_empty(&pool->clean_list)) { | ||
400 | ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list); | ||
401 | list_del_init(&ibmr->mapping.m_list); | ||
402 | } | ||
403 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
404 | |||
405 | return ibmr; | ||
406 | } | ||
407 | |||
408 | static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) | ||
409 | { | ||
410 | struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; | ||
411 | struct rds_iw_mr *ibmr = NULL; | ||
412 | int err = 0, iter = 0; | ||
413 | |||
414 | while (1) { | ||
415 | ibmr = rds_iw_reuse_fmr(pool); | ||
416 | if (ibmr) | ||
417 | return ibmr; | ||
418 | |||
419 | /* No clean MRs - now we have the choice of either | ||
420 | * allocating a fresh MR up to the limit imposed by the | ||
421 | * driver, or flushing any dirty unused MRs. | ||
422 | * We try to avoid stalling in the send path if possible, | ||
423 | * so we allocate as long as we're allowed to. | ||
424 | * | ||
425 | * We're fussy with enforcing the FMR limit, though. If the driver | ||
426 | * tells us we can't use more than N fmrs, we shouldn't start | ||
427 | * arguing with it */ | ||
428 | if (atomic_inc_return(&pool->item_count) <= pool->max_items) | ||
429 | break; | ||
430 | |||
431 | atomic_dec(&pool->item_count); | ||
432 | |||
433 | if (++iter > 2) { | ||
434 | rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted); | ||
435 | return ERR_PTR(-EAGAIN); | ||
436 | } | ||
437 | |||
438 | /* We do have some empty MRs. Flush them out. */ | ||
439 | rds_iw_stats_inc(s_iw_rdma_mr_pool_wait); | ||
440 | rds_iw_flush_mr_pool(pool, 0); | ||
441 | } | ||
442 | |||
443 | ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); | ||
444 | if (!ibmr) { | ||
445 | err = -ENOMEM; | ||
446 | goto out_no_cigar; | ||
447 | } | ||
448 | |||
449 | spin_lock_init(&ibmr->mapping.m_lock); | ||
450 | INIT_LIST_HEAD(&ibmr->mapping.m_list); | ||
451 | ibmr->mapping.m_mr = ibmr; | ||
452 | |||
453 | err = rds_iw_init_fastreg(pool, ibmr); | ||
454 | if (err) | ||
455 | goto out_no_cigar; | ||
456 | |||
457 | rds_iw_stats_inc(s_iw_rdma_mr_alloc); | ||
458 | return ibmr; | ||
459 | |||
460 | out_no_cigar: | ||
461 | if (ibmr) { | ||
462 | rds_iw_destroy_fastreg(pool, ibmr); | ||
463 | kfree(ibmr); | ||
464 | } | ||
465 | atomic_dec(&pool->item_count); | ||
466 | return ERR_PTR(err); | ||
467 | } | ||
468 | |||
469 | void rds_iw_sync_mr(void *trans_private, int direction) | ||
470 | { | ||
471 | struct rds_iw_mr *ibmr = trans_private; | ||
472 | struct rds_iw_device *rds_iwdev = ibmr->device; | ||
473 | |||
474 | switch (direction) { | ||
475 | case DMA_FROM_DEVICE: | ||
476 | ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list, | ||
477 | ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); | ||
478 | break; | ||
479 | case DMA_TO_DEVICE: | ||
480 | ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list, | ||
481 | ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); | ||
482 | break; | ||
483 | } | ||
484 | } | ||
485 | |||
486 | static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all) | ||
487 | { | ||
488 | unsigned int item_count; | ||
489 | |||
490 | item_count = atomic_read(&pool->item_count); | ||
491 | if (free_all) | ||
492 | return item_count; | ||
493 | |||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | /* | ||
498 | * Flush our pool of MRs. | ||
499 | * At a minimum, all currently unused MRs are unmapped. | ||
500 | * If the number of MRs allocated exceeds the limit, we also try | ||
501 | * to free as many MRs as needed to get back to this limit. | ||
502 | */ | ||
503 | static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) | ||
504 | { | ||
505 | struct rds_iw_mr *ibmr, *next; | ||
506 | LIST_HEAD(unmap_list); | ||
507 | LIST_HEAD(kill_list); | ||
508 | unsigned long flags; | ||
509 | unsigned int nfreed = 0, ncleaned = 0, free_goal; | ||
510 | int ret = 0; | ||
511 | |||
512 | rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); | ||
513 | |||
514 | mutex_lock(&pool->flush_lock); | ||
515 | |||
516 | spin_lock_irqsave(&pool->list_lock, flags); | ||
517 | /* Get the list of all mappings to be destroyed */ | ||
518 | list_splice_init(&pool->dirty_list, &unmap_list); | ||
519 | if (free_all) | ||
520 | list_splice_init(&pool->clean_list, &kill_list); | ||
521 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
522 | |||
523 | free_goal = rds_iw_flush_goal(pool, free_all); | ||
524 | |||
525 | /* Batched invalidate of dirty MRs. | ||
526 | * For FMR based MRs, the mappings on the unmap list are | ||
527 | * actually members of an ibmr (ibmr->mapping). They either | ||
528 | * migrate to the kill_list, or have been cleaned and should be | ||
529 | * moved to the clean_list. | ||
530 | * For fastregs, they will be dynamically allocated, and | ||
531 | * will be destroyed by the unmap function. | ||
532 | */ | ||
533 | if (!list_empty(&unmap_list)) { | ||
534 | ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list); | ||
535 | /* If we've been asked to destroy all MRs, move those | ||
536 | * that were simply cleaned to the kill list */ | ||
537 | if (free_all) | ||
538 | list_splice_init(&unmap_list, &kill_list); | ||
539 | } | ||
540 | |||
541 | /* Destroy any MRs that are past their best before date */ | ||
542 | list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) { | ||
543 | rds_iw_stats_inc(s_iw_rdma_mr_free); | ||
544 | list_del(&ibmr->mapping.m_list); | ||
545 | rds_iw_destroy_fastreg(pool, ibmr); | ||
546 | kfree(ibmr); | ||
547 | nfreed++; | ||
548 | } | ||
549 | |||
550 | /* Any mappings that remain are laundered ibmrs, which we can add | ||
551 | * back to the clean list. */ | ||
552 | if (!list_empty(&unmap_list)) { | ||
553 | spin_lock_irqsave(&pool->list_lock, flags); | ||
554 | list_splice(&unmap_list, &pool->clean_list); | ||
555 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
556 | } | ||
557 | |||
558 | atomic_sub(ncleaned, &pool->dirty_count); | ||
559 | atomic_sub(nfreed, &pool->item_count); | ||
560 | |||
561 | mutex_unlock(&pool->flush_lock); | ||
562 | return ret; | ||
563 | } | ||
564 | |||
565 | static void rds_iw_mr_pool_flush_worker(struct work_struct *work) | ||
566 | { | ||
567 | struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker); | ||
568 | |||
569 | rds_iw_flush_mr_pool(pool, 0); | ||
570 | } | ||
571 | |||
572 | void rds_iw_free_mr(void *trans_private, int invalidate) | ||
573 | { | ||
574 | struct rds_iw_mr *ibmr = trans_private; | ||
575 | struct rds_iw_mr_pool *pool = ibmr->device->mr_pool; | ||
576 | |||
577 | rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len); | ||
578 | if (!pool) | ||
579 | return; | ||
580 | |||
581 | /* Return it to the pool's free list */ | ||
582 | rds_iw_free_fastreg(pool, ibmr); | ||
583 | |||
584 | /* If we've pinned too many pages, request a flush */ | ||
585 | if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned | ||
586 | || atomic_read(&pool->dirty_count) >= pool->max_items / 10) | ||
587 | queue_work(rds_wq, &pool->flush_worker); | ||
588 | |||
589 | if (invalidate) { | ||
590 | if (likely(!in_interrupt())) { | ||
591 | rds_iw_flush_mr_pool(pool, 0); | ||
592 | } else { | ||
593 | /* We get here if the user created an MR marked | ||
594 | * as use_once and invalidate at the same time. */ | ||
595 | queue_work(rds_wq, &pool->flush_worker); | ||
596 | } | ||
597 | } | ||
598 | } | ||
599 | |||
600 | void rds_iw_flush_mrs(void) | ||
601 | { | ||
602 | struct rds_iw_device *rds_iwdev; | ||
603 | |||
604 | list_for_each_entry(rds_iwdev, &rds_iw_devices, list) { | ||
605 | struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; | ||
606 | |||
607 | if (pool) | ||
608 | rds_iw_flush_mr_pool(pool, 0); | ||
609 | } | ||
610 | } | ||
611 | |||
612 | void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, | ||
613 | struct rds_sock *rs, u32 *key_ret) | ||
614 | { | ||
615 | struct rds_iw_device *rds_iwdev; | ||
616 | struct rds_iw_mr *ibmr = NULL; | ||
617 | struct rdma_cm_id *cm_id; | ||
618 | int ret; | ||
619 | |||
620 | ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); | ||
621 | if (ret || !cm_id) { | ||
622 | ret = -ENODEV; | ||
623 | goto out; | ||
624 | } | ||
625 | |||
626 | if (!rds_iwdev->mr_pool) { | ||
627 | ret = -ENODEV; | ||
628 | goto out; | ||
629 | } | ||
630 | |||
631 | ibmr = rds_iw_alloc_mr(rds_iwdev); | ||
632 | if (IS_ERR(ibmr)) | ||
633 | return ibmr; | ||
634 | |||
635 | ibmr->cm_id = cm_id; | ||
636 | ibmr->device = rds_iwdev; | ||
637 | |||
638 | ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents); | ||
639 | if (ret == 0) | ||
640 | *key_ret = ibmr->mr->rkey; | ||
641 | else | ||
642 | printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret); | ||
643 | |||
644 | out: | ||
645 | if (ret) { | ||
646 | if (ibmr) | ||
647 | rds_iw_free_mr(ibmr, 0); | ||
648 | ibmr = ERR_PTR(ret); | ||
649 | } | ||
650 | return ibmr; | ||
651 | } | ||
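rds_iw_get_mr() is the transport half of the GET_MR path: the return value is an opaque transport-private cookie and *key_ret is the R_Key handed back to the application. A sketch of the expected caller contract follows; the helper name and error handling are hypothetical, not taken from the generic RDMA code.

/* Hypothetical caller sketch -- illustrates the contract, not the real rdma.c code. */
static int example_register_mr(struct rds_sock *rs, struct scatterlist *sg,
			       unsigned long nents, u32 *r_key)
{
	void *trans_private;

	trans_private = rds_iw_get_mr(sg, nents, rs, r_key);
	if (IS_ERR(trans_private))
		return PTR_ERR(trans_private);	/* e.g. -ENODEV before the conn is up */

	/* Success: *r_key is the rkey to hand to userspace; keep trans_private
	 * around and pass it back to rds_iw_free_mr() / rds_iw_sync_mr() later. */
	return 0;
}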
652 | |||
653 | /* | ||
654 | * iWARP fastreg handling | ||
655 | * | ||
656 | * The life cycle of a fastreg registration is a bit different from | ||
657 | * FMRs. | ||
658 | * The idea behind fastreg is to have one MR, to which we bind different | ||
659 | * mappings over time. To avoid stalling on the expensive map and invalidate | ||
660 | * operations, these operations are pipelined on the same send queue on | ||
661 | * which we want to send the message containing the r_key. | ||
662 | * | ||
663 | * This creates a bit of a problem for us, as we do not have the destination | ||
664 | * IP in GET_MR, so the connection must be set up prior to the GET_MR call for | ||
665 | * RDMA to be correctly set up. If a fastreg request is present, rds_iw_xmit | ||
666 | * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request | ||
667 | * before queuing the SEND. When completions for these arrive, a bit is set on | ||
668 | * the MR showing that RDMA can now be performed. | ||
669 | * | ||
670 | * There is another interesting aspect that's related to invalidation. | ||
671 | * The application can request that a mapping is invalidated in FREE_MR. | ||
672 | * The expectation there is that this invalidation step includes ALL | ||
673 | * PREVIOUSLY FREED MRs. | ||
674 | */ | ||
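The pipelining described above boils down to the order in which work requests are chained on the send queue. The sketch below is illustrative only; it assumes the three WRs have already been populated (in reality they are built elsewhere in the transport) and simply shows the LOCAL_INV, then FAST_REG_MR, then SEND ordering.

/*
 * Illustrative sketch of the pipelining described above.  The WRs are
 * assumed to be pre-built; posting them as one chain keeps them ordered
 * on the same send queue as the message that carries the r_key.
 */
static int example_post_fastreg_pipeline(struct ib_qp *qp,
					 struct ib_send_wr *inv_wr,     /* IB_WR_LOCAL_INV   */
					 struct ib_send_wr *fastreg_wr, /* IB_WR_FAST_REG_MR */
					 struct ib_send_wr *send_wr)    /* IB_WR_SEND        */
{
	struct ib_send_wr *head, *failed_wr;

	if (inv_wr) {
		/* Invalidate the previous mapping first, if the MR was used before. */
		inv_wr->next = fastreg_wr;
		head = inv_wr;
	} else {
		head = fastreg_wr;
	}
	fastreg_wr->next = send_wr;
	send_wr->next = NULL;

	/* A single ib_post_send() keeps the three WRs ordered on the queue. */
	return ib_post_send(qp, head, &failed_wr);
}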
675 | static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, | ||
676 | struct rds_iw_mr *ibmr) | ||
677 | { | ||
678 | struct rds_iw_device *rds_iwdev = pool->device; | ||
679 | struct ib_fast_reg_page_list *page_list = NULL; | ||
680 | struct ib_mr *mr; | ||
681 | int err; | ||
682 | |||
683 | mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size); | ||
684 | if (IS_ERR(mr)) { | ||
685 | err = PTR_ERR(mr); | ||
686 | |||
687 | printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err); | ||
688 | return err; | ||
689 | } | ||
690 | |||
691 | /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages | ||
692 | * are not filled in at this point. | ||
693 | */ | ||
694 | page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size); | ||
695 | if (IS_ERR(page_list)) { | ||
696 | err = PTR_ERR(page_list); | ||
697 | |||
698 | printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err); | ||
699 | ib_dereg_mr(mr); | ||
700 | return err; | ||
701 | } | ||
702 | |||
703 | ibmr->page_list = page_list; | ||
704 | ibmr->mr = mr; | ||
705 | return 0; | ||
706 | } | ||
707 | |||
708 | static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) | ||
709 | { | ||
710 | struct rds_iw_mr *ibmr = mapping->m_mr; | ||
711 | struct ib_send_wr f_wr, *failed_wr; | ||
712 | int ret; | ||
713 | |||
714 | /* | ||
715 | * Perform a WR for the fast_reg_mr. Each individual page | ||
716 | * in the sg list is added to the fast reg page list and placed | ||
717 | * inside the fast_reg_mr WR. The key used is a rolling 8-bit | ||
718 | * counter, which should guarantee uniqueness. | ||
719 | */ | ||
720 | ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); | ||
721 | mapping->m_rkey = ibmr->mr->rkey; | ||
722 | |||
723 | memset(&f_wr, 0, sizeof(f_wr)); | ||
724 | f_wr.wr_id = RDS_IW_FAST_REG_WR_ID; | ||
725 | f_wr.opcode = IB_WR_FAST_REG_MR; | ||
726 | f_wr.wr.fast_reg.length = mapping->m_sg.bytes; | ||
727 | f_wr.wr.fast_reg.rkey = mapping->m_rkey; | ||
728 | f_wr.wr.fast_reg.page_list = ibmr->page_list; | ||
729 | f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len; | ||
730 | f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift; | ||
731 | f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE | | ||
732 | IB_ACCESS_REMOTE_READ | | ||
733 | IB_ACCESS_REMOTE_WRITE; | ||
734 | f_wr.wr.fast_reg.iova_start = 0; | ||
735 | f_wr.send_flags = IB_SEND_SIGNALED; | ||
736 | |||
737 | failed_wr = &f_wr; | ||
738 | ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr); | ||
739 | BUG_ON(failed_wr != &f_wr); | ||
740 | if (ret && printk_ratelimit()) | ||
741 | printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", | ||
742 | __func__, __LINE__, ret); | ||
743 | return ret; | ||
744 | } | ||
745 | |||
746 | static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) | ||
747 | { | ||
748 | struct ib_send_wr s_wr, *failed_wr; | ||
749 | int ret = 0; | ||
750 | |||
751 | if (!ibmr->cm_id->qp || !ibmr->mr) | ||
752 | goto out; | ||
753 | |||
754 | memset(&s_wr, 0, sizeof(s_wr)); | ||
755 | s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID; | ||
756 | s_wr.opcode = IB_WR_LOCAL_INV; | ||
757 | s_wr.ex.invalidate_rkey = ibmr->mr->rkey; | ||
758 | s_wr.send_flags = IB_SEND_SIGNALED; | ||
759 | |||
760 | failed_wr = &s_wr; | ||
761 | ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); | ||
762 | if (ret && printk_ratelimit()) { | ||
763 | printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", | ||
764 | __func__, __LINE__, ret); | ||
765 | goto out; | ||
766 | } | ||
767 | out: | ||
768 | return ret; | ||
769 | } | ||
770 | |||
771 | static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, | ||
772 | struct rds_iw_mr *ibmr, | ||
773 | struct scatterlist *sg, | ||
774 | unsigned int sg_len) | ||
775 | { | ||
776 | struct rds_iw_device *rds_iwdev = pool->device; | ||
777 | struct rds_iw_mapping *mapping = &ibmr->mapping; | ||
778 | u64 *dma_pages; | ||
779 | int i, ret = 0; | ||
780 | |||
781 | rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); | ||
782 | |||
783 | dma_pages = rds_iw_map_scatterlist(rds_iwdev, | ||
784 | &mapping->m_sg, | ||
785 | rds_iwdev->page_shift); | ||
786 | if (IS_ERR(dma_pages)) { | ||
787 | ret = PTR_ERR(dma_pages); | ||
788 | dma_pages = NULL; | ||
789 | goto out; | ||
790 | } | ||
791 | |||
792 | if (mapping->m_sg.dma_len > pool->max_message_size) { | ||
793 | ret = -EMSGSIZE; | ||
794 | goto out; | ||
795 | } | ||
796 | |||
797 | for (i = 0; i < mapping->m_sg.dma_npages; ++i) | ||
798 | ibmr->page_list->page_list[i] = dma_pages[i]; | ||
799 | |||
800 | ret = rds_iw_rdma_build_fastreg(mapping); | ||
801 | if (ret) | ||
802 | goto out; | ||
803 | |||
804 | rds_iw_stats_inc(s_iw_rdma_mr_used); | ||
805 | |||
806 | out: | ||
807 | kfree(dma_pages); | ||
808 | |||
809 | return ret; | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * "Free" a fastreg MR. | ||
814 | */ | ||
815 | static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, | ||
816 | struct rds_iw_mr *ibmr) | ||
817 | { | ||
818 | unsigned long flags; | ||
819 | int ret; | ||
820 | |||
821 | if (!ibmr->mapping.m_sg.dma_len) | ||
822 | return; | ||
823 | |||
824 | ret = rds_iw_rdma_fastreg_inv(ibmr); | ||
825 | if (ret) | ||
826 | return; | ||
827 | |||
828 | /* Try to post the LOCAL_INV WR to the queue. */ | ||
829 | spin_lock_irqsave(&pool->list_lock, flags); | ||
830 | |||
831 | list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list); | ||
832 | atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned); | ||
833 | atomic_inc(&pool->dirty_count); | ||
834 | |||
835 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
836 | } | ||
837 | |||
838 | static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, | ||
839 | struct list_head *unmap_list, | ||
840 | struct list_head *kill_list) | ||
841 | { | ||
842 | struct rds_iw_mapping *mapping, *next; | ||
843 | unsigned int ncleaned = 0; | ||
844 | LIST_HEAD(laundered); | ||
845 | |||
846 | /* Batched invalidation of fastreg MRs. | ||
847 | * Why do we do it this way, even though we could pipeline unmap | ||
848 | * and remap? The reason is the application semantics - when the | ||
849 | * application requests an invalidation of MRs, it expects all | ||
850 | * previously released R_Keys to become invalid. | ||
851 | * | ||
852 | * If we implement MR reuse naively, we risk memory corruption | ||
853 | * (this has actually been observed). So the default behavior | ||
854 | * requires that an MR goes through an explicit unmap operation before | ||
855 | * we can reuse it again. | ||
856 | * | ||
857 | * We could probably improve on this a little, by allowing immediate | ||
858 | * reuse of an MR on the same socket (e.g. you could add a small | ||
859 | * cache of unused MRs to struct rds_sock - GET_MR could grab one | ||
860 | * of these without requiring an explicit invalidate). | ||
861 | */ | ||
862 | while (!list_empty(unmap_list)) { | ||
863 | unsigned long flags; | ||
864 | |||
865 | spin_lock_irqsave(&pool->list_lock, flags); | ||
866 | list_for_each_entry_safe(mapping, next, unmap_list, m_list) { | ||
867 | list_move(&mapping->m_list, &laundered); | ||
868 | ncleaned++; | ||
869 | } | ||
870 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
871 | } | ||
872 | |||
873 | /* Move all laundered mappings back to the unmap list. | ||
874 | * We do not kill any WRs right now - it doesn't seem the | ||
875 | * fastreg API has a max_remap limit. */ | ||
876 | list_splice_init(&laundered, unmap_list); | ||
877 | |||
878 | return ncleaned; | ||
879 | } | ||
880 | |||
881 | static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, | ||
882 | struct rds_iw_mr *ibmr) | ||
883 | { | ||
884 | if (ibmr->page_list) | ||
885 | ib_free_fast_reg_page_list(ibmr->page_list); | ||
886 | if (ibmr->mr) | ||
887 | ib_dereg_mr(ibmr->mr); | ||
888 | } | ||
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c new file mode 100644 index 000000000000..a1931f0027a2 --- /dev/null +++ b/net/rds/iw_recv.c | |||
@@ -0,0 +1,869 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/pci.h> | ||
35 | #include <linux/dma-mapping.h> | ||
36 | #include <rdma/rdma_cm.h> | ||
37 | |||
38 | #include "rds.h" | ||
39 | #include "iw.h" | ||
40 | |||
41 | static struct kmem_cache *rds_iw_incoming_slab; | ||
42 | static struct kmem_cache *rds_iw_frag_slab; | ||
43 | static atomic_t rds_iw_allocation = ATOMIC_INIT(0); | ||
44 | |||
45 | static void rds_iw_frag_drop_page(struct rds_page_frag *frag) | ||
46 | { | ||
47 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
48 | __free_page(frag->f_page); | ||
49 | frag->f_page = NULL; | ||
50 | } | ||
51 | |||
52 | static void rds_iw_frag_free(struct rds_page_frag *frag) | ||
53 | { | ||
54 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
55 | BUG_ON(frag->f_page != NULL); | ||
56 | kmem_cache_free(rds_iw_frag_slab, frag); | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * We map a page at a time. Its fragments are posted in order. This | ||
61 | * is called in fragment order as the fragments get completion events. | ||
62 | * Only the last frag in the page performs the unmapping. | ||
63 | * | ||
64 | * It's OK for ring cleanup to call this in whatever order it likes because | ||
65 | * DMA is not in flight and so we can unmap while other ring entries still | ||
66 | * hold page references in their frags. | ||
67 | */ | ||
68 | static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, | ||
69 | struct rds_iw_recv_work *recv) | ||
70 | { | ||
71 | struct rds_page_frag *frag = recv->r_frag; | ||
72 | |||
73 | rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); | ||
74 | if (frag->f_mapped) | ||
75 | ib_dma_unmap_page(ic->i_cm_id->device, | ||
76 | frag->f_mapped, | ||
77 | RDS_FRAG_SIZE, DMA_FROM_DEVICE); | ||
78 | frag->f_mapped = 0; | ||
79 | } | ||
80 | |||
81 | void rds_iw_recv_init_ring(struct rds_iw_connection *ic) | ||
82 | { | ||
83 | struct rds_iw_recv_work *recv; | ||
84 | u32 i; | ||
85 | |||
86 | for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { | ||
87 | struct ib_sge *sge; | ||
88 | |||
89 | recv->r_iwinc = NULL; | ||
90 | recv->r_frag = NULL; | ||
91 | |||
92 | recv->r_wr.next = NULL; | ||
93 | recv->r_wr.wr_id = i; | ||
94 | recv->r_wr.sg_list = recv->r_sge; | ||
95 | recv->r_wr.num_sge = RDS_IW_RECV_SGE; | ||
96 | |||
97 | sge = rds_iw_data_sge(ic, recv->r_sge); | ||
98 | sge->addr = 0; | ||
99 | sge->length = RDS_FRAG_SIZE; | ||
100 | sge->lkey = 0; | ||
101 | |||
102 | sge = rds_iw_header_sge(ic, recv->r_sge); | ||
103 | sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); | ||
104 | sge->length = sizeof(struct rds_header); | ||
105 | sge->lkey = 0; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, | ||
110 | struct rds_iw_recv_work *recv) | ||
111 | { | ||
112 | if (recv->r_iwinc) { | ||
113 | rds_inc_put(&recv->r_iwinc->ii_inc); | ||
114 | recv->r_iwinc = NULL; | ||
115 | } | ||
116 | if (recv->r_frag) { | ||
117 | rds_iw_recv_unmap_page(ic, recv); | ||
118 | if (recv->r_frag->f_page) | ||
119 | rds_iw_frag_drop_page(recv->r_frag); | ||
120 | rds_iw_frag_free(recv->r_frag); | ||
121 | recv->r_frag = NULL; | ||
122 | } | ||
123 | } | ||
124 | |||
125 | void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) | ||
126 | { | ||
127 | u32 i; | ||
128 | |||
129 | for (i = 0; i < ic->i_recv_ring.w_nr; i++) | ||
130 | rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); | ||
131 | |||
132 | if (ic->i_frag.f_page) | ||
133 | rds_iw_frag_drop_page(&ic->i_frag); | ||
134 | } | ||
135 | |||
136 | static int rds_iw_recv_refill_one(struct rds_connection *conn, | ||
137 | struct rds_iw_recv_work *recv, | ||
138 | gfp_t kptr_gfp, gfp_t page_gfp) | ||
139 | { | ||
140 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
141 | dma_addr_t dma_addr; | ||
142 | struct ib_sge *sge; | ||
143 | int ret = -ENOMEM; | ||
144 | |||
145 | if (recv->r_iwinc == NULL) { | ||
146 | if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) { | ||
147 | rds_iw_stats_inc(s_iw_rx_alloc_limit); | ||
148 | goto out; | ||
149 | } | ||
150 | recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, | ||
151 | kptr_gfp); | ||
152 | if (recv->r_iwinc == NULL) | ||
153 | goto out; | ||
154 | atomic_inc(&rds_iw_allocation); | ||
155 | INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); | ||
156 | rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); | ||
157 | } | ||
158 | |||
159 | if (recv->r_frag == NULL) { | ||
160 | recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); | ||
161 | if (recv->r_frag == NULL) | ||
162 | goto out; | ||
163 | INIT_LIST_HEAD(&recv->r_frag->f_item); | ||
164 | recv->r_frag->f_page = NULL; | ||
165 | } | ||
166 | |||
167 | if (ic->i_frag.f_page == NULL) { | ||
168 | ic->i_frag.f_page = alloc_page(page_gfp); | ||
169 | if (ic->i_frag.f_page == NULL) | ||
170 | goto out; | ||
171 | ic->i_frag.f_offset = 0; | ||
172 | } | ||
173 | |||
174 | dma_addr = ib_dma_map_page(ic->i_cm_id->device, | ||
175 | ic->i_frag.f_page, | ||
176 | ic->i_frag.f_offset, | ||
177 | RDS_FRAG_SIZE, | ||
178 | DMA_FROM_DEVICE); | ||
179 | if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) | ||
180 | goto out; | ||
181 | |||
182 | /* | ||
183 | * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_recv_unmap_page() | ||
184 | * must be called on this recv. This happens as completions hit | ||
185 | * in order or on connection shutdown. | ||
186 | */ | ||
187 | recv->r_frag->f_page = ic->i_frag.f_page; | ||
188 | recv->r_frag->f_offset = ic->i_frag.f_offset; | ||
189 | recv->r_frag->f_mapped = dma_addr; | ||
190 | |||
191 | sge = rds_iw_data_sge(ic, recv->r_sge); | ||
192 | sge->addr = dma_addr; | ||
193 | sge->length = RDS_FRAG_SIZE; | ||
194 | |||
195 | sge = rds_iw_header_sge(ic, recv->r_sge); | ||
196 | sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); | ||
197 | sge->length = sizeof(struct rds_header); | ||
198 | |||
199 | get_page(recv->r_frag->f_page); | ||
200 | |||
201 | if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { | ||
202 | ic->i_frag.f_offset += RDS_FRAG_SIZE; | ||
203 | } else { | ||
204 | put_page(ic->i_frag.f_page); | ||
205 | ic->i_frag.f_page = NULL; | ||
206 | ic->i_frag.f_offset = 0; | ||
207 | } | ||
208 | |||
209 | ret = 0; | ||
210 | out: | ||
211 | return ret; | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * This tries to allocate and post unused work requests after making sure that | ||
216 | * they have all the allocations they need to queue received fragments into | ||
217 | * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc | ||
218 | * pairs don't go unmatched. | ||
219 | * | ||
220 | * -1 is returned if posting fails due to temporary resource exhaustion. | ||
221 | */ | ||
222 | int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | ||
223 | gfp_t page_gfp, int prefill) | ||
224 | { | ||
225 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
226 | struct rds_iw_recv_work *recv; | ||
227 | struct ib_recv_wr *failed_wr; | ||
228 | unsigned int posted = 0; | ||
229 | int ret = 0; | ||
230 | u32 pos; | ||
231 | |||
232 | while ((prefill || rds_conn_up(conn)) | ||
233 | && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { | ||
234 | if (pos >= ic->i_recv_ring.w_nr) { | ||
235 | printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", | ||
236 | pos); | ||
237 | ret = -EINVAL; | ||
238 | break; | ||
239 | } | ||
240 | |||
241 | recv = &ic->i_recvs[pos]; | ||
242 | ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); | ||
243 | if (ret) { | ||
244 | ret = -1; | ||
245 | break; | ||
246 | } | ||
247 | |||
248 | /* XXX when can this fail? */ | ||
249 | ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); | ||
250 | rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, | ||
251 | recv->r_iwinc, recv->r_frag->f_page, | ||
252 | (long) recv->r_frag->f_mapped, ret); | ||
253 | if (ret) { | ||
254 | rds_iw_conn_error(conn, "recv post on " | ||
255 | "%pI4 returned %d, disconnecting and " | ||
256 | "reconnecting\n", &conn->c_faddr, | ||
257 | ret); | ||
258 | ret = -1; | ||
259 | break; | ||
260 | } | ||
261 | |||
262 | posted++; | ||
263 | } | ||
264 | |||
265 | /* We're doing flow control - update the window. */ | ||
266 | if (ic->i_flowctl && posted) | ||
267 | rds_iw_advertise_credits(conn, posted); | ||
268 | |||
269 | if (ret) | ||
270 | rds_iw_ring_unalloc(&ic->i_recv_ring, 1); | ||
271 | return ret; | ||
272 | } | ||
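The prefill flag separates the initial fill of the ring at connection-establishment time, when sleeping allocations are acceptable, from the fast-path refill driven by completions. The call sites live elsewhere in the patch; the GFP choices below are an assumption for illustration.

/* Assumed call-site sketch -- the real call sites are elsewhere in the patch. */

/* At connection establishment: sleeping allocations are fine, fill the ring. */
rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);

/* From the receive completion path: must not sleep; a failure here simply
 * means we try again on the next completion. */
rds_iw_recv_refill(conn, GFP_NOWAIT, GFP_NOWAIT, 0);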
273 | |||
274 | void rds_iw_inc_purge(struct rds_incoming *inc) | ||
275 | { | ||
276 | struct rds_iw_incoming *iwinc; | ||
277 | struct rds_page_frag *frag; | ||
278 | struct rds_page_frag *pos; | ||
279 | |||
280 | iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); | ||
281 | rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); | ||
282 | |||
283 | list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { | ||
284 | list_del_init(&frag->f_item); | ||
285 | rds_iw_frag_drop_page(frag); | ||
286 | rds_iw_frag_free(frag); | ||
287 | } | ||
288 | } | ||
289 | |||
290 | void rds_iw_inc_free(struct rds_incoming *inc) | ||
291 | { | ||
292 | struct rds_iw_incoming *iwinc; | ||
293 | |||
294 | iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); | ||
295 | |||
296 | rds_iw_inc_purge(inc); | ||
297 | rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); | ||
298 | BUG_ON(!list_empty(&iwinc->ii_frags)); | ||
299 | kmem_cache_free(rds_iw_incoming_slab, iwinc); | ||
300 | atomic_dec(&rds_iw_allocation); | ||
301 | BUG_ON(atomic_read(&rds_iw_allocation) < 0); | ||
302 | } | ||
303 | |||
304 | int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | ||
305 | size_t size) | ||
306 | { | ||
307 | struct rds_iw_incoming *iwinc; | ||
308 | struct rds_page_frag *frag; | ||
309 | struct iovec *iov = first_iov; | ||
310 | unsigned long to_copy; | ||
311 | unsigned long frag_off = 0; | ||
312 | unsigned long iov_off = 0; | ||
313 | int copied = 0; | ||
314 | int ret; | ||
315 | u32 len; | ||
316 | |||
317 | iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); | ||
318 | frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); | ||
319 | len = be32_to_cpu(inc->i_hdr.h_len); | ||
320 | |||
321 | while (copied < size && copied < len) { | ||
322 | if (frag_off == RDS_FRAG_SIZE) { | ||
323 | frag = list_entry(frag->f_item.next, | ||
324 | struct rds_page_frag, f_item); | ||
325 | frag_off = 0; | ||
326 | } | ||
327 | while (iov_off == iov->iov_len) { | ||
328 | iov_off = 0; | ||
329 | iov++; | ||
330 | } | ||
331 | |||
332 | to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); | ||
333 | to_copy = min_t(size_t, to_copy, size - copied); | ||
334 | to_copy = min_t(unsigned long, to_copy, len - copied); | ||
335 | |||
336 | rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " | ||
337 | "[%p, %lu] + %lu\n", | ||
338 | to_copy, iov->iov_base, iov->iov_len, iov_off, | ||
339 | frag->f_page, frag->f_offset, frag_off); | ||
340 | |||
341 | /* XXX needs + offset for multiple recvs per page */ | ||
342 | ret = rds_page_copy_to_user(frag->f_page, | ||
343 | frag->f_offset + frag_off, | ||
344 | iov->iov_base + iov_off, | ||
345 | to_copy); | ||
346 | if (ret) { | ||
347 | copied = ret; | ||
348 | break; | ||
349 | } | ||
350 | |||
351 | iov_off += to_copy; | ||
352 | frag_off += to_copy; | ||
353 | copied += to_copy; | ||
354 | } | ||
355 | |||
356 | return copied; | ||
357 | } | ||
358 | |||
359 | /* ic starts out kzalloc()ed */ | ||
360 | void rds_iw_recv_init_ack(struct rds_iw_connection *ic) | ||
361 | { | ||
362 | struct ib_send_wr *wr = &ic->i_ack_wr; | ||
363 | struct ib_sge *sge = &ic->i_ack_sge; | ||
364 | |||
365 | sge->addr = ic->i_ack_dma; | ||
366 | sge->length = sizeof(struct rds_header); | ||
367 | sge->lkey = rds_iw_local_dma_lkey(ic); | ||
368 | |||
369 | wr->sg_list = sge; | ||
370 | wr->num_sge = 1; | ||
371 | wr->opcode = IB_WR_SEND; | ||
372 | wr->wr_id = RDS_IW_ACK_WR_ID; | ||
373 | wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
374 | } | ||
375 | |||
376 | /* | ||
377 | * You'd think that with reliable IB connections you wouldn't need to ack | ||
378 | * messages that have been received. The problem is that IB hardware generates | ||
379 | * an ack message before it has DMAed the message into memory. This creates the | ||
380 | * potential for message loss if the HCA is disabled for any reason between | ||
381 | * sending the ack and the message being DMAed and processed. This is only a | ||
382 | * potential issue if another HCA is available for fail-over. | ||
383 | * | ||
384 | * When the remote host receives our ack they'll free the sent message from | ||
385 | * their send queue. To decrease the latency of this we always send an ack | ||
386 | * immediately after we've received messages. | ||
387 | * | ||
388 | * For simplicity, we only have one ack in flight at a time. This puts | ||
389 | * pressure on senders to have deep enough send queues to absorb the latency of | ||
390 | * a single ack frame being in flight. This might not be good enough. | ||
391 | * | ||
392 | * This is implemented by having a long-lived send_wr and sge which point to a | ||
393 | * statically allocated ack frame. This ack wr does not fall under the ring | ||
394 | * accounting that the tx and rx wrs do. The QP attribute specifically makes | ||
395 | * room for it beyond the ring size. Send completion notices its special | ||
396 | * wr_id and avoids working with the ring in that case. | ||
397 | */ | ||
398 | static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, | ||
399 | int ack_required) | ||
400 | { | ||
401 | rds_iw_set_64bit(&ic->i_ack_next, seq); | ||
402 | if (ack_required) { | ||
403 | smp_mb__before_clear_bit(); | ||
404 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
405 | } | ||
406 | } | ||
407 | |||
408 | static u64 rds_iw_get_ack(struct rds_iw_connection *ic) | ||
409 | { | ||
410 | clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
411 | smp_mb__after_clear_bit(); | ||
412 | |||
413 | return ic->i_ack_next; | ||
414 | } | ||
415 | |||
416 | static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) | ||
417 | { | ||
418 | struct rds_header *hdr = ic->i_ack; | ||
419 | struct ib_send_wr *failed_wr; | ||
420 | u64 seq; | ||
421 | int ret; | ||
422 | |||
423 | seq = rds_iw_get_ack(ic); | ||
424 | |||
425 | rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); | ||
426 | rds_message_populate_header(hdr, 0, 0, 0); | ||
427 | hdr->h_ack = cpu_to_be64(seq); | ||
428 | hdr->h_credit = adv_credits; | ||
429 | rds_message_make_checksum(hdr); | ||
430 | ic->i_ack_queued = jiffies; | ||
431 | |||
432 | ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); | ||
433 | if (unlikely(ret)) { | ||
434 | /* Failed to send. Release the WR, and | ||
435 | * force another ACK. | ||
436 | */ | ||
437 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
438 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
439 | |||
440 | rds_iw_stats_inc(s_iw_ack_send_failure); | ||
441 | /* Need to finesse this later. */ | ||
442 | BUG(); | ||
443 | } else | ||
444 | rds_iw_stats_inc(s_iw_ack_sent); | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * There are 3 ways of getting acknowledgements to the peer: | ||
449 | * 1. We call rds_iw_attempt_ack from the recv completion handler | ||
450 | * to send an ACK-only frame. | ||
451 | * However, there can be only one such frame in the send queue | ||
452 | * at any time, so we may have to postpone it. | ||
453 | * 2. When another (data) packet is transmitted while there's | ||
454 | * an ACK in the queue, we piggyback the ACK sequence number | ||
455 | * on the data packet. | ||
456 | * 3. If the ACK WR is done sending, we get called from the | ||
457 | * send queue completion handler, and check whether there's | ||
458 | * another ACK pending (postponed because the WR was on the | ||
459 | * queue). If so, we transmit it. | ||
460 | * | ||
461 | * We maintain 2 variables: | ||
462 | * - i_ack_flags, which keeps track of whether the ACK WR | ||
463 | * is currently in the send queue or not (IB_ACK_IN_FLIGHT) | ||
464 | * - i_ack_next, which is the last sequence number we received | ||
465 | * | ||
466 | * Potentially, send queue and receive queue handlers can run concurrently. | ||
467 | * | ||
468 | * Reconnecting complicates this picture just slightly. When we | ||
469 | * reconnect, we may be seeing duplicate packets. The peer | ||
470 | * is retransmitting them, because it hasn't seen an ACK for | ||
471 | * them. It is important that we ACK these. | ||
472 | * | ||
473 | * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with | ||
474 | * this flag set *MUST* be acknowledged immediately. | ||
475 | */ | ||
476 | |||
477 | /* | ||
478 | * When we get here, we're called from the recv queue handler. | ||
479 | * Check whether we ought to transmit an ACK. | ||
480 | */ | ||
481 | void rds_iw_attempt_ack(struct rds_iw_connection *ic) | ||
482 | { | ||
483 | unsigned int adv_credits; | ||
484 | |||
485 | if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) | ||
486 | return; | ||
487 | |||
488 | if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { | ||
489 | rds_iw_stats_inc(s_iw_ack_send_delayed); | ||
490 | return; | ||
491 | } | ||
492 | |||
493 | /* Can we get a send credit? */ | ||
494 | if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) { | ||
495 | rds_iw_stats_inc(s_iw_tx_throttle); | ||
496 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
497 | return; | ||
498 | } | ||
499 | |||
500 | clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
501 | rds_iw_send_ack(ic, adv_credits); | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * We get here from the send completion handler, when the | ||
506 | * adapter tells us the ACK frame was sent. | ||
507 | */ | ||
508 | void rds_iw_ack_send_complete(struct rds_iw_connection *ic) | ||
509 | { | ||
510 | clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); | ||
511 | rds_iw_attempt_ack(ic); | ||
512 | } | ||
513 | |||
514 | /* | ||
515 | * This is called by the regular xmit code when it wants to piggyback | ||
516 | * an ACK on an outgoing frame. | ||
517 | */ | ||
518 | u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) | ||
519 | { | ||
520 | if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) | ||
521 | rds_iw_stats_inc(s_iw_ack_send_piggybacked); | ||
522 | return rds_iw_get_ack(ic); | ||
523 | } | ||
524 | |||
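The interplay of the two ACK flag bits above is easiest to see in isolation. Below is a minimal userspace sketch, not part of the patch: the plain ints and helpers only mirror IB_ACK_REQUESTED, IB_ACK_IN_FLIGHT and the rds_iw_* functions by analogy, and the atomicity and piggybacking paths of the real code are omitted.

#include <stdio.h>

static int ack_requested;                 /* models IB_ACK_REQUESTED */
static int ack_in_flight;                 /* models IB_ACK_IN_FLIGHT */
static unsigned long long ack_next;       /* models ic->i_ack_next   */

static void rx_frame(unsigned long long seq, int ack_required)
{
	ack_next = seq;                   /* always remember the newest sequence */
	if (ack_required)
		ack_requested = 1;        /* peer asked for an explicit ACK      */
}

static void attempt_ack(void)
{
	if (!ack_requested)
		return;
	if (ack_in_flight)                /* only one ACK WR may be outstanding  */
		return;
	ack_in_flight = 1;
	ack_requested = 0;
	printf("post ACK wr for seq %llu\n", ack_next);
}

static void ack_send_complete(void)
{
	ack_in_flight = 0;                /* the WR finished; the slot is free   */
	attempt_ack();                    /* flush any ACK postponed meanwhile   */
}

int main(void)
{
	rx_frame(1, 1);
	attempt_ack();                    /* posts an ACK for seq 1              */
	rx_frame(2, 1);
	attempt_ack();                    /* postponed: ACK for seq 1 in flight  */
	ack_send_complete();              /* posts an ACK for seq 2              */
	return 0;
}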
525 | /* | ||
526 | * It's kind of lame that we're copying from the posted receive pages into | ||
527 | * long-lived bitmaps. We could have posted the bitmaps and rdma written into | ||
528 | * them. But receiving new congestion bitmaps should be a *rare* event, so | ||
529 | * hopefully we won't need to invest that complexity in making it more | ||
530 | * efficient. By copying we can share a simpler core with TCP which has to | ||
531 | * copy. | ||
532 | */ | ||
533 | static void rds_iw_cong_recv(struct rds_connection *conn, | ||
534 | struct rds_iw_incoming *iwinc) | ||
535 | { | ||
536 | struct rds_cong_map *map; | ||
537 | unsigned int map_off; | ||
538 | unsigned int map_page; | ||
539 | struct rds_page_frag *frag; | ||
540 | unsigned long frag_off; | ||
541 | unsigned long to_copy; | ||
542 | unsigned long copied; | ||
543 | uint64_t uncongested = 0; | ||
544 | void *addr; | ||
545 | |||
546 | /* catch completely corrupt packets */ | ||
547 | if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) | ||
548 | return; | ||
549 | |||
550 | map = conn->c_fcong; | ||
551 | map_page = 0; | ||
552 | map_off = 0; | ||
553 | |||
554 | frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); | ||
555 | frag_off = 0; | ||
556 | |||
557 | copied = 0; | ||
558 | |||
559 | while (copied < RDS_CONG_MAP_BYTES) { | ||
560 | uint64_t *src, *dst; | ||
561 | unsigned int k; | ||
562 | |||
563 | to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); | ||
564 | BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ | ||
565 | |||
566 | addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); | ||
567 | |||
568 | src = addr + frag_off; | ||
569 | dst = (void *)map->m_page_addrs[map_page] + map_off; | ||
570 | for (k = 0; k < to_copy; k += 8) { | ||
571 | /* Record ports that became uncongested, i.e. | ||
572 | * bits set in the old map but clear in the new one. */ | ||
573 | uncongested |= ~(*src) & *dst; | ||
574 | *dst++ = *src++; | ||
575 | } | ||
576 | kunmap_atomic(addr, KM_SOFTIRQ0); | ||
577 | |||
578 | copied += to_copy; | ||
579 | |||
580 | map_off += to_copy; | ||
581 | if (map_off == PAGE_SIZE) { | ||
582 | map_off = 0; | ||
583 | map_page++; | ||
584 | } | ||
585 | |||
586 | frag_off += to_copy; | ||
587 | if (frag_off == RDS_FRAG_SIZE) { | ||
588 | frag = list_entry(frag->f_item.next, | ||
589 | struct rds_page_frag, f_item); | ||
590 | frag_off = 0; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | /* the congestion map is in little endian order */ | ||
595 | uncongested = le64_to_cpu(uncongested); | ||
596 | |||
597 | rds_cong_map_updated(map, uncongested); | ||
598 | } | ||
599 | |||
600 | /* | ||
601 | * Rings are posted with all the allocations they'll need to queue the | ||
602 | * incoming message to the receiving socket so this can't fail. | ||
603 | * All fragments start with a header, so we can make sure we're not receiving | ||
604 | * garbage, and we can tell a small 8 byte fragment from an ACK frame. | ||
605 | */ | ||
606 | struct rds_iw_ack_state { | ||
607 | u64 ack_next; | ||
608 | u64 ack_recv; | ||
609 | unsigned int ack_required:1; | ||
610 | unsigned int ack_next_valid:1; | ||
611 | unsigned int ack_recv_valid:1; | ||
612 | }; | ||
613 | |||
614 | static void rds_iw_process_recv(struct rds_connection *conn, | ||
615 | struct rds_iw_recv_work *recv, u32 byte_len, | ||
616 | struct rds_iw_ack_state *state) | ||
617 | { | ||
618 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
619 | struct rds_iw_incoming *iwinc = ic->i_iwinc; | ||
620 | struct rds_header *ihdr, *hdr; | ||
621 | |||
622 | /* XXX shut down the connection if port 0,0 are seen? */ | ||
623 | |||
624 | rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, | ||
625 | byte_len); | ||
626 | |||
627 | if (byte_len < sizeof(struct rds_header)) { | ||
628 | rds_iw_conn_error(conn, "incoming message " | ||
629 | "from %pI4 didn't include a " | ||
630 | "header, disconnecting and " | ||
631 | "reconnecting\n", | ||
632 | &conn->c_faddr); | ||
633 | return; | ||
634 | } | ||
635 | byte_len -= sizeof(struct rds_header); | ||
636 | |||
637 | ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; | ||
638 | |||
639 | /* Validate the checksum. */ | ||
640 | if (!rds_message_verify_checksum(ihdr)) { | ||
641 | rds_iw_conn_error(conn, "incoming message " | ||
642 | "from %pI4 has corrupted header - " | ||
643 | "forcing a reconnect\n", | ||
644 | &conn->c_faddr); | ||
645 | rds_stats_inc(s_recv_drop_bad_checksum); | ||
646 | return; | ||
647 | } | ||
648 | |||
649 | /* Process the ACK sequence which comes with every packet */ | ||
650 | state->ack_recv = be64_to_cpu(ihdr->h_ack); | ||
651 | state->ack_recv_valid = 1; | ||
652 | |||
653 | /* Process the credits update if there was one */ | ||
654 | if (ihdr->h_credit) | ||
655 | rds_iw_send_add_credits(conn, ihdr->h_credit); | ||
656 | |||
657 | if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { | ||
658 | /* This is an ACK-only packet. The reason it gets | ||
659 | * special treatment here is that historically, ACKs | ||
660 | * were rather special beasts. | ||
661 | */ | ||
662 | rds_iw_stats_inc(s_iw_ack_received); | ||
663 | |||
664 | /* | ||
665 | * Usually the frags make their way on to incs and are then freed as | ||
666 | * the inc is freed. We don't go that route, so we have to drop the | ||
667 | * page ref ourselves. We can't just leave the page on the recv | ||
668 | * because that confuses the dma mapping of pages and each recv's use | ||
669 | * of a partial page. We can leave the frag, though, it will be | ||
670 | * reused. | ||
671 | * | ||
672 | * FIXME: Fold this into the code path below. | ||
673 | */ | ||
674 | rds_iw_frag_drop_page(recv->r_frag); | ||
675 | return; | ||
676 | } | ||
677 | |||
678 | /* | ||
679 | * If we don't already have an inc on the connection then this | ||
680 | * fragment has a header and starts a message; copy its header | ||
681 | * into the inc and save the inc so we can hang upcoming fragments | ||
682 | * off its list. | ||
683 | */ | ||
684 | if (iwinc == NULL) { | ||
685 | iwinc = recv->r_iwinc; | ||
686 | recv->r_iwinc = NULL; | ||
687 | ic->i_iwinc = iwinc; | ||
688 | |||
689 | hdr = &iwinc->ii_inc.i_hdr; | ||
690 | memcpy(hdr, ihdr, sizeof(*hdr)); | ||
691 | ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); | ||
692 | |||
693 | rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, | ||
694 | ic->i_recv_data_rem, hdr->h_flags); | ||
695 | } else { | ||
696 | hdr = &iwinc->ii_inc.i_hdr; | ||
697 | /* We can't just use memcmp here; fragments of a | ||
698 | * single message may carry different ACKs */ | ||
699 | if (hdr->h_sequence != ihdr->h_sequence | ||
700 | || hdr->h_len != ihdr->h_len | ||
701 | || hdr->h_sport != ihdr->h_sport | ||
702 | || hdr->h_dport != ihdr->h_dport) { | ||
703 | rds_iw_conn_error(conn, | ||
704 | "fragment header mismatch; forcing reconnect\n"); | ||
705 | return; | ||
706 | } | ||
707 | } | ||
708 | |||
709 | list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); | ||
710 | recv->r_frag = NULL; | ||
711 | |||
712 | if (ic->i_recv_data_rem > RDS_FRAG_SIZE) | ||
713 | ic->i_recv_data_rem -= RDS_FRAG_SIZE; | ||
714 | else { | ||
715 | ic->i_recv_data_rem = 0; | ||
716 | ic->i_iwinc = NULL; | ||
717 | |||
718 | if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) | ||
719 | rds_iw_cong_recv(conn, iwinc); | ||
720 | else { | ||
721 | rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, | ||
722 | &iwinc->ii_inc, GFP_ATOMIC, | ||
723 | KM_SOFTIRQ0); | ||
724 | state->ack_next = be64_to_cpu(hdr->h_sequence); | ||
725 | state->ack_next_valid = 1; | ||
726 | } | ||
727 | |||
728 | /* Evaluate the ACK_REQUIRED flag *after* we received | ||
729 | * the complete frame, and after bumping the next_rx | ||
730 | * sequence. */ | ||
731 | if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { | ||
732 | rds_stats_inc(s_recv_ack_required); | ||
733 | state->ack_required = 1; | ||
734 | } | ||
735 | |||
736 | rds_inc_put(&iwinc->ii_inc); | ||
737 | } | ||
738 | } | ||
739 | |||
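As a rough illustration of how i_recv_data_rem drives reassembly in rds_iw_process_recv above, here is a small userspace sketch (not part of the patch; FRAG_SIZE stands in for RDS_FRAG_SIZE and is assumed to be 4 KiB):

#include <stdio.h>

#define FRAG_SIZE 4096u   /* stands in for RDS_FRAG_SIZE (assumed 4 KiB) */

int main(void)
{
	unsigned int h_len = 10000;      /* byte length from the first header */
	unsigned int rem = h_len;        /* models ic->i_recv_data_rem        */
	unsigned int frags = 0;

	do {
		frags++;
		if (rem > FRAG_SIZE)
			rem -= FRAG_SIZE;    /* more fragments still to come          */
		else
			rem = 0;             /* last fragment: the inc is handed up   */
	} while (rem);

	/* Prints: 10000-byte message arrives in 3 fragments */
	printf("%u-byte message arrives in %u fragments\n", h_len, frags);
	return 0;
}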
740 | /* | ||
741 | * Plucking the oldest entry from the ring can be done concurrently with | ||
742 | * the thread refilling the ring. Each ring operation is protected by | ||
743 | * spinlocks and the transient state of refilling doesn't change the | ||
744 | * recording of which entry is oldest. | ||
745 | * | ||
746 | * This relies on IB only calling one cq comp_handler for each cq so that | ||
747 | * there will only be one caller of rds_recv_incoming() per RDS connection. | ||
748 | */ | ||
749 | void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) | ||
750 | { | ||
751 | struct rds_connection *conn = context; | ||
752 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
753 | struct ib_wc wc; | ||
754 | struct rds_iw_ack_state state = { 0, }; | ||
755 | struct rds_iw_recv_work *recv; | ||
756 | |||
757 | rdsdebug("conn %p cq %p\n", conn, cq); | ||
758 | |||
759 | rds_iw_stats_inc(s_iw_rx_cq_call); | ||
760 | |||
761 | ib_req_notify_cq(cq, IB_CQ_SOLICITED); | ||
762 | |||
763 | while (ib_poll_cq(cq, 1, &wc) > 0) { | ||
764 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | ||
765 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | ||
766 | be32_to_cpu(wc.ex.imm_data)); | ||
767 | rds_iw_stats_inc(s_iw_rx_cq_event); | ||
768 | |||
769 | recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; | ||
770 | |||
771 | rds_iw_recv_unmap_page(ic, recv); | ||
772 | |||
773 | /* | ||
774 | * Also process recvs in connecting state because it is possible | ||
775 | * to get a recv completion _before_ the rdmacm ESTABLISHED | ||
776 | * event is processed. | ||
777 | */ | ||
778 | if (rds_conn_up(conn) || rds_conn_connecting(conn)) { | ||
779 | /* We expect errors as the qp is drained during shutdown */ | ||
780 | if (wc.status == IB_WC_SUCCESS) { | ||
781 | rds_iw_process_recv(conn, recv, wc.byte_len, &state); | ||
782 | } else { | ||
783 | rds_iw_conn_error(conn, "recv completion on " | ||
784 | "%pI4 had status %u, disconnecting and " | ||
785 | "reconnecting\n", &conn->c_faddr, | ||
786 | wc.status); | ||
787 | } | ||
788 | } | ||
789 | |||
790 | rds_iw_ring_free(&ic->i_recv_ring, 1); | ||
791 | } | ||
792 | |||
793 | if (state.ack_next_valid) | ||
794 | rds_iw_set_ack(ic, state.ack_next, state.ack_required); | ||
795 | if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { | ||
796 | rds_send_drop_acked(conn, state.ack_recv, NULL); | ||
797 | ic->i_ack_recv = state.ack_recv; | ||
798 | } | ||
799 | if (rds_conn_up(conn)) | ||
800 | rds_iw_attempt_ack(ic); | ||
801 | |||
802 | /* If we ever end up with a really empty receive ring, we're | ||
803 | * in deep trouble, as the sender will definitely see RNR | ||
804 | * timeouts. */ | ||
805 | if (rds_iw_ring_empty(&ic->i_recv_ring)) | ||
806 | rds_iw_stats_inc(s_iw_rx_ring_empty); | ||
807 | |||
808 | /* | ||
809 | * If the ring is running low, then schedule the thread to refill. | ||
810 | */ | ||
811 | if (rds_iw_ring_low(&ic->i_recv_ring)) | ||
812 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | ||
813 | } | ||
814 | |||
815 | int rds_iw_recv(struct rds_connection *conn) | ||
816 | { | ||
817 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
818 | int ret = 0; | ||
819 | |||
820 | rdsdebug("conn %p\n", conn); | ||
821 | |||
822 | /* | ||
823 | * If we get a temporary posting failure in this context then | ||
824 | * we're really low and we want the caller to back off for a bit. | ||
825 | */ | ||
826 | mutex_lock(&ic->i_recv_mutex); | ||
827 | if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) | ||
828 | ret = -ENOMEM; | ||
829 | else | ||
830 | rds_iw_stats_inc(s_iw_rx_refill_from_thread); | ||
831 | mutex_unlock(&ic->i_recv_mutex); | ||
832 | |||
833 | if (rds_conn_up(conn)) | ||
834 | rds_iw_attempt_ack(ic); | ||
835 | |||
836 | return ret; | ||
837 | } | ||
838 | |||
839 | int __init rds_iw_recv_init(void) | ||
840 | { | ||
841 | struct sysinfo si; | ||
842 | int ret = -ENOMEM; | ||
843 | |||
844 | /* Default to about a third of all available RAM for recv memory */ | ||
845 | si_meminfo(&si); | ||
846 | rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; | ||
847 | |||
848 | rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", | ||
849 | sizeof(struct rds_iw_incoming), | ||
850 | 0, 0, NULL); | ||
851 | if (rds_iw_incoming_slab == NULL) | ||
852 | goto out; | ||
853 | |||
854 | rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", | ||
855 | sizeof(struct rds_page_frag), | ||
856 | 0, 0, NULL); | ||
857 | if (rds_iw_frag_slab == NULL) | ||
858 | kmem_cache_destroy(rds_iw_incoming_slab); | ||
859 | else | ||
860 | ret = 0; | ||
861 | out: | ||
862 | return ret; | ||
863 | } | ||
864 | |||
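For a sense of scale, the sizing arithmetic in rds_iw_recv_init works out as follows (illustrative only; the 4 GiB machine, 4 KiB page size and RDS_FRAG_SIZE == 4096 are assumptions):

#include <stdio.h>

int main(void)
{
	unsigned long totalram_pages = 1048576;   /* 4 GiB of 4 KiB pages, as si_meminfo() would report */
	unsigned long page_size = 4096;
	unsigned long frag_size = 4096;           /* assumed RDS_FRAG_SIZE */

	unsigned long max_recv_allocation =
		totalram_pages / 3 * page_size / frag_size;

	/* With the fragment size equal to the page size this is simply a
	 * third of all pages: 349525 receive fragments may be allocated. */
	printf("max recv allocation: %lu fragments\n", max_recv_allocation);
	return 0;
}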
865 | void rds_iw_recv_exit(void) | ||
866 | { | ||
867 | kmem_cache_destroy(rds_iw_incoming_slab); | ||
868 | kmem_cache_destroy(rds_iw_frag_slab); | ||
869 | } | ||
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c new file mode 100644 index 000000000000..d422d4b5deef --- /dev/null +++ b/net/rds/iw_ring.c | |||
@@ -0,0 +1,169 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | #include "iw.h" | ||
37 | |||
38 | /* | ||
39 | * Locking for IB rings. | ||
40 | * We assume that allocation is always protected by a mutex | ||
41 | * in the caller (this is a valid assumption for the current | ||
42 | * implementation). | ||
43 | * | ||
44 | * Freeing always happens in an interrupt, and hence only | ||
45 | * races with allocations, but not with other free()s. | ||
46 | * | ||
47 | * The interaction between allocation and freeing is that | ||
48 | * the alloc code has to determine the number of free entries. | ||
49 | * To this end, we maintain two counters; an allocation counter | ||
50 | * and a free counter. Both are allowed to run freely, and wrap | ||
51 | * around. | ||
52 | * The number of used entries is always (alloc_ctr - free_ctr) % NR. | ||
53 | * | ||
54 | * The current implementation makes free_ctr atomic. When the | ||
55 | * caller finds an allocation fails, it should set an "alloc fail" | ||
56 | * bit and retry the allocation. The "alloc fail" bit essentially tells | ||
57 | * the CQ completion handlers to wake it up after freeing some | ||
58 | * more entries. | ||
59 | */ | ||
60 | |||
61 | /* | ||
62 | * This only happens on shutdown. | ||
63 | */ | ||
64 | DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); | ||
65 | |||
66 | void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) | ||
67 | { | ||
68 | memset(ring, 0, sizeof(*ring)); | ||
69 | ring->w_nr = nr; | ||
70 | rdsdebug("ring %p nr %u\n", ring, ring->w_nr); | ||
71 | } | ||
72 | |||
73 | static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) | ||
74 | { | ||
75 | u32 diff; | ||
76 | |||
77 | /* This assumes that atomic_t has at least as many bits as u32 */ | ||
78 | diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); | ||
79 | BUG_ON(diff > ring->w_nr); | ||
80 | |||
81 | return diff; | ||
82 | } | ||
83 | |||
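The "counters run freely and wrap around" rule in the locking comment above relies on ordinary unsigned arithmetic. A tiny standalone sketch (not part of the patch) of why the subtraction in __rds_iw_ring_used stays correct across a wrap:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Both counters run freely and are allowed to wrap. */
	uint32_t alloc_ctr = 0xfffffff0u;   /* about to wrap past zero */
	uint32_t free_ctr  = 0xffffffe8u;

	/* Unsigned subtraction is defined modulo 2^32, so the difference
	 * is still the number of in-use entries. */
	uint32_t used = alloc_ctr - free_ctr;
	printf("used entries: %u\n", used);             /* prints 8  */

	alloc_ctr += 0x20;                              /* wraps around */
	used = alloc_ctr - free_ctr;
	printf("used entries after wrap: %u\n", used);  /* prints 40 */
	return 0;
}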
84 | void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) | ||
85 | { | ||
86 | /* We only ever get called from the connection setup code, | ||
87 | * prior to creating the QP. */ | ||
88 | BUG_ON(__rds_iw_ring_used(ring)); | ||
89 | ring->w_nr = nr; | ||
90 | } | ||
91 | |||
92 | static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) | ||
93 | { | ||
94 | return __rds_iw_ring_used(ring) == 0; | ||
95 | } | ||
96 | |||
97 | u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) | ||
98 | { | ||
99 | u32 ret = 0, avail; | ||
100 | |||
101 | avail = ring->w_nr - __rds_iw_ring_used(ring); | ||
102 | |||
103 | rdsdebug("ring %p val %u next %u free %u\n", ring, val, | ||
104 | ring->w_alloc_ptr, avail); | ||
105 | |||
106 | if (val && avail) { | ||
107 | ret = min(val, avail); | ||
108 | *pos = ring->w_alloc_ptr; | ||
109 | |||
110 | ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; | ||
111 | ring->w_alloc_ctr += ret; | ||
112 | } | ||
113 | |||
114 | return ret; | ||
115 | } | ||
116 | |||
117 | void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) | ||
118 | { | ||
119 | ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; | ||
120 | atomic_add(val, &ring->w_free_ctr); | ||
121 | |||
122 | if (__rds_iw_ring_empty(ring) && | ||
123 | waitqueue_active(&rds_iw_ring_empty_wait)) | ||
124 | wake_up(&rds_iw_ring_empty_wait); | ||
125 | } | ||
126 | |||
127 | void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) | ||
128 | { | ||
129 | ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; | ||
130 | ring->w_alloc_ctr -= val; | ||
131 | } | ||
132 | |||
133 | int rds_iw_ring_empty(struct rds_iw_work_ring *ring) | ||
134 | { | ||
135 | return __rds_iw_ring_empty(ring); | ||
136 | } | ||
137 | |||
138 | int rds_iw_ring_low(struct rds_iw_work_ring *ring) | ||
139 | { | ||
140 | return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2); | ||
141 | } | ||
142 | |||
143 | |||
144 | /* | ||
145 | * returns the oldest alloced ring entry. This will be the next one | ||
146 | * freed. This can't be called if there are none allocated. | ||
147 | */ | ||
148 | u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) | ||
149 | { | ||
150 | return ring->w_free_ptr; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * returns the number of completed work requests. | ||
155 | */ | ||
156 | |||
157 | u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) | ||
158 | { | ||
159 | u32 ret; | ||
160 | |||
161 | if (oldest <= (unsigned long long)wr_id) | ||
162 | ret = (unsigned long long)wr_id - oldest + 1; | ||
163 | else | ||
164 | ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; | ||
165 | |||
166 | rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, | ||
167 | wr_id, oldest); | ||
168 | return ret; | ||
169 | } | ||
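To make the wrap handling in rds_iw_ring_completed concrete, here is a minimal userspace re-statement of the same arithmetic with two worked inputs (illustrative only, not part of the patch):

#include <stdio.h>

static unsigned int completed(unsigned int nr, unsigned int wr_id,
			      unsigned int oldest)
{
	/* Count entries from the oldest outstanding slot up to and
	 * including wr_id, walking around the end of the ring if needed. */
	if (oldest <= wr_id)
		return wr_id - oldest + 1;
	return nr - oldest + wr_id + 1;
}

int main(void)
{
	printf("%u\n", completed(256, 7, 3));    /* 5: slots 3..7              */
	printf("%u\n", completed(256, 3, 250));  /* 10: slots 250..255, 0..3   */
	return 0;
}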
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c new file mode 100644 index 000000000000..22dd38ffd608 --- /dev/null +++ b/net/rds/iw_send.c | |||
@@ -0,0 +1,975 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | #include <linux/device.h> | ||
36 | #include <linux/dmapool.h> | ||
37 | |||
38 | #include "rds.h" | ||
39 | #include "rdma.h" | ||
40 | #include "iw.h" | ||
41 | |||
42 | static void rds_iw_send_rdma_complete(struct rds_message *rm, | ||
43 | int wc_status) | ||
44 | { | ||
45 | int notify_status; | ||
46 | |||
47 | switch (wc_status) { | ||
48 | case IB_WC_WR_FLUSH_ERR: | ||
49 | return; | ||
50 | |||
51 | case IB_WC_SUCCESS: | ||
52 | notify_status = RDS_RDMA_SUCCESS; | ||
53 | break; | ||
54 | |||
55 | case IB_WC_REM_ACCESS_ERR: | ||
56 | notify_status = RDS_RDMA_REMOTE_ERROR; | ||
57 | break; | ||
58 | |||
59 | default: | ||
60 | notify_status = RDS_RDMA_OTHER_ERROR; | ||
61 | break; | ||
62 | } | ||
63 | rds_rdma_send_complete(rm, notify_status); | ||
64 | } | ||
65 | |||
66 | static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, | ||
67 | struct rds_rdma_op *op) | ||
68 | { | ||
69 | if (op->r_mapped) { | ||
70 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
71 | op->r_sg, op->r_nents, | ||
72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | ||
73 | op->r_mapped = 0; | ||
74 | } | ||
75 | } | ||
76 | |||
77 | static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, | ||
78 | struct rds_iw_send_work *send, | ||
79 | int wc_status) | ||
80 | { | ||
81 | struct rds_message *rm = send->s_rm; | ||
82 | |||
83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | ||
84 | |||
85 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
86 | rm->m_sg, rm->m_nents, | ||
87 | DMA_TO_DEVICE); | ||
88 | |||
89 | if (rm->m_rdma_op != NULL) { | ||
90 | rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); | ||
91 | |||
92 | /* If the user asked for a completion notification on this | ||
93 | * message, we can implement three different semantics: | ||
94 | * 1. Notify when we received the ACK on the RDS message | ||
95 | * that was queued with the RDMA. This provides reliable | ||
96 | * notification of RDMA status at the expense of a one-way | ||
97 | * packet delay. | ||
98 | * 2. Notify when the IB stack gives us the completion event for | ||
99 | * the RDMA operation. | ||
100 | * 3. Notify when the IB stack gives us the completion event for | ||
101 | * the accompanying RDS messages. | ||
102 | * Here, we implement approach #3. To implement approach #2, | ||
103 | * call rds_rdma_send_complete from the cq_handler. To implement #1, | ||
104 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
105 | * handling in the ACK processing code. | ||
106 | * | ||
107 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
108 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
109 | * operation itself unmapped the RDMA buffers, which takes care | ||
110 | * of synching. | ||
111 | */ | ||
112 | rds_iw_send_rdma_complete(rm, wc_status); | ||
113 | |||
114 | if (rm->m_rdma_op->r_write) | ||
115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | ||
116 | else | ||
117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | ||
118 | } | ||
119 | |||
120 | /* If anyone waited for this message to get flushed out, wake | ||
121 | * them up now */ | ||
122 | rds_message_unmapped(rm); | ||
123 | |||
124 | rds_message_put(rm); | ||
125 | send->s_rm = NULL; | ||
126 | } | ||
127 | |||
128 | void rds_iw_send_init_ring(struct rds_iw_connection *ic) | ||
129 | { | ||
130 | struct rds_iw_send_work *send; | ||
131 | u32 i; | ||
132 | |||
133 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | ||
134 | struct ib_sge *sge; | ||
135 | |||
136 | send->s_rm = NULL; | ||
137 | send->s_op = NULL; | ||
138 | send->s_mapping = NULL; | ||
139 | |||
140 | send->s_wr.next = NULL; | ||
141 | send->s_wr.wr_id = i; | ||
142 | send->s_wr.sg_list = send->s_sge; | ||
143 | send->s_wr.num_sge = 1; | ||
144 | send->s_wr.opcode = IB_WR_SEND; | ||
145 | send->s_wr.send_flags = 0; | ||
146 | send->s_wr.ex.imm_data = 0; | ||
147 | |||
148 | sge = rds_iw_data_sge(ic, send->s_sge); | ||
149 | sge->lkey = 0; | ||
150 | |||
151 | sge = rds_iw_header_sge(ic, send->s_sge); | ||
152 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); | ||
153 | sge->length = sizeof(struct rds_header); | ||
154 | sge->lkey = 0; | ||
155 | |||
156 | send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); | ||
157 | if (IS_ERR(send->s_mr)) { | ||
158 | printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n"); | ||
159 | break; | ||
160 | } | ||
161 | |||
162 | send->s_page_list = ib_alloc_fast_reg_page_list( | ||
163 | ic->i_cm_id->device, fastreg_message_size); | ||
164 | if (IS_ERR(send->s_page_list)) { | ||
165 | printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n"); | ||
166 | break; | ||
167 | } | ||
168 | } | ||
169 | } | ||
170 | |||
171 | void rds_iw_send_clear_ring(struct rds_iw_connection *ic) | ||
172 | { | ||
173 | struct rds_iw_send_work *send; | ||
174 | u32 i; | ||
175 | |||
176 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | ||
177 | BUG_ON(!send->s_mr); | ||
178 | ib_dereg_mr(send->s_mr); | ||
179 | BUG_ON(!send->s_page_list); | ||
180 | ib_free_fast_reg_page_list(send->s_page_list); | ||
181 | if (send->s_wr.opcode == 0xdead) | ||
182 | continue; | ||
183 | if (send->s_rm) | ||
184 | rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); | ||
185 | if (send->s_op) | ||
186 | rds_iw_send_unmap_rdma(ic, send->s_op); | ||
187 | } | ||
188 | } | ||
189 | |||
190 | /* | ||
191 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc | ||
192 | * operations performed in the send path. As the sender allocs and potentially | ||
193 | * unallocs the next free entry in the ring, it doesn't alter which entry is | ||
194 | * the next to be freed, which is all this code is concerned with. | ||
195 | */ | ||
196 | void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) | ||
197 | { | ||
198 | struct rds_connection *conn = context; | ||
199 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
200 | struct ib_wc wc; | ||
201 | struct rds_iw_send_work *send; | ||
202 | u32 completed; | ||
203 | u32 oldest; | ||
204 | u32 i; | ||
205 | int ret; | ||
206 | |||
207 | rdsdebug("cq %p conn %p\n", cq, conn); | ||
208 | rds_iw_stats_inc(s_iw_tx_cq_call); | ||
209 | ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); | ||
210 | if (ret) | ||
211 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | ||
212 | |||
213 | while (ib_poll_cq(cq, 1, &wc) > 0) { | ||
214 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | ||
215 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | ||
216 | be32_to_cpu(wc.ex.imm_data)); | ||
217 | rds_iw_stats_inc(s_iw_tx_cq_event); | ||
218 | |||
219 | if (wc.status != IB_WC_SUCCESS) { | ||
220 | printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); | ||
221 | break; | ||
222 | } | ||
223 | |||
224 | if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { | ||
225 | ic->i_fastreg_posted = 0; | ||
226 | continue; | ||
227 | } | ||
228 | |||
229 | if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) { | ||
230 | ic->i_fastreg_posted = 1; | ||
231 | continue; | ||
232 | } | ||
233 | |||
234 | if (wc.wr_id == RDS_IW_ACK_WR_ID) { | ||
235 | if (ic->i_ack_queued + HZ/2 < jiffies) | ||
236 | rds_iw_stats_inc(s_iw_tx_stalled); | ||
237 | rds_iw_ack_send_complete(ic); | ||
238 | continue; | ||
239 | } | ||
240 | |||
241 | oldest = rds_iw_ring_oldest(&ic->i_send_ring); | ||
242 | |||
243 | completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); | ||
244 | |||
245 | for (i = 0; i < completed; i++) { | ||
246 | send = &ic->i_sends[oldest]; | ||
247 | |||
248 | /* In the error case, wc.opcode sometimes contains garbage */ | ||
249 | switch (send->s_wr.opcode) { | ||
250 | case IB_WR_SEND: | ||
251 | if (send->s_rm) | ||
252 | rds_iw_send_unmap_rm(ic, send, wc.status); | ||
253 | break; | ||
254 | case IB_WR_FAST_REG_MR: | ||
255 | case IB_WR_RDMA_WRITE: | ||
256 | case IB_WR_RDMA_READ: | ||
257 | case IB_WR_RDMA_READ_WITH_INV: | ||
258 | /* Nothing to be done - the SG list will be unmapped | ||
259 | * when the SEND completes. */ | ||
260 | break; | ||
261 | default: | ||
262 | if (printk_ratelimit()) | ||
263 | printk(KERN_NOTICE | ||
264 | "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", | ||
265 | __func__, send->s_wr.opcode); | ||
266 | break; | ||
267 | } | ||
268 | |||
269 | send->s_wr.opcode = 0xdead; | ||
270 | send->s_wr.num_sge = 1; | ||
271 | if (send->s_queued + HZ/2 < jiffies) | ||
272 | rds_iw_stats_inc(s_iw_tx_stalled); | ||
273 | |||
274 | /* If an RDMA operation produced an error, signal this right | ||
275 | * away. If we don't, the subsequent SEND that goes with this | ||
276 | * RDMA will be canceled with ERR_WFLUSH, and the application | ||
277 | * will never learn that the RDMA failed. */ | ||
278 | if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { | ||
279 | struct rds_message *rm; | ||
280 | |||
281 | rm = rds_send_get_message(conn, send->s_op); | ||
282 | if (rm) | ||
283 | rds_iw_send_rdma_complete(rm, wc.status); | ||
284 | } | ||
285 | |||
286 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; | ||
287 | } | ||
288 | |||
289 | rds_iw_ring_free(&ic->i_send_ring, completed); | ||
290 | |||
291 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) | ||
292 | || test_bit(0, &conn->c_map_queued)) | ||
293 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
294 | |||
295 | /* We expect errors as the qp is drained during shutdown */ | ||
296 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { | ||
297 | rds_iw_conn_error(conn, | ||
298 | "send completion on %pI4 " | ||
299 | "had status %u, disconnecting and reconnecting\n", | ||
300 | &conn->c_faddr, wc.status); | ||
301 | } | ||
302 | } | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * This is the main function for allocating credits when sending | ||
307 | * messages. | ||
308 | * | ||
309 | * Conceptually, we have two counters: | ||
310 | * - send credits: this tells us how many WRs we're allowed | ||
311 | * to submit without overrunning the receiver's queue. For | ||
312 | * each SEND WR we post, we decrement this by one. | ||
313 | * | ||
314 | * - posted credits: this tells us how many WRs we recently | ||
315 | * posted to the receive queue. This value is transferred | ||
316 | * to the peer as a "credit update" in a RDS header field. | ||
317 | * Every time we transmit credits to the peer, we subtract | ||
318 | * the amount of transferred credits from this counter. | ||
319 | * | ||
320 | * It is essential that we avoid situations where both sides have | ||
321 | * exhausted their send credits, and are unable to send new credits | ||
322 | * to the peer. We achieve this by requiring that we send at least | ||
323 | * one credit update to the peer before exhausting our credits. | ||
324 | * When new credits arrive, we subtract one credit that is withheld | ||
325 | * until we've posted new buffers and are ready to transmit these | ||
326 | * credits (see rds_iw_send_add_credits below). | ||
327 | * | ||
328 | * The RDS send code is essentially single-threaded; rds_send_xmit | ||
329 | * grabs c_send_lock to ensure exclusive access to the send ring. | ||
330 | * However, the ACK sending code is independent and can race with | ||
331 | * message SENDs. | ||
332 | * | ||
333 | * In the send path, we need to update the counters for send credits | ||
334 | * and the counter of posted buffers atomically - when we use the | ||
335 | * last available credit, we cannot allow another thread to race us | ||
336 | * and grab the posted credits counter. Hence, we have to use a | ||
337 | * spinlock to protect the credit counter, or use atomics. | ||
338 | * | ||
339 | * Spinlocks shared between the send and the receive path are bad, | ||
340 | * because they create unnecessary delays. An early implementation | ||
341 | * using a spinlock showed a 5% degradation in throughput at some | ||
342 | * loads. | ||
343 | * | ||
344 | * This implementation avoids spinlocks completely, putting both | ||
345 | * counters into a single atomic, and updating that atomic using | ||
346 | * atomic_add (in the receive path, when receiving fresh credits), | ||
347 | * and using atomic_cmpxchg when updating the two counters. | ||
348 | */ | ||
349 | int rds_iw_send_grab_credits(struct rds_iw_connection *ic, | ||
350 | u32 wanted, u32 *adv_credits, int need_posted) | ||
351 | { | ||
352 | unsigned int avail, posted, got = 0, advertise; | ||
353 | long oldval, newval; | ||
354 | |||
355 | *adv_credits = 0; | ||
356 | if (!ic->i_flowctl) | ||
357 | return wanted; | ||
358 | |||
359 | try_again: | ||
360 | advertise = 0; | ||
361 | oldval = newval = atomic_read(&ic->i_credits); | ||
362 | posted = IB_GET_POST_CREDITS(oldval); | ||
363 | avail = IB_GET_SEND_CREDITS(oldval); | ||
364 | |||
365 | rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n", | ||
366 | wanted, avail, posted); | ||
367 | |||
368 | /* The last credit must be used to send a credit update. */ | ||
369 | if (avail && !posted) | ||
370 | avail--; | ||
371 | |||
372 | if (avail < wanted) { | ||
373 | struct rds_connection *conn = ic->i_cm_id->context; | ||
374 | |||
375 | /* Oops, there aren't that many credits left! */ | ||
376 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | ||
377 | got = avail; | ||
378 | } else { | ||
379 | /* Sometimes you get what you want, lalala. */ | ||
380 | got = wanted; | ||
381 | } | ||
382 | newval -= IB_SET_SEND_CREDITS(got); | ||
383 | |||
384 | /* | ||
385 | * If need_posted is non-zero, then the caller wants the posted | ||
386 | * credits advertised regardless of whether any send credits are | ||
387 | * available. | ||
388 | */ | ||
389 | if (posted && (got || need_posted)) { | ||
390 | advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); | ||
391 | newval -= IB_SET_POST_CREDITS(advertise); | ||
392 | } | ||
393 | |||
394 | /* Finally bill everything */ | ||
395 | if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) | ||
396 | goto try_again; | ||
397 | |||
398 | *adv_credits = advertise; | ||
399 | return got; | ||
400 | } | ||
401 | |||
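The credit scheme described above rds_iw_send_grab_credits packs both counters into a single atomic and retries with compare-and-swap. The sketch below (not part of the patch) models just that retry loop in userspace C11; the SEND/POSTED/PACK macros are stand-ins for the IB_*_CREDITS helpers, and the rule that withholds the last credit for a credit update is omitted for brevity.

#include <stdio.h>
#include <stdatomic.h>

/* Assumed packing: send credits in the low 16 bits, posted credits in the
 * high 16 bits (the real layout lives in the IB_*_CREDITS macros). */
#define SEND(v)            ((v) & 0xffffu)
#define POSTED(v)          ((v) >> 16)
#define PACK(send, posted) (((posted) << 16) | (send))

static atomic_uint credits = PACK(4u, 3u);

/* Claim up to 'wanted' send credits and drain the posted counter, retrying
 * if another thread changed the packed value in between. */
static unsigned int grab(unsigned int wanted, unsigned int *advertise)
{
	unsigned int oldval, newval, got;

	do {
		oldval = atomic_load(&credits);
		got = SEND(oldval) < wanted ? SEND(oldval) : wanted;
		*advertise = POSTED(oldval);
		newval = PACK(SEND(oldval) - got, 0u);
	} while (!atomic_compare_exchange_weak(&credits, &oldval, newval));

	return got;
}

int main(void)
{
	unsigned int adv;
	unsigned int got = grab(6, &adv);

	/* Prints: got 4 send credits, advertising 3 posted buffers */
	printf("got %u send credits, advertising %u posted buffers\n", got, adv);
	return 0;
}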
402 | void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) | ||
403 | { | ||
404 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
405 | |||
406 | if (credits == 0) | ||
407 | return; | ||
408 | |||
409 | rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n", | ||
410 | credits, | ||
411 | IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), | ||
412 | test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); | ||
413 | |||
414 | atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); | ||
415 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) | ||
416 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
417 | |||
418 | WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); | ||
419 | |||
420 | rds_iw_stats_inc(s_iw_rx_credit_updates); | ||
421 | } | ||
422 | |||
423 | void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) | ||
424 | { | ||
425 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
426 | |||
427 | if (posted == 0) | ||
428 | return; | ||
429 | |||
430 | atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); | ||
431 | |||
432 | /* Decide whether to send an update to the peer now. | ||
433 | * If we would send a credit update for every single buffer we | ||
434 | * post, we would end up with an ACK storm (ACK arrives, | ||
435 | * consumes buffer, we refill the ring, send ACK to remote | ||
436 | * advertising the newly posted buffer... ad inf) | ||
437 | * | ||
438 | * Performance pretty much depends on how often we send | ||
439 | * credit updates - too frequent updates mean lots of ACKs. | ||
440 | * Too infrequent updates, and the peer will run out of | ||
441 | * credits and have to throttle. | ||
442 | * For the time being, 16 seems to be a good compromise. | ||
443 | */ | ||
444 | if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) | ||
445 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | ||
446 | } | ||
447 | |||
448 | static inline void | ||
449 | rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, | ||
450 | struct rds_iw_send_work *send, unsigned int pos, | ||
451 | unsigned long buffer, unsigned int length, | ||
452 | int send_flags) | ||
453 | { | ||
454 | struct ib_sge *sge; | ||
455 | |||
456 | WARN_ON(pos != send - ic->i_sends); | ||
457 | |||
458 | send->s_wr.send_flags = send_flags; | ||
459 | send->s_wr.opcode = IB_WR_SEND; | ||
460 | send->s_wr.num_sge = 2; | ||
461 | send->s_wr.next = NULL; | ||
462 | send->s_queued = jiffies; | ||
463 | send->s_op = NULL; | ||
464 | |||
465 | if (length != 0) { | ||
466 | sge = rds_iw_data_sge(ic, send->s_sge); | ||
467 | sge->addr = buffer; | ||
468 | sge->length = length; | ||
469 | sge->lkey = rds_iw_local_dma_lkey(ic); | ||
470 | |||
471 | sge = rds_iw_header_sge(ic, send->s_sge); | ||
472 | } else { | ||
473 | /* We're sending a packet with no payload. There is only | ||
474 | * one SGE */ | ||
475 | send->s_wr.num_sge = 1; | ||
476 | sge = &send->s_sge[0]; | ||
477 | } | ||
478 | |||
479 | sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); | ||
480 | sge->length = sizeof(struct rds_header); | ||
481 | sge->lkey = rds_iw_local_dma_lkey(ic); | ||
482 | } | ||
483 | |||
484 | /* | ||
485 | * This can be called multiple times for a given message. The first time | ||
486 | * we see a message we map its scatterlist into the IB device so that | ||
487 | * we can provide that mapped address to the IB scatter gather entries | ||
488 | * in the IB work requests. We translate the scatterlist into a series | ||
489 | * of work requests that fragment the message. These work requests complete | ||
490 | * in order so we pass ownership of the message to the completion handler | ||
491 | * once we send the final fragment. | ||
492 | * | ||
493 | * The RDS core uses the c_send_lock to only enter this function once | ||
494 | * per connection. This makes sure that the tx ring alloc/unalloc pairs | ||
495 | * don't get out of sync and confuse the ring. | ||
496 | */ | ||
497 | int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
498 | unsigned int hdr_off, unsigned int sg, unsigned int off) | ||
499 | { | ||
500 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
501 | struct ib_device *dev = ic->i_cm_id->device; | ||
502 | struct rds_iw_send_work *send = NULL; | ||
503 | struct rds_iw_send_work *first; | ||
504 | struct rds_iw_send_work *prev; | ||
505 | struct ib_send_wr *failed_wr; | ||
506 | struct scatterlist *scat; | ||
507 | u32 pos; | ||
508 | u32 i; | ||
509 | u32 work_alloc; | ||
510 | u32 credit_alloc; | ||
511 | u32 posted; | ||
512 | u32 adv_credits = 0; | ||
513 | int send_flags = 0; | ||
514 | int sent; | ||
515 | int ret; | ||
516 | int flow_controlled = 0; | ||
517 | |||
518 | BUG_ON(off % RDS_FRAG_SIZE); | ||
519 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); | ||
520 | |||
521 | /* Fastreg support */ | ||
522 | if (rds_rdma_cookie_key(rm->m_rdma_cookie) | ||
523 | && !ic->i_fastreg_posted) { | ||
524 | ret = -EAGAIN; | ||
525 | goto out; | ||
526 | } | ||
527 | |||
528 | /* FIXME we may overallocate here */ | ||
529 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) | ||
530 | i = 1; | ||
531 | else | ||
532 | i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); | ||
533 | |||
534 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); | ||
535 | if (work_alloc == 0) { | ||
536 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | ||
537 | rds_iw_stats_inc(s_iw_tx_ring_full); | ||
538 | ret = -ENOMEM; | ||
539 | goto out; | ||
540 | } | ||
541 | |||
542 | credit_alloc = work_alloc; | ||
543 | if (ic->i_flowctl) { | ||
544 | credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0); | ||
545 | adv_credits += posted; | ||
546 | if (credit_alloc < work_alloc) { | ||
547 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); | ||
548 | work_alloc = credit_alloc; | ||
549 | flow_controlled++; | ||
550 | } | ||
551 | if (work_alloc == 0) { | ||
552 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
553 | rds_iw_stats_inc(s_iw_tx_throttle); | ||
554 | ret = -ENOMEM; | ||
555 | goto out; | ||
556 | } | ||
557 | } | ||
558 | |||
559 | /* map the message the first time we see it */ | ||
560 | if (ic->i_rm == NULL) { | ||
561 | /* | ||
562 | printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", | ||
563 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | ||
564 | rm->m_inc.i_hdr.h_flags, | ||
565 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | ||
566 | */ | ||
567 | if (rm->m_nents) { | ||
568 | rm->m_count = ib_dma_map_sg(dev, | ||
569 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | ||
570 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | ||
571 | if (rm->m_count == 0) { | ||
572 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); | ||
573 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
574 | ret = -ENOMEM; /* XXX ? */ | ||
575 | goto out; | ||
576 | } | ||
577 | } else { | ||
578 | rm->m_count = 0; | ||
579 | } | ||
580 | |||
581 | ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; | ||
582 | ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; | ||
583 | rds_message_addref(rm); | ||
584 | ic->i_rm = rm; | ||
585 | |||
586 | /* Finalize the header */ | ||
587 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) | ||
588 | rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; | ||
589 | if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) | ||
590 | rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; | ||
591 | |||
592 | /* If it has an RDMA op, tell the peer we did it. This is | ||
593 | * used by the peer to release use-once RDMA MRs. */ | ||
594 | if (rm->m_rdma_op) { | ||
595 | struct rds_ext_header_rdma ext_hdr; | ||
596 | |||
597 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | ||
598 | rds_message_add_extension(&rm->m_inc.i_hdr, | ||
599 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | ||
600 | } | ||
601 | if (rm->m_rdma_cookie) { | ||
602 | rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, | ||
603 | rds_rdma_cookie_key(rm->m_rdma_cookie), | ||
604 | rds_rdma_cookie_offset(rm->m_rdma_cookie)); | ||
605 | } | ||
606 | |||
607 | /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so | ||
608 | * we should not do this unless we have a chance of at least | ||
609 | * sticking the header into the send ring. Which is why we | ||
610 | * should call rds_iw_ring_alloc first. */ | ||
611 | rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); | ||
612 | rds_message_make_checksum(&rm->m_inc.i_hdr); | ||
613 | |||
614 | /* | ||
615 | * Update adv_credits since we reset the ACK_REQUIRED bit. | ||
616 | */ | ||
617 | rds_iw_send_grab_credits(ic, 0, &posted, 1); | ||
618 | adv_credits += posted; | ||
619 | BUG_ON(adv_credits > 255); | ||
620 | } else if (ic->i_rm != rm) | ||
621 | BUG(); | ||
622 | |||
623 | send = &ic->i_sends[pos]; | ||
624 | first = send; | ||
625 | prev = NULL; | ||
626 | scat = &rm->m_sg[sg]; | ||
627 | sent = 0; | ||
628 | i = 0; | ||
629 | |||
630 | /* Sometimes you want to put a fence between an RDMA | ||
631 | * READ and the following SEND. | ||
632 | * We could either do this all the time | ||
633 | * or when requested by the user. Right now, we let | ||
634 | * the application choose. | ||
635 | */ | ||
636 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | ||
637 | send_flags = IB_SEND_FENCE; | ||
638 | |||
639 | /* | ||
640 | * We could be copying the header into the unused tail of the page. | ||
641 | * That would need to be changed in the future when those pages might | ||
642 | * be mapped userspace pages or page cache pages. So instead we always | ||
643 | * use a second sge and our long-lived ring of mapped headers. We send | ||
644 | * the header after the data so that the data payload can be aligned on | ||
645 | * the receiver. | ||
646 | */ | ||
647 | |||
648 | /* handle a 0-len message */ | ||
649 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { | ||
650 | rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); | ||
651 | goto add_header; | ||
652 | } | ||
653 | |||
654 | /* if there's data reference it with a chain of work reqs */ | ||
655 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | ||
656 | unsigned int len; | ||
657 | |||
658 | send = &ic->i_sends[pos]; | ||
659 | |||
660 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); | ||
661 | rds_iw_xmit_populate_wr(ic, send, pos, | ||
662 | ib_sg_dma_address(dev, scat) + off, len, | ||
663 | send_flags); | ||
664 | |||
665 | /* | ||
666 | * We want to delay signaling completions just enough to get | ||
667 | * the batching benefits but not so much that we create dead time | ||
668 | * on the wire. | ||
669 | */ | ||
670 | if (ic->i_unsignaled_wrs-- == 0) { | ||
671 | ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; | ||
672 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
673 | } | ||
674 | |||
675 | ic->i_unsignaled_bytes -= len; | ||
676 | if (ic->i_unsignaled_bytes <= 0) { | ||
677 | ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; | ||
678 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
679 | } | ||
680 | |||
681 | /* | ||
682 | * Always signal the last one if we're stopping due to flow control. | ||
683 | */ | ||
684 | if (flow_controlled && i == (work_alloc-1)) | ||
685 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
686 | |||
687 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | ||
688 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | ||
689 | |||
690 | sent += len; | ||
691 | off += len; | ||
692 | if (off == ib_sg_dma_len(dev, scat)) { | ||
693 | scat++; | ||
694 | off = 0; | ||
695 | } | ||
696 | |||
697 | add_header: | ||
698 | /* Tack on the header after the data. The header SGE should already | ||
699 | * have been set up to point to the right header buffer. */ | ||
700 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); | ||
701 | |||
702 | if (0) { | ||
703 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
704 | |||
705 | printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", | ||
706 | be16_to_cpu(hdr->h_dport), | ||
707 | hdr->h_flags, | ||
708 | be32_to_cpu(hdr->h_len)); | ||
709 | } | ||
710 | if (adv_credits) { | ||
711 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
712 | |||
713 | /* add credit and redo the header checksum */ | ||
714 | hdr->h_credit = adv_credits; | ||
715 | rds_message_make_checksum(hdr); | ||
716 | adv_credits = 0; | ||
717 | rds_iw_stats_inc(s_iw_tx_credit_updates); | ||
718 | } | ||
719 | |||
720 | if (prev) | ||
721 | prev->s_wr.next = &send->s_wr; | ||
722 | prev = send; | ||
723 | |||
724 | pos = (pos + 1) % ic->i_send_ring.w_nr; | ||
725 | } | ||
726 | |||
727 | /* Account the RDS header in the number of bytes we sent, but just once. | ||
728 | * The caller has no concept of fragmentation. */ | ||
729 | if (hdr_off == 0) | ||
730 | sent += sizeof(struct rds_header); | ||
731 | |||
732 | /* if we finished the message then send completion owns it */ | ||
733 | if (scat == &rm->m_sg[rm->m_count]) { | ||
734 | prev->s_rm = ic->i_rm; | ||
735 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
736 | ic->i_rm = NULL; | ||
737 | } | ||
738 | |||
739 | if (i < work_alloc) { | ||
740 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); | ||
741 | work_alloc = i; | ||
742 | } | ||
743 | if (ic->i_flowctl && i < credit_alloc) | ||
744 | rds_iw_send_add_credits(conn, credit_alloc - i); | ||
745 | |||
746 | /* XXX need to worry about failed_wr and partial sends. */ | ||
747 | failed_wr = &first->s_wr; | ||
748 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | ||
749 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | ||
750 | first, &first->s_wr, ret, failed_wr); | ||
751 | BUG_ON(failed_wr != &first->s_wr); | ||
752 | if (ret) { | ||
753 | printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " | ||
754 | "returned %d\n", &conn->c_faddr, ret); | ||
755 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
756 | if (prev->s_rm) { | ||
757 | ic->i_rm = prev->s_rm; | ||
758 | prev->s_rm = NULL; | ||
759 | } | ||
760 | goto out; | ||
761 | } | ||
762 | |||
763 | ret = sent; | ||
764 | out: | ||
765 | BUG_ON(adv_credits); | ||
766 | return ret; | ||
767 | } | ||
768 | |||
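A worked example of the ring and credit trimming at the top of rds_iw_xmit (not part of the patch; ceil_div stands in for the kernel's ceil() helper and RDS_FRAG_SIZE is assumed to be 4 KiB):

#include <stdio.h>

#define FRAG_SIZE 4096u   /* stands in for RDS_FRAG_SIZE (assumed 4 KiB) */

static unsigned int ceil_div(unsigned int len, unsigned int frag)
{
	return (len + frag - 1) / frag;
}

int main(void)
{
	unsigned int h_len = 10000;                        /* message payload bytes  */
	unsigned int wanted = ceil_div(h_len, FRAG_SIZE);  /* 3 work requests needed */
	unsigned int ring_free = 8;                        /* free send ring entries */
	unsigned int send_credits = 2;                     /* granted by the peer    */

	unsigned int work_alloc = wanted < ring_free ? wanted : ring_free;
	if (send_credits < work_alloc)
		work_alloc = send_credits;                 /* flow controlled: 2 WRs */

	/* Prints: post 2 of 3 fragments now, 1 left for a later call */
	printf("post %u of %u fragments now, %u left for a later call\n",
	       work_alloc, wanted, wanted - work_alloc);
	return 0;
}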
769 | static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr) | ||
770 | { | ||
771 | BUG_ON(nent > send->s_page_list->max_page_list_len); | ||
772 | /* | ||
773 | * Perform a WR for the fast_reg_mr. Each individual page | ||
774 | * in the sg list is added to the fast reg page list and placed | ||
775 | * inside the fast_reg_mr WR. | ||
776 | */ | ||
777 | send->s_wr.opcode = IB_WR_FAST_REG_MR; | ||
778 | send->s_wr.wr.fast_reg.length = len; | ||
779 | send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey; | ||
780 | send->s_wr.wr.fast_reg.page_list = send->s_page_list; | ||
781 | send->s_wr.wr.fast_reg.page_list_len = nent; | ||
782 | send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift; | ||
783 | send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE; | ||
784 | send->s_wr.wr.fast_reg.iova_start = sg_addr; | ||
785 | |||
786 | ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); | ||
787 | } | ||
788 | |||
789 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | ||
790 | { | ||
791 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
792 | struct rds_iw_send_work *send = NULL; | ||
793 | struct rds_iw_send_work *first; | ||
794 | struct rds_iw_send_work *prev; | ||
795 | struct ib_send_wr *failed_wr; | ||
796 | struct rds_iw_device *rds_iwdev; | ||
797 | struct scatterlist *scat; | ||
798 | unsigned long len; | ||
799 | u64 remote_addr = op->r_remote_addr; | ||
800 | u32 pos, fr_pos; | ||
801 | u32 work_alloc; | ||
802 | u32 i; | ||
803 | u32 j; | ||
804 | int sent; | ||
805 | int ret; | ||
806 | int num_sge; | ||
807 | |||
808 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); | ||
809 | |||
810 | /* map the message the first time we see it */ | ||
811 | if (!op->r_mapped) { | ||
812 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | ||
813 | op->r_sg, op->r_nents, (op->r_write) ? | ||
814 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | ||
815 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | ||
816 | if (op->r_count == 0) { | ||
817 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); | ||
818 | ret = -ENOMEM; /* XXX ? */ | ||
819 | goto out; | ||
820 | } | ||
821 | |||
822 | op->r_mapped = 1; | ||
823 | } | ||
824 | |||
825 | if (!op->r_write) { | ||
826 | /* Alloc space on the send queue for the fastreg */ | ||
827 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); | ||
828 | if (work_alloc != 1) { | ||
829 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
830 | rds_iw_stats_inc(s_iw_tx_ring_full); | ||
831 | ret = -ENOMEM; | ||
832 | goto out; | ||
833 | } | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Instead of knowing how to return a partial rdma read/write we insist that there | ||
838 | * be enough work requests to send the entire message. | ||
839 | */ | ||
840 | i = ceil(op->r_count, rds_iwdev->max_sge); | ||
841 | |||
842 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); | ||
843 | if (work_alloc != i) { | ||
844 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
845 | rds_iw_stats_inc(s_iw_tx_ring_full); | ||
846 | ret = -ENOMEM; | ||
847 | goto out; | ||
848 | } | ||
849 | |||
850 | send = &ic->i_sends[pos]; | ||
851 | if (!op->r_write) { | ||
852 | first = prev = &ic->i_sends[fr_pos]; | ||
853 | } else { | ||
854 | first = send; | ||
855 | prev = NULL; | ||
856 | } | ||
857 | scat = &op->r_sg[0]; | ||
858 | sent = 0; | ||
859 | num_sge = op->r_count; | ||
860 | |||
861 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | ||
862 | send->s_wr.send_flags = 0; | ||
863 | send->s_queued = jiffies; | ||
864 | |||
865 | /* | ||
866 | * We want to delay signaling completions just enough to get | ||
867 | * the batching benefits but not so much that we create dead time on the wire. | ||
868 | */ | ||
869 | if (ic->i_unsignaled_wrs-- == 0) { | ||
870 | ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; | ||
871 | send->s_wr.send_flags = IB_SEND_SIGNALED; | ||
872 | } | ||
873 | |||
874 | /* To avoid needing the plumbing to invalidate the fastreg_mr used | ||
875 | * for local access after RDS is finished with it, we use | ||
876 | * IB_WR_RDMA_READ_WITH_INV, which invalidates it after the read has completed. | ||
877 | */ | ||
878 | if (op->r_write) | ||
879 | send->s_wr.opcode = IB_WR_RDMA_WRITE; | ||
880 | else | ||
881 | send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; | ||
882 | |||
883 | send->s_wr.wr.rdma.remote_addr = remote_addr; | ||
884 | send->s_wr.wr.rdma.rkey = op->r_key; | ||
885 | send->s_op = op; | ||
886 | |||
887 | if (num_sge > rds_iwdev->max_sge) { | ||
888 | send->s_wr.num_sge = rds_iwdev->max_sge; | ||
889 | num_sge -= rds_iwdev->max_sge; | ||
890 | } else | ||
891 | send->s_wr.num_sge = num_sge; | ||
892 | |||
893 | send->s_wr.next = NULL; | ||
894 | |||
895 | if (prev) | ||
896 | prev->s_wr.next = &send->s_wr; | ||
897 | |||
898 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | ||
899 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | ||
900 | |||
901 | if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) | ||
902 | send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat); | ||
903 | else { | ||
904 | send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); | ||
905 | send->s_sge[j].length = len; | ||
906 | send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); | ||
907 | } | ||
908 | |||
909 | sent += len; | ||
910 | rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); | ||
911 | remote_addr += len; | ||
912 | |||
913 | scat++; | ||
914 | } | ||
915 | |||
916 | if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) { | ||
917 | send->s_wr.num_sge = 1; | ||
918 | send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; | ||
919 | send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; | ||
920 | send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; | ||
921 | } | ||
922 | |||
923 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | ||
924 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | ||
925 | |||
926 | prev = send; | ||
927 | if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) | ||
928 | send = ic->i_sends; | ||
929 | } | ||
930 | |||
931 | /* if we finished the message then send completion owns it */ | ||
932 | if (scat == &op->r_sg[op->r_count]) | ||
933 | first->s_wr.send_flags = IB_SEND_SIGNALED; | ||
934 | |||
935 | if (i < work_alloc) { | ||
936 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); | ||
937 | work_alloc = i; | ||
938 | } | ||
939 | |||
940 | /* On iWARP, local memory access by a remote system (i.e., RDMA Read) is not | ||
941 | * recommended. Putting the lkey on the wire is a security hole, as it can | ||
942 | * allow access to all of memory on the remote system. Some | ||
943 | * adapters do not allow using the lkey for this at all. To bypass this, use a | ||
944 | * fastreg_mr (or possibly a dma_mr). | ||
945 | */ | ||
946 | if (!op->r_write) { | ||
947 | rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], | ||
948 | op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); | ||
949 | work_alloc++; | ||
950 | } | ||
951 | |||
952 | failed_wr = &first->s_wr; | ||
953 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | ||
954 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | ||
955 | first, &first->s_wr, ret, failed_wr); | ||
956 | BUG_ON(failed_wr != &first->s_wr); | ||
957 | if (ret) { | ||
958 | printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " | ||
959 | "returned %d\n", &conn->c_faddr, ret); | ||
960 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
961 | goto out; | ||
962 | } | ||
963 | |||
964 | out: | ||
965 | return ret; | ||
966 | } | ||
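Two pieces of arithmetic in rds_iw_xmit_rdma() are easy to miss: the number of work requests is the mapped scatterlist count rounded up by the device's max_sge (the ceil() round-up helper, presumably defined in rds.h), and a completion is only signaled once every rds_iw_sysctl_max_unsig_wrs requests. A minimal user-space sketch of both, with made-up numbers (DIV_ROUND_UP_DEMO and all values here are illustrative, not part of the patch):

#include <stdio.h>

/* round-up division, mirroring what RDS's ceil() helper is assumed to do */
#define DIV_ROUND_UP_DEMO(x, y) (((x) + (y) - 1) / (y))

int main(void)
{
        unsigned int r_count = 19;   /* mapped scatterlist entries (made up) */
        unsigned int max_sge = 8;    /* device SGE limit (made up) */
        unsigned int wrs = DIV_ROUND_UP_DEMO(r_count, max_sge); /* -> 3 */
        unsigned int unsignaled = 4; /* stands in for rds_iw_sysctl_max_unsig_wrs */
        unsigned int budget = unsignaled;
        unsigned int i;

        printf("%u sg entries / %u max_sge -> %u work requests\n",
               r_count, max_sge, wrs);

        for (i = 0; i < wrs; i++) {
                int signaled = 0;

                /* mirrors: if (ic->i_unsignaled_wrs-- == 0) { reset; signal; } */
                if (budget-- == 0) {
                        budget = unsignaled;
                        signaled = 1;
                }
                printf("wr %u signaled=%d\n", i, signaled);
        }
        return 0;
}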
967 | |||
968 | void rds_iw_xmit_complete(struct rds_connection *conn) | ||
969 | { | ||
970 | struct rds_iw_connection *ic = conn->c_transport_data; | ||
971 | |||
972 | /* We may have a pending ACK or window update we were unable | ||
973 | * to send previously (due to flow control). Try again. */ | ||
974 | rds_iw_attempt_ack(ic); | ||
975 | } | ||
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c new file mode 100644 index 000000000000..ccc7e8f0bf0e --- /dev/null +++ b/net/rds/iw_stats.c | |||
@@ -0,0 +1,95 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/percpu.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "iw.h" | ||
39 | |||
40 | DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; | ||
41 | |||
42 | static char *rds_iw_stat_names[] = { | ||
43 | "iw_connect_raced", | ||
44 | "iw_listen_closed_stale", | ||
45 | "iw_tx_cq_call", | ||
46 | "iw_tx_cq_event", | ||
47 | "iw_tx_ring_full", | ||
48 | "iw_tx_throttle", | ||
49 | "iw_tx_sg_mapping_failure", | ||
50 | "iw_tx_stalled", | ||
51 | "iw_tx_credit_updates", | ||
52 | "iw_rx_cq_call", | ||
53 | "iw_rx_cq_event", | ||
54 | "iw_rx_ring_empty", | ||
55 | "iw_rx_refill_from_cq", | ||
56 | "iw_rx_refill_from_thread", | ||
57 | "iw_rx_alloc_limit", | ||
58 | "iw_rx_credit_updates", | ||
59 | "iw_ack_sent", | ||
60 | "iw_ack_send_failure", | ||
61 | "iw_ack_send_delayed", | ||
62 | "iw_ack_send_piggybacked", | ||
63 | "iw_ack_received", | ||
64 | "iw_rdma_mr_alloc", | ||
65 | "iw_rdma_mr_free", | ||
66 | "iw_rdma_mr_used", | ||
67 | "iw_rdma_mr_pool_flush", | ||
68 | "iw_rdma_mr_pool_wait", | ||
69 | "iw_rdma_mr_pool_depleted", | ||
70 | }; | ||
71 | |||
72 | unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, | ||
73 | unsigned int avail) | ||
74 | { | ||
75 | struct rds_iw_statistics stats = {0, }; | ||
76 | uint64_t *src; | ||
77 | uint64_t *sum; | ||
78 | size_t i; | ||
79 | int cpu; | ||
80 | |||
81 | if (avail < ARRAY_SIZE(rds_iw_stat_names)) | ||
82 | goto out; | ||
83 | |||
84 | for_each_online_cpu(cpu) { | ||
85 | src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); | ||
86 | sum = (uint64_t *)&stats; | ||
87 | for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) | ||
88 | *(sum++) += *(src++); | ||
89 | } | ||
90 | |||
91 | rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, | ||
92 | ARRAY_SIZE(rds_iw_stat_names)); | ||
93 | out: | ||
94 | return ARRAY_SIZE(rds_iw_stat_names); | ||
95 | } | ||
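The folding loop in rds_iw_stats_info_copy() works because struct rds_iw_statistics is, by construction, nothing but consecutive uint64_t counters, so it can be walked as a flat array. A user-space analogue of the same pattern, using a hypothetical two-counter struct:

#include <stdint.h>
#include <stdio.h>

struct demo_stats { uint64_t tx; uint64_t rx; };   /* hypothetical counters */

int main(void)
{
        struct demo_stats percpu[2] = { { 3, 5 }, { 7, 11 } };  /* "per-cpu" copies */
        struct demo_stats sum = { 0, 0 };
        size_t i, cpu;

        for (cpu = 0; cpu < 2; cpu++) {
                uint64_t *src = (uint64_t *)&percpu[cpu];
                uint64_t *dst = (uint64_t *)&sum;

                /* treat the struct as an array of u64s and accumulate */
                for (i = 0; i < sizeof(sum) / sizeof(uint64_t); i++)
                        dst[i] += src[i];
        }
        printf("tx=%llu rx=%llu\n",
               (unsigned long long)sum.tx, (unsigned long long)sum.rx);
        return 0;
}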
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c new file mode 100644 index 000000000000..9590678cd616 --- /dev/null +++ b/net/rds/iw_sysctl.c | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/sysctl.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "iw.h" | ||
38 | |||
39 | static struct ctl_table_header *rds_iw_sysctl_hdr; | ||
40 | |||
41 | unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; | ||
42 | unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; | ||
43 | unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; | ||
44 | static unsigned long rds_iw_sysctl_max_wr_min = 1; | ||
45 | /* hardware will fail CQ creation long before this */ | ||
46 | static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; | ||
47 | |||
48 | unsigned long rds_iw_sysctl_max_unsig_wrs = 16; | ||
49 | static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; | ||
50 | static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; | ||
51 | |||
52 | unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); | ||
53 | static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; | ||
54 | static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; | ||
55 | |||
56 | unsigned int rds_iw_sysctl_flow_control = 1; | ||
57 | |||
58 | ctl_table rds_iw_sysctl_table[] = { | ||
59 | { | ||
60 | .ctl_name = CTL_UNNUMBERED, | ||
61 | .procname = "max_send_wr", | ||
62 | .data = &rds_iw_sysctl_max_send_wr, | ||
63 | .maxlen = sizeof(unsigned long), | ||
64 | .mode = 0644, | ||
65 | .proc_handler = &proc_doulongvec_minmax, | ||
66 | .extra1 = &rds_iw_sysctl_max_wr_min, | ||
67 | .extra2 = &rds_iw_sysctl_max_wr_max, | ||
68 | }, | ||
69 | { | ||
70 | .ctl_name = CTL_UNNUMBERED, | ||
71 | .procname = "max_recv_wr", | ||
72 | .data = &rds_iw_sysctl_max_recv_wr, | ||
73 | .maxlen = sizeof(unsigned long), | ||
74 | .mode = 0644, | ||
75 | .proc_handler = &proc_doulongvec_minmax, | ||
76 | .extra1 = &rds_iw_sysctl_max_wr_min, | ||
77 | .extra2 = &rds_iw_sysctl_max_wr_max, | ||
78 | }, | ||
79 | { | ||
80 | .ctl_name = CTL_UNNUMBERED, | ||
81 | .procname = "max_unsignaled_wr", | ||
82 | .data = &rds_iw_sysctl_max_unsig_wrs, | ||
83 | .maxlen = sizeof(unsigned long), | ||
84 | .mode = 0644, | ||
85 | .proc_handler = &proc_doulongvec_minmax, | ||
86 | .extra1 = &rds_iw_sysctl_max_unsig_wr_min, | ||
87 | .extra2 = &rds_iw_sysctl_max_unsig_wr_max, | ||
88 | }, | ||
89 | { | ||
90 | .ctl_name = CTL_UNNUMBERED, | ||
91 | .procname = "max_unsignaled_bytes", | ||
92 | .data = &rds_iw_sysctl_max_unsig_bytes, | ||
93 | .maxlen = sizeof(unsigned long), | ||
94 | .mode = 0644, | ||
95 | .proc_handler = &proc_doulongvec_minmax, | ||
96 | .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, | ||
97 | .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, | ||
98 | }, | ||
99 | { | ||
100 | .ctl_name = CTL_UNNUMBERED, | ||
101 | .procname = "max_recv_allocation", | ||
102 | .data = &rds_iw_sysctl_max_recv_allocation, | ||
103 | .maxlen = sizeof(unsigned long), | ||
104 | .mode = 0644, | ||
105 | .proc_handler = &proc_doulongvec_minmax, | ||
106 | }, | ||
107 | { | ||
108 | .ctl_name = CTL_UNNUMBERED, | ||
109 | .procname = "flow_control", | ||
110 | .data = &rds_iw_sysctl_flow_control, | ||
111 | .maxlen = sizeof(rds_iw_sysctl_flow_control), | ||
112 | .mode = 0644, | ||
113 | .proc_handler = &proc_dointvec, | ||
114 | }, | ||
115 | { .ctl_name = 0} | ||
116 | }; | ||
117 | |||
118 | static struct ctl_path rds_iw_sysctl_path[] = { | ||
119 | { .procname = "net", .ctl_name = CTL_NET, }, | ||
120 | { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, | ||
121 | { .procname = "iw", .ctl_name = CTL_UNNUMBERED, }, | ||
122 | { } | ||
123 | }; | ||
124 | |||
125 | void rds_iw_sysctl_exit(void) | ||
126 | { | ||
127 | if (rds_iw_sysctl_hdr) | ||
128 | unregister_sysctl_table(rds_iw_sysctl_hdr); | ||
129 | } | ||
130 | |||
131 | int __init rds_iw_sysctl_init(void) | ||
132 | { | ||
133 | rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); | ||
134 | if (rds_iw_sysctl_hdr == NULL) | ||
135 | return -ENOMEM; | ||
136 | return 0; | ||
137 | } | ||
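Given the ctl_path above ("net", "rds", "iw"), these tunables should appear under /proc/sys/net/rds/iw/. A minimal sketch that reads one of them back, assuming the RDS module is loaded and procfs is mounted in the usual place:

#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/proc/sys/net/rds/iw/max_unsignaled_wr", "r");

        if (!f) {
                perror("open sysctl");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("max_unsignaled_wr = %s", buf);
        fclose(f);
        return 0;
}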
diff --git a/net/rds/loop.c b/net/rds/loop.c new file mode 100644 index 000000000000..4a61997f554d --- /dev/null +++ b/net/rds/loop.c | |||
@@ -0,0 +1,188 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/in.h> | ||
35 | |||
36 | #include "rds.h" | ||
37 | #include "loop.h" | ||
38 | |||
39 | static DEFINE_SPINLOCK(loop_conns_lock); | ||
40 | static LIST_HEAD(loop_conns); | ||
41 | |||
42 | /* | ||
43 | * This 'loopback' transport is a special case for flows that originate | ||
44 | * and terminate on the same machine. | ||
45 | * | ||
46 | * Connection build-up notices if the destination address is thought of | ||
47 | * as a local address by a transport. At that time it decides to use the | ||
48 | * loopback transport instead of the bound transport of the sending socket. | ||
49 | * | ||
50 | * The loopback transport's sending path just hands the sent rds_message | ||
51 | * straight to the receiving path via an embedded rds_incoming. | ||
52 | */ | ||
53 | |||
54 | /* | ||
55 | * Usually a message transits both the sender and receiver's conns as it | ||
56 | * flows to the receiver. In the loopback case, though, the receive path | ||
57 | * is handed the sending conn so the sense of the addresses is reversed. | ||
58 | */ | ||
59 | static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, | ||
60 | unsigned int hdr_off, unsigned int sg, | ||
61 | unsigned int off) | ||
62 | { | ||
63 | BUG_ON(hdr_off || sg || off); | ||
64 | |||
65 | rds_inc_init(&rm->m_inc, conn, conn->c_laddr); | ||
66 | rds_message_addref(rm); /* for the inc */ | ||
67 | |||
68 | rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, | ||
69 | GFP_KERNEL, KM_USER0); | ||
70 | |||
71 | rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), | ||
72 | NULL); | ||
73 | |||
74 | rds_inc_put(&rm->m_inc); | ||
75 | |||
76 | return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); | ||
77 | } | ||
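A user-space sketch of the case the comments above describe: an RDS socket sending to its own bound address, which ends up on the loopback transport. This is illustrative only; 192.0.2.1 is a placeholder for a local address actually owned by an RDS transport, port 4000 is arbitrary, and AF_RDS is defined here only in case the installed headers predate it (the value 21 is an assumption matching the kernel's definition).

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>

#ifndef AF_RDS
#define AF_RDS 21       /* assumption: matches the kernel's AF_RDS */
#endif

int main(void)
{
        struct sockaddr_in addr;
        const char msg[] = "hello, self";
        int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);

        if (fd < 0) {
                perror("socket(AF_RDS)");
                return 1;
        }

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = inet_addr("192.0.2.1");  /* placeholder local IP */
        addr.sin_port = htons(4000);

        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
                perror("bind");
                return 1;
        }

        /* destination == our own address, so the loopback transport carries it */
        if (sendto(fd, msg, sizeof(msg), 0,
                   (struct sockaddr *)&addr, sizeof(addr)) < 0)
                perror("sendto");
        return 0;
}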
78 | |||
79 | static int rds_loop_xmit_cong_map(struct rds_connection *conn, | ||
80 | struct rds_cong_map *map, | ||
81 | unsigned long offset) | ||
82 | { | ||
83 | unsigned long i; | ||
84 | |||
85 | BUG_ON(offset); | ||
86 | BUG_ON(map != conn->c_lcong); | ||
87 | |||
88 | for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { | ||
89 | memcpy((void *)conn->c_fcong->m_page_addrs[i], | ||
90 | (void *)map->m_page_addrs[i], PAGE_SIZE); | ||
91 | } | ||
92 | |||
93 | rds_cong_map_updated(conn->c_fcong, ~(u64) 0); | ||
94 | |||
95 | return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; | ||
96 | } | ||
97 | |||
98 | /* we need to give the worker thread something that at least succeeds */ | ||
99 | static int rds_loop_recv(struct rds_connection *conn) | ||
100 | { | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | struct rds_loop_connection { | ||
105 | struct list_head loop_node; | ||
106 | struct rds_connection *conn; | ||
107 | }; | ||
108 | |||
109 | /* | ||
110 | * Even the loopback transport needs to keep track of its connections, | ||
111 | * so it can call rds_conn_destroy() on them on exit. N.B. there are | ||
112 | * 1+ loopback addresses (127.*.*.*) so it's not a bug to have | ||
113 | * multiple loopback conns allocated, although rather useless. | ||
114 | */ | ||
115 | static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) | ||
116 | { | ||
117 | struct rds_loop_connection *lc; | ||
118 | unsigned long flags; | ||
119 | |||
120 | lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); | ||
121 | if (lc == NULL) | ||
122 | return -ENOMEM; | ||
123 | |||
124 | INIT_LIST_HEAD(&lc->loop_node); | ||
125 | lc->conn = conn; | ||
126 | conn->c_transport_data = lc; | ||
127 | |||
128 | spin_lock_irqsave(&loop_conns_lock, flags); | ||
129 | list_add_tail(&lc->loop_node, &loop_conns); | ||
130 | spin_unlock_irqrestore(&loop_conns_lock, flags); | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | static void rds_loop_conn_free(void *arg) | ||
136 | { | ||
137 | struct rds_loop_connection *lc = arg; | ||
138 | rdsdebug("lc %p\n", lc); | ||
139 | list_del(&lc->loop_node); | ||
140 | kfree(lc); | ||
141 | } | ||
142 | |||
143 | static int rds_loop_conn_connect(struct rds_connection *conn) | ||
144 | { | ||
145 | rds_connect_complete(conn); | ||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | static void rds_loop_conn_shutdown(struct rds_connection *conn) | ||
150 | { | ||
151 | } | ||
152 | |||
153 | void rds_loop_exit(void) | ||
154 | { | ||
155 | struct rds_loop_connection *lc, *_lc; | ||
156 | LIST_HEAD(tmp_list); | ||
157 | |||
158 | /* avoid calling conn_destroy with irqs off */ | ||
159 | spin_lock_irq(&loop_conns_lock); | ||
160 | list_splice(&loop_conns, &tmp_list); | ||
161 | INIT_LIST_HEAD(&loop_conns); | ||
162 | spin_unlock_irq(&loop_conns_lock); | ||
163 | |||
164 | list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { | ||
165 | WARN_ON(lc->conn->c_passive); | ||
166 | rds_conn_destroy(lc->conn); | ||
167 | } | ||
168 | } | ||
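rds_loop_exit() uses a common teardown shape: splice the shared list onto a private one while holding the lock, then destroy the entries with interrupts enabled. A sketch reduced to that shape, in the context of this file's types (demo_drain_loop_conns is a hypothetical helper, not part of the patch):

static void demo_drain_loop_conns(struct list_head *shared, spinlock_t *lock)
{
        LIST_HEAD(tmp);
        struct rds_loop_connection *lc, *next;

        /* take everything while holding the lock... */
        spin_lock_irq(lock);
        list_splice_init(shared, &tmp);
        spin_unlock_irq(lock);

        /* ...then tear it down without the lock held or irqs disabled */
        list_for_each_entry_safe(lc, next, &tmp, loop_node)
                rds_conn_destroy(lc->conn);
}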
169 | |||
170 | /* | ||
171 | * This is missing .xmit_* because loop doesn't go through generic | ||
172 | * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and | ||
173 | * .laddr_check are missing because transport.c doesn't iterate over | ||
174 | * rds_loop_transport. | ||
175 | */ | ||
176 | struct rds_transport rds_loop_transport = { | ||
177 | .xmit = rds_loop_xmit, | ||
178 | .xmit_cong_map = rds_loop_xmit_cong_map, | ||
179 | .recv = rds_loop_recv, | ||
180 | .conn_alloc = rds_loop_conn_alloc, | ||
181 | .conn_free = rds_loop_conn_free, | ||
182 | .conn_connect = rds_loop_conn_connect, | ||
183 | .conn_shutdown = rds_loop_conn_shutdown, | ||
184 | .inc_copy_to_user = rds_message_inc_copy_to_user, | ||
185 | .inc_purge = rds_message_inc_purge, | ||
186 | .inc_free = rds_message_inc_free, | ||
187 | .t_name = "loopback", | ||
188 | }; | ||
diff --git a/net/rds/loop.h b/net/rds/loop.h new file mode 100644 index 000000000000..f32b0939a04d --- /dev/null +++ b/net/rds/loop.h | |||
@@ -0,0 +1,9 @@ | |||
1 | #ifndef _RDS_LOOP_H | ||
2 | #define _RDS_LOOP_H | ||
3 | |||
4 | /* loop.c */ | ||
5 | extern struct rds_transport rds_loop_transport; | ||
6 | |||
7 | void rds_loop_exit(void); | ||
8 | |||
9 | #endif | ||
diff --git a/net/rds/message.c b/net/rds/message.c new file mode 100644 index 000000000000..5a15dc8d0cd7 --- /dev/null +++ b/net/rds/message.c | |||
@@ -0,0 +1,402 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | #include "rdma.h" | ||
37 | |||
38 | static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); | ||
39 | |||
40 | static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { | ||
41 | [RDS_EXTHDR_NONE] = 0, | ||
42 | [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), | ||
43 | [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), | ||
44 | [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), | ||
45 | }; | ||
46 | |||
47 | |||
48 | void rds_message_addref(struct rds_message *rm) | ||
49 | { | ||
50 | rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); | ||
51 | atomic_inc(&rm->m_refcount); | ||
52 | } | ||
53 | |||
54 | /* | ||
55 | * This relies on dma_map_sg() not touching sg[].page during merging. | ||
56 | */ | ||
57 | static void rds_message_purge(struct rds_message *rm) | ||
58 | { | ||
59 | unsigned long i; | ||
60 | |||
61 | if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) | ||
62 | return; | ||
63 | |||
64 | for (i = 0; i < rm->m_nents; i++) { | ||
65 | rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); | ||
66 | /* XXX will have to put_page for page refs */ | ||
67 | __free_page(sg_page(&rm->m_sg[i])); | ||
68 | } | ||
69 | rm->m_nents = 0; | ||
70 | |||
71 | if (rm->m_rdma_op) | ||
72 | rds_rdma_free_op(rm->m_rdma_op); | ||
73 | if (rm->m_rdma_mr) | ||
74 | rds_mr_put(rm->m_rdma_mr); | ||
75 | } | ||
76 | |||
77 | void rds_message_inc_purge(struct rds_incoming *inc) | ||
78 | { | ||
79 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | ||
80 | rds_message_purge(rm); | ||
81 | } | ||
82 | |||
83 | void rds_message_put(struct rds_message *rm) | ||
84 | { | ||
85 | rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); | ||
86 | |||
87 | if (atomic_dec_and_test(&rm->m_refcount)) { | ||
88 | BUG_ON(!list_empty(&rm->m_sock_item)); | ||
89 | BUG_ON(!list_empty(&rm->m_conn_item)); | ||
90 | rds_message_purge(rm); | ||
91 | |||
92 | kfree(rm); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | void rds_message_inc_free(struct rds_incoming *inc) | ||
97 | { | ||
98 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | ||
99 | rds_message_put(rm); | ||
100 | } | ||
101 | |||
102 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, | ||
103 | __be16 dport, u64 seq) | ||
104 | { | ||
105 | hdr->h_flags = 0; | ||
106 | hdr->h_sport = sport; | ||
107 | hdr->h_dport = dport; | ||
108 | hdr->h_sequence = cpu_to_be64(seq); | ||
109 | hdr->h_exthdr[0] = RDS_EXTHDR_NONE; | ||
110 | } | ||
111 | |||
112 | int rds_message_add_extension(struct rds_header *hdr, | ||
113 | unsigned int type, const void *data, unsigned int len) | ||
114 | { | ||
115 | unsigned int ext_len = sizeof(u8) + len; | ||
116 | unsigned char *dst; | ||
117 | |||
118 | /* For now, refuse to add more than one extension header */ | ||
119 | if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) | ||
120 | return 0; | ||
121 | |||
122 | if (type >= __RDS_EXTHDR_MAX | ||
123 | || len != rds_exthdr_size[type]) | ||
124 | return 0; | ||
125 | |||
126 | if (ext_len >= RDS_HEADER_EXT_SPACE) | ||
127 | return 0; | ||
128 | dst = hdr->h_exthdr; | ||
129 | |||
130 | *dst++ = type; | ||
131 | memcpy(dst, data, len); | ||
132 | |||
133 | dst[len] = RDS_EXTHDR_NONE; | ||
134 | return 1; | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * If a message has extension headers, retrieve them here. | ||
139 | * Call like this: | ||
140 | * | ||
141 | * unsigned int pos = 0; | ||
142 | * | ||
143 | * while (1) { | ||
144 | * buflen = sizeof(buffer); | ||
145 | * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); | ||
146 | * if (type == RDS_EXTHDR_NONE) | ||
147 | * break; | ||
148 | * ... | ||
149 | * } | ||
150 | */ | ||
151 | int rds_message_next_extension(struct rds_header *hdr, | ||
152 | unsigned int *pos, void *buf, unsigned int *buflen) | ||
153 | { | ||
154 | unsigned int offset, ext_type, ext_len; | ||
155 | u8 *src = hdr->h_exthdr; | ||
156 | |||
157 | offset = *pos; | ||
158 | if (offset >= RDS_HEADER_EXT_SPACE) | ||
159 | goto none; | ||
160 | |||
161 | /* Get the extension type and length. For now, the | ||
162 | * length is implied by the extension type. */ | ||
163 | ext_type = src[offset++]; | ||
164 | |||
165 | if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) | ||
166 | goto none; | ||
167 | ext_len = rds_exthdr_size[ext_type]; | ||
168 | if (offset + ext_len > RDS_HEADER_EXT_SPACE) | ||
169 | goto none; | ||
170 | |||
171 | *pos = offset + ext_len; | ||
172 | if (ext_len < *buflen) | ||
173 | *buflen = ext_len; | ||
174 | memcpy(buf, src + offset, *buflen); | ||
175 | return ext_type; | ||
176 | |||
177 | none: | ||
178 | *pos = RDS_HEADER_EXT_SPACE; | ||
179 | *buflen = 0; | ||
180 | return RDS_EXTHDR_NONE; | ||
181 | } | ||
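The usage comment above, spelled out as a complete walker. This sketch assumes the context of this file (rds.h included), a received struct rds_header, and only the extension header types already declared there (demo_walk_exthdrs itself is hypothetical):

static void demo_walk_exthdrs(struct rds_header *hdr)
{
        union {
                struct rds_ext_header_version version;
                struct rds_ext_header_rdma rdma;
                struct rds_ext_header_rdma_dest rdma_dest;
        } buffer;
        unsigned int pos = 0;
        unsigned int buflen;
        int type;

        while (1) {
                buflen = sizeof(buffer);
                type = rds_message_next_extension(hdr, &pos, &buffer, &buflen);
                if (type == RDS_EXTHDR_NONE)
                        break;
                rdsdebug("extension type %d, %u bytes\n", type, buflen);
        }
}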
182 | |||
183 | int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) | ||
184 | { | ||
185 | struct rds_ext_header_version ext_hdr; | ||
186 | |||
187 | ext_hdr.h_version = cpu_to_be32(version); | ||
188 | return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); | ||
189 | } | ||
190 | |||
191 | int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) | ||
192 | { | ||
193 | struct rds_ext_header_version ext_hdr; | ||
194 | unsigned int pos = 0, len = sizeof(ext_hdr); | ||
195 | |||
196 | /* We assume the version extension is the only one present */ | ||
197 | if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) | ||
198 | return 0; | ||
199 | *version = be32_to_cpu(ext_hdr.h_version); | ||
200 | return 1; | ||
201 | } | ||
202 | |||
203 | int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) | ||
204 | { | ||
205 | struct rds_ext_header_rdma_dest ext_hdr; | ||
206 | |||
207 | ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); | ||
208 | ext_hdr.h_rdma_offset = cpu_to_be32(offset); | ||
209 | return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); | ||
210 | } | ||
211 | |||
212 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) | ||
213 | { | ||
214 | struct rds_message *rm; | ||
215 | |||
216 | rm = kzalloc(sizeof(struct rds_message) + | ||
217 | (nents * sizeof(struct scatterlist)), gfp); | ||
218 | if (!rm) | ||
219 | goto out; | ||
220 | |||
221 | if (nents) | ||
222 | sg_init_table(rm->m_sg, nents); | ||
223 | atomic_set(&rm->m_refcount, 1); | ||
224 | INIT_LIST_HEAD(&rm->m_sock_item); | ||
225 | INIT_LIST_HEAD(&rm->m_conn_item); | ||
226 | spin_lock_init(&rm->m_rs_lock); | ||
227 | |||
228 | out: | ||
229 | return rm; | ||
230 | } | ||
231 | |||
232 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) | ||
233 | { | ||
234 | struct rds_message *rm; | ||
235 | unsigned int i; | ||
236 | |||
237 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | ||
238 | if (rm == NULL) | ||
239 | return ERR_PTR(-ENOMEM); | ||
240 | |||
241 | set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); | ||
242 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | ||
243 | rm->m_nents = ceil(total_len, PAGE_SIZE); | ||
244 | |||
245 | for (i = 0; i < rm->m_nents; ++i) { | ||
246 | sg_set_page(&rm->m_sg[i], | ||
247 | virt_to_page(page_addrs[i]), | ||
248 | PAGE_SIZE, 0); | ||
249 | } | ||
250 | |||
251 | return rm; | ||
252 | } | ||
253 | |||
254 | struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | ||
255 | size_t total_len) | ||
256 | { | ||
257 | unsigned long to_copy; | ||
258 | unsigned long iov_off; | ||
259 | unsigned long sg_off; | ||
260 | struct rds_message *rm; | ||
261 | struct iovec *iov; | ||
262 | struct scatterlist *sg; | ||
263 | int ret; | ||
264 | |||
265 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | ||
266 | if (rm == NULL) { | ||
267 | ret = -ENOMEM; | ||
268 | goto out; | ||
269 | } | ||
270 | |||
271 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | ||
272 | |||
273 | /* | ||
274 | * now allocate and copy in the data payload. | ||
275 | */ | ||
276 | sg = rm->m_sg; | ||
277 | iov = first_iov; | ||
278 | iov_off = 0; | ||
279 | sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ | ||
280 | |||
281 | while (total_len) { | ||
282 | if (sg_page(sg) == NULL) { | ||
283 | ret = rds_page_remainder_alloc(sg, total_len, | ||
284 | GFP_HIGHUSER); | ||
285 | if (ret) | ||
286 | goto out; | ||
287 | rm->m_nents++; | ||
288 | sg_off = 0; | ||
289 | } | ||
290 | |||
291 | while (iov_off == iov->iov_len) { | ||
292 | iov_off = 0; | ||
293 | iov++; | ||
294 | } | ||
295 | |||
296 | to_copy = min(iov->iov_len - iov_off, sg->length - sg_off); | ||
297 | to_copy = min_t(size_t, to_copy, total_len); | ||
298 | |||
299 | rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to " | ||
300 | "sg [%p, %u, %u] + %lu\n", | ||
301 | to_copy, iov->iov_base, iov->iov_len, iov_off, | ||
302 | (void *)sg_page(sg), sg->offset, sg->length, sg_off); | ||
303 | |||
304 | ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off, | ||
305 | iov->iov_base + iov_off, | ||
306 | to_copy); | ||
307 | if (ret) | ||
308 | goto out; | ||
309 | |||
310 | iov_off += to_copy; | ||
311 | total_len -= to_copy; | ||
312 | sg_off += to_copy; | ||
313 | |||
314 | if (sg_off == sg->length) | ||
315 | sg++; | ||
316 | } | ||
317 | |||
318 | ret = 0; | ||
319 | out: | ||
320 | if (ret) { | ||
321 | if (rm) | ||
322 | rds_message_put(rm); | ||
323 | rm = ERR_PTR(ret); | ||
324 | } | ||
325 | return rm; | ||
326 | } | ||
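rds_message_copy_from_user() is a two-cursor copy: each step moves min(space left in the current iovec, space left in the current scatterlist fragment) and advances whichever side ran out. A simplified user-space sketch of that pattern for a single fragment (all names here are made up for illustration):

#include <string.h>
#include <stdio.h>

struct demo_iov { const char *base; size_t len; };

static void demo_gather(struct demo_iov *iov, size_t niov,
                        char *frag, size_t frag_len)
{
        size_t iov_off = 0, frag_off = 0;

        while (niov) {
                size_t to_copy;

                if (iov_off == iov->len) {      /* current iovec exhausted */
                        iov++;
                        niov--;
                        iov_off = 0;
                        continue;
                }
                to_copy = iov->len - iov_off;
                if (to_copy > frag_len - frag_off)
                        to_copy = frag_len - frag_off;

                memcpy(frag + frag_off, iov->base + iov_off, to_copy);
                iov_off += to_copy;
                frag_off += to_copy;
                if (frag_off == frag_len)       /* fragment is full */
                        break;
        }
}

int main(void)
{
        struct demo_iov iov[2] = { { "hello ", 6 }, { "world", 5 } };
        char frag[8] = { 0 };

        demo_gather(iov, 2, frag, sizeof(frag) - 1);
        printf("frag: \"%s\"\n", frag);         /* prints "hello w" */
        return 0;
}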
327 | |||
328 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, | ||
329 | struct iovec *first_iov, size_t size) | ||
330 | { | ||
331 | struct rds_message *rm; | ||
332 | struct iovec *iov; | ||
333 | struct scatterlist *sg; | ||
334 | unsigned long to_copy; | ||
335 | unsigned long iov_off; | ||
336 | unsigned long vec_off; | ||
337 | int copied; | ||
338 | int ret; | ||
339 | u32 len; | ||
340 | |||
341 | rm = container_of(inc, struct rds_message, m_inc); | ||
342 | len = be32_to_cpu(rm->m_inc.i_hdr.h_len); | ||
343 | |||
344 | iov = first_iov; | ||
345 | iov_off = 0; | ||
346 | sg = rm->m_sg; | ||
347 | vec_off = 0; | ||
348 | copied = 0; | ||
349 | |||
350 | while (copied < size && copied < len) { | ||
351 | while (iov_off == iov->iov_len) { | ||
352 | iov_off = 0; | ||
353 | iov++; | ||
354 | } | ||
355 | |||
356 | to_copy = min(iov->iov_len - iov_off, sg->length - vec_off); | ||
357 | to_copy = min_t(size_t, to_copy, size - copied); | ||
358 | to_copy = min_t(unsigned long, to_copy, len - copied); | ||
359 | |||
360 | rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to " | ||
361 | "sg [%p, %u, %u] + %lu\n", | ||
362 | to_copy, iov->iov_base, iov->iov_len, iov_off, | ||
363 | sg_page(sg), sg->offset, sg->length, vec_off); | ||
364 | |||
365 | ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off, | ||
366 | iov->iov_base + iov_off, | ||
367 | to_copy); | ||
368 | if (ret) { | ||
369 | copied = ret; | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | iov_off += to_copy; | ||
374 | vec_off += to_copy; | ||
375 | copied += to_copy; | ||
376 | |||
377 | if (vec_off == sg->length) { | ||
378 | vec_off = 0; | ||
379 | sg++; | ||
380 | } | ||
381 | } | ||
382 | |||
383 | return copied; | ||
384 | } | ||
385 | |||
386 | /* | ||
387 | * If the message is still on the send queue, wait until the transport | ||
388 | * is done with it. This is particularly important for RDMA operations. | ||
389 | */ | ||
390 | void rds_message_wait(struct rds_message *rm) | ||
391 | { | ||
392 | wait_event(rds_message_flush_waitq, | ||
393 | !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); | ||
394 | } | ||
395 | |||
396 | void rds_message_unmapped(struct rds_message *rm) | ||
397 | { | ||
398 | clear_bit(RDS_MSG_MAPPED, &rm->m_flags); | ||
399 | if (waitqueue_active(&rds_message_flush_waitq)) | ||
400 | wake_up(&rds_message_flush_waitq); | ||
401 | } | ||
402 | |||
diff --git a/net/rds/page.c b/net/rds/page.c new file mode 100644 index 000000000000..c460743a89ad --- /dev/null +++ b/net/rds/page.c | |||
@@ -0,0 +1,221 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/highmem.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | |||
37 | struct rds_page_remainder { | ||
38 | struct page *r_page; | ||
39 | unsigned long r_offset; | ||
40 | }; | ||
41 | |||
42 | DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; | ||
43 | |||
44 | /* | ||
45 | * returns 0 on success or -errno on failure. | ||
46 | * | ||
47 | * We don't have to worry about flush_dcache_page() as this only works | ||
48 | * with private pages. If, say, we were to do directed receive to pinned | ||
49 | * user pages we'd have to worry more about cache coherence. (Though | ||
50 | * the flush_dcache_page() in get_user_pages() would probably be enough). | ||
51 | */ | ||
52 | int rds_page_copy_user(struct page *page, unsigned long offset, | ||
53 | void __user *ptr, unsigned long bytes, | ||
54 | int to_user) | ||
55 | { | ||
56 | unsigned long ret; | ||
57 | void *addr; | ||
58 | |||
59 | if (to_user) | ||
60 | rds_stats_add(s_copy_to_user, bytes); | ||
61 | else | ||
62 | rds_stats_add(s_copy_from_user, bytes); | ||
63 | |||
64 | addr = kmap_atomic(page, KM_USER0); | ||
65 | if (to_user) | ||
66 | ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); | ||
67 | else | ||
68 | ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); | ||
69 | kunmap_atomic(addr, KM_USER0); | ||
70 | |||
71 | if (ret) { | ||
72 | addr = kmap(page); | ||
73 | if (to_user) | ||
74 | ret = copy_to_user(ptr, addr + offset, bytes); | ||
75 | else | ||
76 | ret = copy_from_user(addr + offset, ptr, bytes); | ||
77 | kunmap(page); | ||
78 | if (ret) | ||
79 | return -EFAULT; | ||
80 | } | ||
81 | |||
82 | return 0; | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * Message allocation uses this to build up regions of a message. | ||
87 | * | ||
88 | * @bytes - the number of bytes needed. | ||
89 | * @gfp - the waiting behaviour of the allocation | ||
90 | * | ||
91 | * @gfp is always ORed with __GFP_HIGHMEM. Callers must be prepared to | ||
92 | * kmap the pages, etc. | ||
93 | * | ||
94 | * If @bytes is at least a full page then this just returns a page from | ||
95 | * alloc_page(). | ||
96 | * | ||
97 | * If @bytes is a partial page then this stores the unused region of the | ||
98 | * page in a per-cpu structure. Future partial-page allocations may be | ||
99 | * satisfied from that cached region. This lets us waste less memory on | ||
100 | * small allocations with minimal complexity. It works because the transmit | ||
101 | * path passes read-only page regions down to devices. They hold a page | ||
102 | * reference until they are done with the region. | ||
103 | */ | ||
104 | int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, | ||
105 | gfp_t gfp) | ||
106 | { | ||
107 | struct rds_page_remainder *rem; | ||
108 | unsigned long flags; | ||
109 | struct page *page; | ||
110 | int ret; | ||
111 | |||
112 | gfp |= __GFP_HIGHMEM; | ||
113 | |||
114 | /* jump straight to allocation if we're trying for a huge page */ | ||
115 | if (bytes >= PAGE_SIZE) { | ||
116 | page = alloc_page(gfp); | ||
117 | if (page == NULL) { | ||
118 | ret = -ENOMEM; | ||
119 | } else { | ||
120 | sg_set_page(scat, page, PAGE_SIZE, 0); | ||
121 | ret = 0; | ||
122 | } | ||
123 | goto out; | ||
124 | } | ||
125 | |||
126 | rem = &per_cpu(rds_page_remainders, get_cpu()); | ||
127 | local_irq_save(flags); | ||
128 | |||
129 | while (1) { | ||
130 | /* avoid a tiny region getting stuck by tossing it */ | ||
131 | if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { | ||
132 | rds_stats_inc(s_page_remainder_miss); | ||
133 | __free_page(rem->r_page); | ||
134 | rem->r_page = NULL; | ||
135 | } | ||
136 | |||
137 | /* hand out a fragment from the cached page */ | ||
138 | if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { | ||
139 | sg_set_page(scat, rem->r_page, bytes, rem->r_offset); | ||
140 | get_page(sg_page(scat)); | ||
141 | |||
142 | if (rem->r_offset != 0) | ||
143 | rds_stats_inc(s_page_remainder_hit); | ||
144 | |||
145 | rem->r_offset += bytes; | ||
146 | if (rem->r_offset == PAGE_SIZE) { | ||
147 | __free_page(rem->r_page); | ||
148 | rem->r_page = NULL; | ||
149 | } | ||
150 | ret = 0; | ||
151 | break; | ||
152 | } | ||
153 | |||
154 | /* alloc if there is nothing for us to use */ | ||
155 | local_irq_restore(flags); | ||
156 | put_cpu(); | ||
157 | |||
158 | page = alloc_page(gfp); | ||
159 | |||
160 | rem = &per_cpu(rds_page_remainders, get_cpu()); | ||
161 | local_irq_save(flags); | ||
162 | |||
163 | if (page == NULL) { | ||
164 | ret = -ENOMEM; | ||
165 | break; | ||
166 | } | ||
167 | |||
168 | /* did someone race to fill the remainder before us? */ | ||
169 | if (rem->r_page) { | ||
170 | __free_page(page); | ||
171 | continue; | ||
172 | } | ||
173 | |||
174 | /* otherwise install our page and loop around to alloc */ | ||
175 | rem->r_page = page; | ||
176 | rem->r_offset = 0; | ||
177 | } | ||
178 | |||
179 | local_irq_restore(flags); | ||
180 | put_cpu(); | ||
181 | out: | ||
182 | rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, | ||
183 | ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, | ||
184 | ret ? 0 : scat->length); | ||
185 | return ret; | ||
186 | } | ||
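A sketch of what the per-cpu remainder buys: two small requests in a row are expected to land in the same page at adjacent offsets, assuming nothing else races for the remainder in between. This is a hypothetical kernel-context snippet written against the functions in this file, not part of the patch; the put_page() calls drop the references the allocator took for each fragment.

static void demo_remainder_sharing(void)
{
        struct scatterlist a, b;

        sg_init_table(&a, 1);
        sg_init_table(&b, 1);

        if (rds_page_remainder_alloc(&a, 256, GFP_KERNEL))
                return;
        if (rds_page_remainder_alloc(&b, 256, GFP_KERNEL)) {
                put_page(sg_page(&a));
                return;
        }

        /* typically: same page, and b starts where a ended */
        rdsdebug("a %p+%u b %p+%u\n",
                 sg_page(&a), a.offset, sg_page(&b), b.offset);

        put_page(sg_page(&a));
        put_page(sg_page(&b));
}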
187 | |||
188 | static int rds_page_remainder_cpu_notify(struct notifier_block *self, | ||
189 | unsigned long action, void *hcpu) | ||
190 | { | ||
191 | struct rds_page_remainder *rem; | ||
192 | long cpu = (long)hcpu; | ||
193 | |||
194 | rem = &per_cpu(rds_page_remainders, cpu); | ||
195 | |||
196 | rdsdebug("cpu %ld action 0x%lx\n", cpu, action); | ||
197 | |||
198 | switch (action) { | ||
199 | case CPU_DEAD: | ||
200 | if (rem->r_page) | ||
201 | __free_page(rem->r_page); | ||
202 | rem->r_page = NULL; | ||
203 | break; | ||
204 | } | ||
205 | |||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | static struct notifier_block rds_page_remainder_nb = { | ||
210 | .notifier_call = rds_page_remainder_cpu_notify, | ||
211 | }; | ||
212 | |||
213 | void rds_page_exit(void) | ||
214 | { | ||
215 | int i; | ||
216 | |||
217 | for_each_possible_cpu(i) | ||
218 | rds_page_remainder_cpu_notify(&rds_page_remainder_nb, | ||
219 | (unsigned long)CPU_DEAD, | ||
220 | (void *)(long)i); | ||
221 | } | ||
diff --git a/net/rds/rdma.c b/net/rds/rdma.c new file mode 100644 index 000000000000..eaeeb91e1119 --- /dev/null +++ b/net/rds/rdma.c | |||
@@ -0,0 +1,679 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2007 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/pagemap.h> | ||
34 | #include <linux/rbtree.h> | ||
35 | #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ | ||
36 | |||
37 | #include "rdma.h" | ||
38 | |||
39 | /* | ||
40 | * XXX | ||
41 | * - build with sparse | ||
42 | * - should we limit the size of a mr region? let transport return failure? | ||
43 | * - should we detect duplicate keys on a socket? hmm. | ||
44 | * - an rdma is an mlock, apply rlimit? | ||
45 | */ | ||
46 | |||
47 | /* | ||
48 | * get the number of pages by looking at the page indices that the start and | ||
49 | * end addresses fall in. | ||
50 | * | ||
51 | * Returns 0 if the vec is invalid. It is invalid if the number of bytes | ||
52 | * causes the address to wrap or overflows an unsigned int. The latter limit | ||
53 | * comes from the count being stored in the 'length' member of 'struct scatterlist'. | ||
54 | */ | ||
55 | static unsigned int rds_pages_in_vec(struct rds_iovec *vec) | ||
56 | { | ||
57 | if ((vec->addr + vec->bytes <= vec->addr) || | ||
58 | (vec->bytes > (u64)UINT_MAX)) | ||
59 | return 0; | ||
60 | |||
61 | return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - | ||
62 | (vec->addr >> PAGE_SHIFT); | ||
63 | } | ||
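A worked example of the page-count arithmetic above, assuming 4 KB pages (PAGE_SHIFT == 12):

/*
 * addr = 0x1ff8, bytes = 16   (straddles a page boundary)
 *   ((0x1ff8 + 16 + 4095) >> 12) - (0x1ff8 >> 12) = 3 - 1 = 2 pages
 *
 * addr = 0x2000, bytes = 16   (fits in one page)
 *   ((0x2000 + 16 + 4095) >> 12) - (0x2000 >> 12) = 3 - 2 = 1 page
 */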
64 | |||
65 | static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, | ||
66 | struct rds_mr *insert) | ||
67 | { | ||
68 | struct rb_node **p = &root->rb_node; | ||
69 | struct rb_node *parent = NULL; | ||
70 | struct rds_mr *mr; | ||
71 | |||
72 | while (*p) { | ||
73 | parent = *p; | ||
74 | mr = rb_entry(parent, struct rds_mr, r_rb_node); | ||
75 | |||
76 | if (key < mr->r_key) | ||
77 | p = &(*p)->rb_left; | ||
78 | else if (key > mr->r_key) | ||
79 | p = &(*p)->rb_right; | ||
80 | else | ||
81 | return mr; | ||
82 | } | ||
83 | |||
84 | if (insert) { | ||
85 | rb_link_node(&insert->r_rb_node, parent, p); | ||
86 | rb_insert_color(&insert->r_rb_node, root); | ||
87 | atomic_inc(&insert->r_refcount); | ||
88 | } | ||
89 | return NULL; | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * Destroy the transport-specific part of a MR. | ||
94 | */ | ||
95 | static void rds_destroy_mr(struct rds_mr *mr) | ||
96 | { | ||
97 | struct rds_sock *rs = mr->r_sock; | ||
98 | void *trans_private = NULL; | ||
99 | unsigned long flags; | ||
100 | |||
101 | rdsdebug("RDS: destroy mr key is %x refcnt %u\n", | ||
102 | mr->r_key, atomic_read(&mr->r_refcount)); | ||
103 | |||
104 | if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) | ||
105 | return; | ||
106 | |||
107 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
108 | if (!RB_EMPTY_NODE(&mr->r_rb_node)) | ||
109 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); | ||
110 | trans_private = mr->r_trans_private; | ||
111 | mr->r_trans_private = NULL; | ||
112 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
113 | |||
114 | if (trans_private) | ||
115 | mr->r_trans->free_mr(trans_private, mr->r_invalidate); | ||
116 | } | ||
117 | |||
118 | void __rds_put_mr_final(struct rds_mr *mr) | ||
119 | { | ||
120 | rds_destroy_mr(mr); | ||
121 | kfree(mr); | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * By the time this is called we can't have any more ioctls called on | ||
126 | * the socket so we don't need to worry about racing with others. | ||
127 | */ | ||
128 | void rds_rdma_drop_keys(struct rds_sock *rs) | ||
129 | { | ||
130 | struct rds_mr *mr; | ||
131 | struct rb_node *node; | ||
132 | |||
133 | /* Release any MRs associated with this socket */ | ||
134 | while ((node = rb_first(&rs->rs_rdma_keys))) { | ||
135 | mr = container_of(node, struct rds_mr, r_rb_node); | ||
136 | if (mr->r_trans == rs->rs_transport) | ||
137 | mr->r_invalidate = 0; | ||
138 | rds_mr_put(mr); | ||
139 | } | ||
140 | |||
141 | if (rs->rs_transport && rs->rs_transport->flush_mrs) | ||
142 | rs->rs_transport->flush_mrs(); | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Helper function to pin user pages. | ||
147 | */ | ||
148 | static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, | ||
149 | struct page **pages, int write) | ||
150 | { | ||
151 | int ret; | ||
152 | |||
153 | down_read(¤t->mm->mmap_sem); | ||
154 | ret = get_user_pages(current, current->mm, user_addr, | ||
155 | nr_pages, write, 0, pages, NULL); | ||
156 | up_read(¤t->mm->mmap_sem); | ||
157 | |||
158 | if (0 <= ret && (unsigned) ret < nr_pages) { | ||
159 | while (ret--) | ||
160 | put_page(pages[ret]); | ||
161 | ret = -EFAULT; | ||
162 | } | ||
163 | |||
164 | return ret; | ||
165 | } | ||
166 | |||
167 | static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | ||
168 | u64 *cookie_ret, struct rds_mr **mr_ret) | ||
169 | { | ||
170 | struct rds_mr *mr = NULL, *found; | ||
171 | unsigned int nr_pages; | ||
172 | struct page **pages = NULL; | ||
173 | struct scatterlist *sg; | ||
174 | void *trans_private; | ||
175 | unsigned long flags; | ||
176 | rds_rdma_cookie_t cookie; | ||
177 | unsigned int nents; | ||
178 | long i; | ||
179 | int ret; | ||
180 | |||
181 | if (rs->rs_bound_addr == 0) { | ||
182 | ret = -ENOTCONN; /* XXX not a great errno */ | ||
183 | goto out; | ||
184 | } | ||
185 | |||
186 | if (rs->rs_transport->get_mr == NULL) { | ||
187 | ret = -EOPNOTSUPP; | ||
188 | goto out; | ||
189 | } | ||
190 | |||
191 | nr_pages = rds_pages_in_vec(&args->vec); | ||
192 | if (nr_pages == 0) { | ||
193 | ret = -EINVAL; | ||
194 | goto out; | ||
195 | } | ||
196 | |||
197 | rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", | ||
198 | args->vec.addr, args->vec.bytes, nr_pages); | ||
199 | |||
200 | /* XXX clamp nr_pages to limit the size of this alloc? */ | ||
201 | pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); | ||
202 | if (pages == NULL) { | ||
203 | ret = -ENOMEM; | ||
204 | goto out; | ||
205 | } | ||
206 | |||
207 | mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); | ||
208 | if (mr == NULL) { | ||
209 | ret = -ENOMEM; | ||
210 | goto out; | ||
211 | } | ||
212 | |||
213 | atomic_set(&mr->r_refcount, 1); | ||
214 | RB_CLEAR_NODE(&mr->r_rb_node); | ||
215 | mr->r_trans = rs->rs_transport; | ||
216 | mr->r_sock = rs; | ||
217 | |||
218 | if (args->flags & RDS_RDMA_USE_ONCE) | ||
219 | mr->r_use_once = 1; | ||
220 | if (args->flags & RDS_RDMA_INVALIDATE) | ||
221 | mr->r_invalidate = 1; | ||
222 | if (args->flags & RDS_RDMA_READWRITE) | ||
223 | mr->r_write = 1; | ||
224 | |||
225 | /* | ||
226 | * Pin the pages that make up the user buffer and transfer the page | ||
227 | * pointers to the mr's sg array. We check to see if we've mapped | ||
228 | * the whole region after transferring the partial page references | ||
229 | * to the sg array so that we can have one page ref cleanup path. | ||
230 | * | ||
231 | * For now we have no flag that tells us whether the mapping is | ||
232 | * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to | ||
233 | * the zero page. | ||
234 | */ | ||
235 | ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); | ||
236 | if (ret < 0) | ||
237 | goto out; | ||
238 | |||
239 | nents = ret; | ||
240 | sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); | ||
241 | if (sg == NULL) { | ||
242 | ret = -ENOMEM; | ||
243 | goto out; | ||
244 | } | ||
245 | WARN_ON(!nents); | ||
246 | sg_init_table(sg, nents); | ||
247 | |||
248 | /* Stick all pages into the scatterlist */ | ||
249 | for (i = 0 ; i < nents; i++) | ||
250 | sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); | ||
251 | |||
252 | rdsdebug("RDS: trans_private nents is %u\n", nents); | ||
253 | |||
254 | /* Obtain a transport specific MR. If this succeeds, the | ||
255 | * s/g list is now owned by the MR. | ||
256 | * Note that dma_map() implies that pending writes are | ||
257 | * flushed to RAM, so no dma_sync is needed here. */ | ||
258 | trans_private = rs->rs_transport->get_mr(sg, nents, rs, | ||
259 | &mr->r_key); | ||
260 | |||
261 | if (IS_ERR(trans_private)) { | ||
262 | for (i = 0 ; i < nents; i++) | ||
263 | put_page(sg_page(&sg[i])); | ||
264 | kfree(sg); | ||
265 | ret = PTR_ERR(trans_private); | ||
266 | goto out; | ||
267 | } | ||
268 | |||
269 | mr->r_trans_private = trans_private; | ||
270 | |||
271 | rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", | ||
272 | mr->r_key, (void *)(unsigned long) args->cookie_addr); | ||
273 | |||
274 | /* The user may pass us an unaligned address, but we can only | ||
275 | * map page aligned regions. So we keep the offset, and build | ||
276 | * a 64bit cookie containing <R_Key, offset> and pass that | ||
277 | * around. */ | ||
278 | cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); | ||
279 | if (cookie_ret) | ||
280 | *cookie_ret = cookie; | ||
281 | |||
282 | if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { | ||
283 | ret = -EFAULT; | ||
284 | goto out; | ||
285 | } | ||
286 | |||
287 | /* Inserting the new MR into the rbtree bumps its | ||
288 | * reference count. */ | ||
289 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
290 | found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); | ||
291 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
292 | |||
293 | BUG_ON(found && found != mr); | ||
294 | |||
295 | rdsdebug("RDS: get_mr key is %x\n", mr->r_key); | ||
296 | if (mr_ret) { | ||
297 | atomic_inc(&mr->r_refcount); | ||
298 | *mr_ret = mr; | ||
299 | } | ||
300 | |||
301 | ret = 0; | ||
302 | out: | ||
303 | kfree(pages); | ||
304 | if (mr) | ||
305 | rds_mr_put(mr); | ||
306 | return ret; | ||
307 | } | ||
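The cookie handed back to user space above packs <R_Key, offset> into a single 64-bit value. The authoritative packing is whatever rds_rdma_make_cookie()/rds_rdma_cookie_key() in rdma.h define; the helpers below are a hypothetical stand-in, only to illustrate the idea of carrying both values in one u64 (here assumed: key in the low 32 bits, page offset in the high 32 bits).

typedef unsigned long long demo_cookie_t;       /* stand-in for rds_rdma_cookie_t */

static inline demo_cookie_t demo_make_cookie(unsigned int r_key,
                                             unsigned int offset)
{
        return (demo_cookie_t)offset << 32 | r_key;
}

static inline unsigned int demo_cookie_key(demo_cookie_t cookie)
{
        return (unsigned int)cookie;
}

static inline unsigned int demo_cookie_offset(demo_cookie_t cookie)
{
        return (unsigned int)(cookie >> 32);
}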
308 | |||
309 | int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) | ||
310 | { | ||
311 | struct rds_get_mr_args args; | ||
312 | |||
313 | if (optlen != sizeof(struct rds_get_mr_args)) | ||
314 | return -EINVAL; | ||
315 | |||
316 | if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, | ||
317 | sizeof(struct rds_get_mr_args))) | ||
318 | return -EFAULT; | ||
319 | |||
320 | return __rds_rdma_map(rs, &args, NULL, NULL); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Free the MR indicated by the given R_Key | ||
325 | */ | ||
326 | int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) | ||
327 | { | ||
328 | struct rds_free_mr_args args; | ||
329 | struct rds_mr *mr; | ||
330 | unsigned long flags; | ||
331 | |||
332 | if (optlen != sizeof(struct rds_free_mr_args)) | ||
333 | return -EINVAL; | ||
334 | |||
335 | if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval, | ||
336 | sizeof(struct rds_free_mr_args))) | ||
337 | return -EFAULT; | ||
338 | |||
339 | /* Special case - a null cookie means flush all unused MRs */ | ||
340 | if (args.cookie == 0) { | ||
341 | if (!rs->rs_transport || !rs->rs_transport->flush_mrs) | ||
342 | return -EINVAL; | ||
343 | rs->rs_transport->flush_mrs(); | ||
344 | return 0; | ||
345 | } | ||
346 | |||
347 | /* Look up the MR given its R_key and remove it from the rbtree | ||
348 | * so nobody else finds it. | ||
349 | * This should also prevent races with rds_rdma_unuse. | ||
350 | */ | ||
351 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
352 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); | ||
353 | if (mr) { | ||
354 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); | ||
355 | RB_CLEAR_NODE(&mr->r_rb_node); | ||
356 | if (args.flags & RDS_RDMA_INVALIDATE) | ||
357 | mr->r_invalidate = 1; | ||
358 | } | ||
359 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
360 | |||
361 | if (!mr) | ||
362 | return -EINVAL; | ||
363 | |||
364 | /* | ||
365 | * call rds_destroy_mr() ourselves so that we're sure it's done by the time | ||
366 | * we return. If we let rds_mr_put() do it, it might not happen until | ||
367 | * someone else drops their ref. | ||
368 | */ | ||
369 | rds_destroy_mr(mr); | ||
370 | rds_mr_put(mr); | ||
371 | return 0; | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * This is called when we receive an extension header that | ||
376 | * tells us this MR was used. It allows us to implement | ||
377 | * use_once semantics | ||
378 | */ | ||
379 | void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) | ||
380 | { | ||
381 | struct rds_mr *mr; | ||
382 | unsigned long flags; | ||
383 | int zot_me = 0; | ||
384 | |||
385 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
386 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); | ||
387 | if (mr && (mr->r_use_once || force)) { | ||
388 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); | ||
389 | RB_CLEAR_NODE(&mr->r_rb_node); | ||
390 | zot_me = 1; | ||
391 | } else if (mr) | ||
392 | atomic_inc(&mr->r_refcount); | ||
393 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
394 | |||
395 | /* May have to issue a dma_sync on this memory region. | ||
396 | * Note we could avoid this if the operation was an RDMA READ, | ||
397 | * but at this point we can't tell. */ | ||
398 | if (mr != NULL) { | ||
399 | if (mr->r_trans->sync_mr) | ||
400 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); | ||
401 | |||
402 | /* If the MR was marked as invalidate, this will | ||
403 | * trigger an async flush. */ | ||
404 | if (zot_me) | ||
405 | rds_destroy_mr(mr); | ||
406 | rds_mr_put(mr); | ||
407 | } | ||
408 | } | ||
409 | |||
410 | void rds_rdma_free_op(struct rds_rdma_op *ro) | ||
411 | { | ||
412 | unsigned int i; | ||
413 | |||
414 | for (i = 0; i < ro->r_nents; i++) { | ||
415 | struct page *page = sg_page(&ro->r_sg[i]); | ||
416 | |||
417 | /* Mark page dirty if it was possibly modified, which | ||
418 | * is the case for an RDMA_READ, which copies from remote | ||
419 | * to local memory */ | ||
420 | if (!ro->r_write) | ||
421 | set_page_dirty(page); | ||
422 | put_page(page); | ||
423 | } | ||
424 | |||
425 | kfree(ro->r_notifier); | ||
426 | kfree(ro); | ||
427 | } | ||
428 | |||
429 | /* | ||
430 | * args is a pointer to an in-kernel copy in the sendmsg cmsg. | ||
431 | */ | ||
432 | static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | ||
433 | struct rds_rdma_args *args) | ||
434 | { | ||
435 | struct rds_iovec vec; | ||
436 | struct rds_rdma_op *op = NULL; | ||
437 | unsigned int nr_pages; | ||
438 | unsigned int max_pages; | ||
439 | unsigned int nr_bytes; | ||
440 | struct page **pages = NULL; | ||
441 | struct rds_iovec __user *local_vec; | ||
442 | struct scatterlist *sg; | ||
443 | unsigned int nr; | ||
444 | unsigned int i, j; | ||
445 | int ret; | ||
446 | |||
447 | |||
448 | if (rs->rs_bound_addr == 0) { | ||
449 | ret = -ENOTCONN; /* XXX not a great errno */ | ||
450 | goto out; | ||
451 | } | ||
452 | |||
453 | if (args->nr_local > (u64)UINT_MAX) { | ||
454 | ret = -EMSGSIZE; | ||
455 | goto out; | ||
456 | } | ||
457 | |||
458 | nr_pages = 0; | ||
459 | max_pages = 0; | ||
460 | |||
461 | local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; | ||
462 | |||
463 | /* figure out the number of pages in the vector */ | ||
464 | for (i = 0; i < args->nr_local; i++) { | ||
465 | if (copy_from_user(&vec, &local_vec[i], | ||
466 | sizeof(struct rds_iovec))) { | ||
467 | ret = -EFAULT; | ||
468 | goto out; | ||
469 | } | ||
470 | |||
471 | nr = rds_pages_in_vec(&vec); | ||
472 | if (nr == 0) { | ||
473 | ret = -EINVAL; | ||
474 | goto out; | ||
475 | } | ||
476 | |||
477 | max_pages = max(nr, max_pages); | ||
478 | nr_pages += nr; | ||
479 | } | ||
480 | |||
481 | pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); | ||
482 | if (pages == NULL) { | ||
483 | ret = -ENOMEM; | ||
484 | goto out; | ||
485 | } | ||
486 | |||
487 | op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); | ||
488 | if (op == NULL) { | ||
489 | ret = -ENOMEM; | ||
490 | goto out; | ||
491 | } | ||
492 | |||
493 | op->r_write = !!(args->flags & RDS_RDMA_READWRITE); | ||
494 | op->r_fence = !!(args->flags & RDS_RDMA_FENCE); | ||
495 | op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); | ||
496 | op->r_recverr = rs->rs_recverr; | ||
497 | WARN_ON(!nr_pages); | ||
498 | sg_init_table(op->r_sg, nr_pages); | ||
499 | |||
500 | if (op->r_notify || op->r_recverr) { | ||
501 | /* We allocate an uninitialized notifier here, because | ||
502 | * we don't want to do that in the completion handler. We | ||
503 | * would have to use GFP_ATOMIC there, and don't want to deal | ||
504 | * with failed allocations. | ||
505 | */ | ||
506 | op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); | ||
507 | if (!op->r_notifier) { | ||
508 | ret = -ENOMEM; | ||
509 | goto out; | ||
510 | } | ||
511 | op->r_notifier->n_user_token = args->user_token; | ||
512 | op->r_notifier->n_status = RDS_RDMA_SUCCESS; | ||
513 | } | ||
514 | |||
515 | /* The cookie contains the R_Key of the remote memory region, and | ||
516 | * optionally an offset into it. This is how we implement RDMA into | ||
517 | * unaligned memory. | ||
518 | * When setting up the RDMA, we need to add that offset to the | ||
519 | * destination address (which is really an offset into the MR) | ||
520 | * FIXME: We may want to move this into ib_rdma.c | ||
521 | */ | ||
522 | op->r_key = rds_rdma_cookie_key(args->cookie); | ||
523 | op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); | ||
524 | |||
525 | nr_bytes = 0; | ||
526 | |||
527 | rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", | ||
528 | (unsigned long long)args->nr_local, | ||
529 | (unsigned long long)args->remote_vec.addr, | ||
530 | op->r_key); | ||
531 | |||
532 | for (i = 0; i < args->nr_local; i++) { | ||
533 | if (copy_from_user(&vec, &local_vec[i], | ||
534 | sizeof(struct rds_iovec))) { | ||
535 | ret = -EFAULT; | ||
536 | goto out; | ||
537 | } | ||
538 | |||
539 | nr = rds_pages_in_vec(&vec); | ||
540 | if (nr == 0) { | ||
541 | ret = -EINVAL; | ||
542 | goto out; | ||
543 | } | ||
544 | |||
545 | rs->rs_user_addr = vec.addr; | ||
546 | rs->rs_user_bytes = vec.bytes; | ||
547 | |||
548 | /* did the user change the vec under us? */ | ||
549 | if (nr > max_pages || op->r_nents + nr > nr_pages) { | ||
550 | ret = -EINVAL; | ||
551 | goto out; | ||
552 | } | ||
553 | /* If it's a WRITE operation, we want to pin the pages for reading. | ||
554 | * If it's a READ operation, we need to pin the pages for writing. | ||
555 | */ | ||
556 | ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); | ||
557 | if (ret < 0) | ||
558 | goto out; | ||
559 | |||
560 | rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", | ||
561 | nr_bytes, nr, vec.bytes, vec.addr); | ||
562 | |||
563 | nr_bytes += vec.bytes; | ||
564 | |||
565 | for (j = 0; j < nr; j++) { | ||
566 | unsigned int offset = vec.addr & ~PAGE_MASK; | ||
567 | |||
568 | sg = &op->r_sg[op->r_nents + j]; | ||
569 | sg_set_page(sg, pages[j], | ||
570 | min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), | ||
571 | offset); | ||
572 | |||
573 | rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", | ||
574 | sg->offset, sg->length, vec.addr, vec.bytes); | ||
575 | |||
576 | vec.addr += sg->length; | ||
577 | vec.bytes -= sg->length; | ||
578 | } | ||
579 | |||
580 | op->r_nents += nr; | ||
581 | } | ||
582 | |||
583 | |||
584 | if (nr_bytes > args->remote_vec.bytes) { | ||
585 | rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", | ||
586 | nr_bytes, | ||
587 | (unsigned int) args->remote_vec.bytes); | ||
588 | ret = -EINVAL; | ||
589 | goto out; | ||
590 | } | ||
591 | op->r_bytes = nr_bytes; | ||
592 | |||
593 | ret = 0; | ||
594 | out: | ||
595 | kfree(pages); | ||
596 | if (ret) { | ||
597 | if (op) | ||
598 | rds_rdma_free_op(op); | ||
599 | op = ERR_PTR(ret); | ||
600 | } | ||
601 | return op; | ||
602 | } | ||
603 | |||
604 | /* | ||
605 | * The application asks for an RDMA transfer. | ||
606 | * Extract all arguments and set up the rdma_op | ||
607 | */ | ||
608 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
609 | struct cmsghdr *cmsg) | ||
610 | { | ||
611 | struct rds_rdma_op *op; | ||
612 | |||
613 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) | ||
614 | || rm->m_rdma_op != NULL) | ||
615 | return -EINVAL; | ||
616 | |||
617 | op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); | ||
618 | if (IS_ERR(op)) | ||
619 | return PTR_ERR(op); | ||
620 | rds_stats_inc(s_send_rdma); | ||
621 | rm->m_rdma_op = op; | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * The application wants us to pass an RDMA destination (aka MR) | ||
627 | * to the remote | ||
628 | */ | ||
629 | int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | ||
630 | struct cmsghdr *cmsg) | ||
631 | { | ||
632 | unsigned long flags; | ||
633 | struct rds_mr *mr; | ||
634 | u32 r_key; | ||
635 | int err = 0; | ||
636 | |||
637 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) | ||
638 | || rm->m_rdma_cookie != 0) | ||
639 | return -EINVAL; | ||
640 | |||
641 | memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); | ||
642 | |||
643 | /* We are reusing a previously mapped MR here. Most likely, the | ||
644 | * application has written to the buffer, so we need to explicitly | ||
645 | * flush those writes to RAM. Otherwise the HCA may not see them | ||
646 | * when doing a DMA from that buffer. | ||
647 | */ | ||
648 | r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); | ||
649 | |||
650 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
651 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); | ||
652 | if (mr == NULL) | ||
653 | err = -EINVAL; /* invalid r_key */ | ||
654 | else | ||
655 | atomic_inc(&mr->r_refcount); | ||
656 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
657 | |||
658 | if (mr) { | ||
659 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); | ||
660 | rm->m_rdma_mr = mr; | ||
661 | } | ||
662 | return err; | ||
663 | } | ||
664 | |||
665 | /* | ||
666 | * The application passes us an address range it wants to enable RDMA | ||
667 | * to/from. We map the area, and save the <R_Key,offset> pair | ||
668 | * in rm->m_rdma_cookie. This causes it to be sent along to the peer | ||
669 | * in an extension header. | ||
670 | */ | ||
671 | int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | ||
672 | struct cmsghdr *cmsg) | ||
673 | { | ||
674 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) | ||
675 | || rm->m_rdma_cookie != 0) | ||
676 | return -EINVAL; | ||
677 | |||
678 | return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); | ||
679 | } | ||
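For orientation, here is a minimal userspace sketch (not part of the patch) of the page-span arithmetic that rds_rdma_prepare() above depends on when sizing nr_pages/max_pages and filling the scatterlist: a vector that is not page aligned keeps its sub-page offset, and the first scatterlist entry is capped at PAGE_SIZE - offset. The pages_in_vec() helper below assumes the conventional span formula (rds_pages_in_vec() itself is defined earlier in rdma.c and is not shown in this hunk), and a 4 KiB page size is assumed for the worked numbers.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* How many pages an (addr, bytes) range touches -- the count the
 * nr_pages/max_pages sizing in rds_rdma_prepare() relies on. */
static unsigned int pages_in_vec(unsigned long long addr, unsigned long long bytes)
{
	return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - (addr >> PAGE_SHIFT);
}

int main(void)
{
	unsigned long long addr = 0x1000ffcULL;	/* 4 bytes before a page boundary */
	unsigned long long bytes = 16;
	unsigned int offset = addr & ~PAGE_MASK;	/* sub-page offset, as in the sg loop */
	unsigned long long first_len = bytes < PAGE_SIZE - offset ? bytes : PAGE_SIZE - offset;

	/* prints: pages=2 offset=4092 first sg len=4 */
	printf("pages=%u offset=%u first sg len=%llu\n",
	       pages_in_vec(addr, bytes), offset, first_len);
	return 0;
}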
diff --git a/net/rds/rdma.h b/net/rds/rdma.h new file mode 100644 index 000000000000..425512098b0b --- /dev/null +++ b/net/rds/rdma.h | |||
@@ -0,0 +1,84 @@ | |||
1 | #ifndef _RDS_RDMA_H | ||
2 | #define _RDS_RDMA_H | ||
3 | |||
4 | #include <linux/rbtree.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/scatterlist.h> | ||
7 | |||
8 | #include "rds.h" | ||
9 | |||
10 | struct rds_mr { | ||
11 | struct rb_node r_rb_node; | ||
12 | atomic_t r_refcount; | ||
13 | u32 r_key; | ||
14 | |||
15 | /* A copy of the creation flags */ | ||
16 | unsigned int r_use_once:1; | ||
17 | unsigned int r_invalidate:1; | ||
18 | unsigned int r_write:1; | ||
19 | |||
20 | /* This is for RDS_MR_DEAD. | ||
21 | * It would be nice & consistent to make this part of the above | ||
22 | * bit field here, but we need to use test_and_set_bit. | ||
23 | */ | ||
24 | unsigned long r_state; | ||
25 | struct rds_sock *r_sock; /* back pointer to the socket that owns us */ | ||
26 | struct rds_transport *r_trans; | ||
27 | void *r_trans_private; | ||
28 | }; | ||
29 | |||
30 | /* Flags for mr->r_state */ | ||
31 | #define RDS_MR_DEAD 0 | ||
32 | |||
33 | struct rds_rdma_op { | ||
34 | u32 r_key; | ||
35 | u64 r_remote_addr; | ||
36 | unsigned int r_write:1; | ||
37 | unsigned int r_fence:1; | ||
38 | unsigned int r_notify:1; | ||
39 | unsigned int r_recverr:1; | ||
40 | unsigned int r_mapped:1; | ||
41 | struct rds_notifier *r_notifier; | ||
42 | unsigned int r_bytes; | ||
43 | unsigned int r_nents; | ||
44 | unsigned int r_count; | ||
45 | struct scatterlist r_sg[0]; | ||
46 | }; | ||
47 | |||
48 | static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) | ||
49 | { | ||
50 | return r_key | (((u64) offset) << 32); | ||
51 | } | ||
52 | |||
53 | static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) | ||
54 | { | ||
55 | return cookie; | ||
56 | } | ||
57 | |||
58 | static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) | ||
59 | { | ||
60 | return cookie >> 32; | ||
61 | } | ||
62 | |||
63 | int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
64 | int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
65 | void rds_rdma_drop_keys(struct rds_sock *rs); | ||
66 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
67 | struct cmsghdr *cmsg); | ||
68 | int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | ||
69 | struct cmsghdr *cmsg); | ||
70 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
71 | struct cmsghdr *cmsg); | ||
72 | int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | ||
73 | struct cmsghdr *cmsg); | ||
74 | void rds_rdma_free_op(struct rds_rdma_op *ro); | ||
75 | void rds_rdma_send_complete(struct rds_message *rm, int); | ||
76 | |||
77 | extern void __rds_put_mr_final(struct rds_mr *mr); | ||
78 | static inline void rds_mr_put(struct rds_mr *mr) | ||
79 | { | ||
80 | if (atomic_dec_and_test(&mr->r_refcount)) | ||
81 | __rds_put_mr_final(mr); | ||
82 | } | ||
83 | |||
84 | #endif | ||
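The three inlines above fix the cookie layout: the R_Key sits in the low 32 bits and the byte offset into the MR in the high 32 bits. A small standalone sketch (not part of the patch) of the same packing and unpacking:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t rds_rdma_cookie_t;

/* Mirrors rds_rdma_make_cookie(): key in bits 0-31, offset in bits 32-63. */
static rds_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
{
	return r_key | ((uint64_t)offset << 32);
}

int main(void)
{
	rds_rdma_cookie_t c = make_cookie(0xdeadbeef, 0xffc);

	/* prints: key=deadbeef offset=ffc */
	printf("key=%x offset=%x\n",
	       (uint32_t)c,		/* what rds_rdma_cookie_key() returns    */
	       (uint32_t)(c >> 32));	/* what rds_rdma_cookie_offset() returns */
	return 0;
}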
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c new file mode 100644 index 000000000000..7b19024f9706 --- /dev/null +++ b/net/rds/rdma_transport.c | |||
@@ -0,0 +1,214 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2009 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <rdma/rdma_cm.h> | ||
34 | |||
35 | #include "rdma_transport.h" | ||
36 | |||
37 | static struct rdma_cm_id *rds_iw_listen_id; | ||
38 | |||
39 | int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | ||
40 | struct rdma_cm_event *event) | ||
41 | { | ||
42 | /* this can be null in the listening path */ | ||
43 | struct rds_connection *conn = cm_id->context; | ||
44 | struct rds_transport *trans; | ||
45 | int ret = 0; | ||
46 | |||
47 | rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, | ||
48 | event->event); | ||
49 | |||
50 | if (cm_id->device->node_type == RDMA_NODE_RNIC) | ||
51 | trans = &rds_iw_transport; | ||
52 | else | ||
53 | trans = &rds_ib_transport; | ||
54 | |||
55 | /* Prevent shutdown from tearing down the connection | ||
56 | * while we're executing. */ | ||
57 | if (conn) { | ||
58 | mutex_lock(&conn->c_cm_lock); | ||
59 | |||
60 | /* If the connection is being shut down, bail out | ||
61 | * right away. We return 0 so cm_id doesn't get | ||
62 | * destroyed prematurely */ | ||
63 | if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) { | ||
64 | /* Reject incoming connections while we're tearing | ||
65 | * down an existing one. */ | ||
66 | if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) | ||
67 | ret = 1; | ||
68 | goto out; | ||
69 | } | ||
70 | } | ||
71 | |||
72 | switch (event->event) { | ||
73 | case RDMA_CM_EVENT_CONNECT_REQUEST: | ||
74 | ret = trans->cm_handle_connect(cm_id, event); | ||
75 | break; | ||
76 | |||
77 | case RDMA_CM_EVENT_ADDR_RESOLVED: | ||
78 | /* XXX do we need to clean up if this fails? */ | ||
79 | ret = rdma_resolve_route(cm_id, | ||
80 | RDS_RDMA_RESOLVE_TIMEOUT_MS); | ||
81 | break; | ||
82 | |||
83 | case RDMA_CM_EVENT_ROUTE_RESOLVED: | ||
84 | /* XXX worry about racing with listen acceptance */ | ||
85 | ret = trans->cm_initiate_connect(cm_id); | ||
86 | break; | ||
87 | |||
88 | case RDMA_CM_EVENT_ESTABLISHED: | ||
89 | trans->cm_connect_complete(conn, event); | ||
90 | break; | ||
91 | |||
92 | case RDMA_CM_EVENT_ADDR_ERROR: | ||
93 | case RDMA_CM_EVENT_ROUTE_ERROR: | ||
94 | case RDMA_CM_EVENT_CONNECT_ERROR: | ||
95 | case RDMA_CM_EVENT_UNREACHABLE: | ||
96 | case RDMA_CM_EVENT_REJECTED: | ||
97 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | ||
98 | case RDMA_CM_EVENT_ADDR_CHANGE: | ||
99 | if (conn) | ||
100 | rds_conn_drop(conn); | ||
101 | break; | ||
102 | |||
103 | case RDMA_CM_EVENT_DISCONNECTED: | ||
104 | printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection " | ||
105 | "%pI4->%pI4\n", &conn->c_laddr, | ||
106 | &conn->c_faddr); | ||
107 | rds_conn_drop(conn); | ||
108 | break; | ||
109 | |||
110 | default: | ||
111 | /* things like device disconnect? */ | ||
112 | printk(KERN_ERR "unknown event %u\n", event->event); | ||
113 | BUG(); | ||
114 | break; | ||
115 | } | ||
116 | |||
117 | out: | ||
118 | if (conn) | ||
119 | mutex_unlock(&conn->c_cm_lock); | ||
120 | |||
121 | rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); | ||
122 | |||
123 | return ret; | ||
124 | } | ||
125 | |||
126 | static int __init rds_rdma_listen_init(void) | ||
127 | { | ||
128 | struct sockaddr_in sin; | ||
129 | struct rdma_cm_id *cm_id; | ||
130 | int ret; | ||
131 | |||
132 | cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); | ||
133 | if (IS_ERR(cm_id)) { | ||
134 | ret = PTR_ERR(cm_id); | ||
135 | printk(KERN_ERR "RDS/IW: failed to setup listener, " | ||
136 | "rdma_create_id() returned %d\n", ret); | ||
137 | goto out; | ||
138 | } | ||
139 | |||
140 | sin.sin_family = AF_INET; | ||
141 | sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); | ||
142 | sin.sin_port = (__force u16)htons(RDS_PORT); | ||
143 | |||
144 | /* | ||
145 | * XXX I bet this binds the cm_id to a device. If we want to support | ||
146 | * fail-over we'll have to take this into consideration. | ||
147 | */ | ||
148 | ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); | ||
149 | if (ret) { | ||
150 | printk(KERN_ERR "RDS/IW: failed to setup listener, " | ||
151 | "rdma_bind_addr() returned %d\n", ret); | ||
152 | goto out; | ||
153 | } | ||
154 | |||
155 | ret = rdma_listen(cm_id, 128); | ||
156 | if (ret) { | ||
157 | printk(KERN_ERR "RDS/IW: failed to setup listener, " | ||
158 | "rdma_listen() returned %d\n", ret); | ||
159 | goto out; | ||
160 | } | ||
161 | |||
162 | rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); | ||
163 | |||
164 | rds_iw_listen_id = cm_id; | ||
165 | cm_id = NULL; | ||
166 | out: | ||
167 | if (cm_id) | ||
168 | rdma_destroy_id(cm_id); | ||
169 | return ret; | ||
170 | } | ||
171 | |||
172 | static void rds_rdma_listen_stop(void) | ||
173 | { | ||
174 | if (rds_iw_listen_id) { | ||
175 | rdsdebug("cm %p\n", rds_iw_listen_id); | ||
176 | rdma_destroy_id(rds_iw_listen_id); | ||
177 | rds_iw_listen_id = NULL; | ||
178 | } | ||
179 | } | ||
180 | |||
181 | int __init rds_rdma_init(void) | ||
182 | { | ||
183 | int ret; | ||
184 | |||
185 | ret = rds_rdma_listen_init(); | ||
186 | if (ret) | ||
187 | goto out; | ||
188 | |||
189 | ret = rds_iw_init(); | ||
190 | if (ret) | ||
191 | goto err_iw_init; | ||
192 | |||
193 | ret = rds_ib_init(); | ||
194 | if (ret) | ||
195 | goto err_ib_init; | ||
196 | |||
197 | goto out; | ||
198 | |||
199 | err_ib_init: | ||
200 | rds_iw_exit(); | ||
201 | err_iw_init: | ||
202 | rds_rdma_listen_stop(); | ||
203 | out: | ||
204 | return ret; | ||
205 | } | ||
206 | |||
207 | void rds_rdma_exit(void) | ||
208 | { | ||
209 | /* stop listening first to ensure no new connections are attempted */ | ||
210 | rds_rdma_listen_stop(); | ||
211 | rds_ib_exit(); | ||
212 | rds_iw_exit(); | ||
213 | } | ||
214 | |||
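rds_rdma_init() above brings the pieces up listener-first and unwinds them in reverse on failure, and rds_rdma_exit() stops the listener before tearing down the transports so no new connection can race with teardown. A compact sketch (not part of the patch) of the same goto-unwind shape, with stand-in functions for the listener and the two transports:

#include <stdio.h>

/* Stand-ins for rds_rdma_listen_init(), rds_iw_init(), rds_ib_init(), etc. */
static int listen_init(void)	{ puts("listen up");	return 0; }
static void listen_stop(void)	{ puts("listen down"); }
static int iw_init(void)	{ puts("iw up");	return 0; }
static void iw_exit(void)	{ puts("iw down"); }
static int ib_init(void)	{ puts("ib up");	return -1; }	/* simulate failure */

static int init(void)
{
	int ret;

	ret = listen_init();
	if (ret)
		goto out;
	ret = iw_init();
	if (ret)
		goto err_iw_init;
	ret = ib_init();
	if (ret)
		goto err_ib_init;
	goto out;

err_ib_init:
	iw_exit();		/* unwind in reverse order of bring-up */
err_iw_init:
	listen_stop();
out:
	return ret;
}

int main(void)
{
	/* prints the bring-up steps, then the unwind, then init() = -1 */
	printf("init() = %d\n", init());
	return 0;
}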
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h new file mode 100644 index 000000000000..2f2c7d976c21 --- /dev/null +++ b/net/rds/rdma_transport.h | |||
@@ -0,0 +1,28 @@ | |||
1 | #ifndef _RDMA_TRANSPORT_H | ||
2 | #define _RDMA_TRANSPORT_H | ||
3 | |||
4 | #include <rdma/ib_verbs.h> | ||
5 | #include <rdma/rdma_cm.h> | ||
6 | #include "rds.h" | ||
7 | |||
8 | #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 | ||
9 | |||
10 | int rds_rdma_conn_connect(struct rds_connection *conn); | ||
11 | int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | ||
12 | struct rdma_cm_event *event); | ||
13 | |||
14 | /* from rdma_transport.c */ | ||
15 | int rds_rdma_init(void); | ||
16 | void rds_rdma_exit(void); | ||
17 | |||
18 | /* from ib.c */ | ||
19 | extern struct rds_transport rds_ib_transport; | ||
20 | int rds_ib_init(void); | ||
21 | void rds_ib_exit(void); | ||
22 | |||
23 | /* from iw.c */ | ||
24 | extern struct rds_transport rds_iw_transport; | ||
25 | int rds_iw_init(void); | ||
26 | void rds_iw_exit(void); | ||
27 | |||
28 | #endif | ||
diff --git a/net/rds/rds.h b/net/rds/rds.h new file mode 100644 index 000000000000..060400704979 --- /dev/null +++ b/net/rds/rds.h | |||
@@ -0,0 +1,686 @@ | |||
1 | #ifndef _RDS_RDS_H | ||
2 | #define _RDS_RDS_H | ||
3 | |||
4 | #include <net/sock.h> | ||
5 | #include <linux/scatterlist.h> | ||
6 | #include <linux/highmem.h> | ||
7 | #include <rdma/rdma_cm.h> | ||
8 | #include <linux/mutex.h> | ||
9 | #include <linux/rds.h> | ||
10 | |||
11 | #include "info.h" | ||
12 | |||
13 | /* | ||
14 | * RDS Network protocol version | ||
15 | */ | ||
16 | #define RDS_PROTOCOL_3_0 0x0300 | ||
17 | #define RDS_PROTOCOL_3_1 0x0301 | ||
18 | #define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 | ||
19 | #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) | ||
20 | #define RDS_PROTOCOL_MINOR(v) ((v) & 255) | ||
21 | #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) | ||
22 | |||
23 | /* | ||
24 | * XXX randomly chosen, but at least seems to be unused: | ||
25 | * # 18464-18768 Unassigned | ||
26 | * We should do better. We want a reserved port to discourage unpriv'ed | ||
27 | * userspace from listening. | ||
28 | */ | ||
29 | #define RDS_PORT 18634 | ||
30 | |||
31 | #ifdef DEBUG | ||
32 | #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) | ||
33 | #else | ||
34 | /* sigh, pr_debug() causes unused variable warnings */ | ||
35 | static inline void __attribute__ ((format (printf, 1, 2))) | ||
36 | rdsdebug(char *fmt, ...) | ||
37 | { | ||
38 | } | ||
39 | #endif | ||
40 | |||
41 | /* XXX is there one of these somewhere? */ | ||
42 | #define ceil(x, y) \ | ||
43 | ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) | ||
44 | |||
45 | #define RDS_FRAG_SHIFT 12 | ||
46 | #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) | ||
47 | |||
48 | #define RDS_CONG_MAP_BYTES (65536 / 8) | ||
49 | #define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long)) | ||
50 | #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) | ||
51 | #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) | ||
52 | |||
53 | struct rds_cong_map { | ||
54 | struct rb_node m_rb_node; | ||
55 | __be32 m_addr; | ||
56 | wait_queue_head_t m_waitq; | ||
57 | struct list_head m_conn_list; | ||
58 | unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; | ||
59 | }; | ||
60 | |||
61 | |||
62 | /* | ||
63 | * This is how we will track the connection state: | ||
64 | * A connection is always in one of the following | ||
65 | * states. Updates to the state are atomic and imply | ||
66 | * a memory barrier. | ||
67 | */ | ||
68 | enum { | ||
69 | RDS_CONN_DOWN = 0, | ||
70 | RDS_CONN_CONNECTING, | ||
71 | RDS_CONN_DISCONNECTING, | ||
72 | RDS_CONN_UP, | ||
73 | RDS_CONN_ERROR, | ||
74 | }; | ||
75 | |||
76 | /* Bits for c_flags */ | ||
77 | #define RDS_LL_SEND_FULL 0 | ||
78 | #define RDS_RECONNECT_PENDING 1 | ||
79 | |||
80 | struct rds_connection { | ||
81 | struct hlist_node c_hash_node; | ||
82 | __be32 c_laddr; | ||
83 | __be32 c_faddr; | ||
84 | unsigned int c_loopback:1; | ||
85 | struct rds_connection *c_passive; | ||
86 | |||
87 | struct rds_cong_map *c_lcong; | ||
88 | struct rds_cong_map *c_fcong; | ||
89 | |||
90 | struct mutex c_send_lock; /* protect send ring */ | ||
91 | struct rds_message *c_xmit_rm; | ||
92 | unsigned long c_xmit_sg; | ||
93 | unsigned int c_xmit_hdr_off; | ||
94 | unsigned int c_xmit_data_off; | ||
95 | unsigned int c_xmit_rdma_sent; | ||
96 | |||
97 | spinlock_t c_lock; /* protect msg queues */ | ||
98 | u64 c_next_tx_seq; | ||
99 | struct list_head c_send_queue; | ||
100 | struct list_head c_retrans; | ||
101 | |||
102 | u64 c_next_rx_seq; | ||
103 | |||
104 | struct rds_transport *c_trans; | ||
105 | void *c_transport_data; | ||
106 | |||
107 | atomic_t c_state; | ||
108 | unsigned long c_flags; | ||
109 | unsigned long c_reconnect_jiffies; | ||
110 | struct delayed_work c_send_w; | ||
111 | struct delayed_work c_recv_w; | ||
112 | struct delayed_work c_conn_w; | ||
113 | struct work_struct c_down_w; | ||
114 | struct mutex c_cm_lock; /* protect conn state & cm */ | ||
115 | |||
116 | struct list_head c_map_item; | ||
117 | unsigned long c_map_queued; | ||
118 | unsigned long c_map_offset; | ||
119 | unsigned long c_map_bytes; | ||
120 | |||
121 | unsigned int c_unacked_packets; | ||
122 | unsigned int c_unacked_bytes; | ||
123 | |||
124 | /* Protocol version */ | ||
125 | unsigned int c_version; | ||
126 | }; | ||
127 | |||
128 | #define RDS_FLAG_CONG_BITMAP 0x01 | ||
129 | #define RDS_FLAG_ACK_REQUIRED 0x02 | ||
130 | #define RDS_FLAG_RETRANSMITTED 0x04 | ||
131 | #define RDS_MAX_ADV_CREDIT 127 | ||
132 | |||
133 | /* | ||
134 | * Maximum space available for extension headers. | ||
135 | */ | ||
136 | #define RDS_HEADER_EXT_SPACE 16 | ||
137 | |||
138 | struct rds_header { | ||
139 | __be64 h_sequence; | ||
140 | __be64 h_ack; | ||
141 | __be32 h_len; | ||
142 | __be16 h_sport; | ||
143 | __be16 h_dport; | ||
144 | u8 h_flags; | ||
145 | u8 h_credit; | ||
146 | u8 h_padding[4]; | ||
147 | __sum16 h_csum; | ||
148 | |||
149 | u8 h_exthdr[RDS_HEADER_EXT_SPACE]; | ||
150 | }; | ||
151 | |||
152 | /* | ||
153 | * Reserved - indicates end of extensions | ||
154 | */ | ||
155 | #define RDS_EXTHDR_NONE 0 | ||
156 | |||
157 | /* | ||
158 | * This extension header is included in the very | ||
159 | * first message that is sent on a new connection, | ||
160 | * and identifies the protocol level. This will help | ||
161 | * rolling updates if a future change requires breaking | ||
162 | * the protocol. | ||
163 | * NB: This is no longer true for IB, where we do a version | ||
164 | * negotiation during the connection setup phase (protocol | ||
165 | * version information is included in the RDMA CM private data). | ||
166 | */ | ||
167 | #define RDS_EXTHDR_VERSION 1 | ||
168 | struct rds_ext_header_version { | ||
169 | __be32 h_version; | ||
170 | }; | ||
171 | |||
172 | /* | ||
173 | * This extension header is included in the RDS message | ||
174 | * chasing an RDMA operation. | ||
175 | */ | ||
176 | #define RDS_EXTHDR_RDMA 2 | ||
177 | struct rds_ext_header_rdma { | ||
178 | __be32 h_rdma_rkey; | ||
179 | }; | ||
180 | |||
181 | /* | ||
182 | * This extension header tells the peer about the | ||
183 | * destination <R_Key,offset> of the requested RDMA | ||
184 | * operation. | ||
185 | */ | ||
186 | #define RDS_EXTHDR_RDMA_DEST 3 | ||
187 | struct rds_ext_header_rdma_dest { | ||
188 | __be32 h_rdma_rkey; | ||
189 | __be32 h_rdma_offset; | ||
190 | }; | ||
191 | |||
192 | #define __RDS_EXTHDR_MAX 16 /* for now */ | ||
193 | |||
194 | struct rds_incoming { | ||
195 | atomic_t i_refcount; | ||
196 | struct list_head i_item; | ||
197 | struct rds_connection *i_conn; | ||
198 | struct rds_header i_hdr; | ||
199 | unsigned long i_rx_jiffies; | ||
200 | __be32 i_saddr; | ||
201 | |||
202 | rds_rdma_cookie_t i_rdma_cookie; | ||
203 | }; | ||
204 | |||
205 | /* | ||
206 | * m_sock_item and m_conn_item are on lists that are serialized under | ||
207 | * conn->c_lock. m_sock_item has additional meaning in that once it is empty | ||
208 | * the message will not be put back on the retransmit list after being sent. | ||
209 | * messages that are canceled while being sent rely on this. | ||
210 | * | ||
211 | * m_inc is used by loopback so that it can pass an incoming message straight | ||
212 | * back up into the rx path. It embeds a wire header which is also used by | ||
213 | * the send path, which is kind of awkward. | ||
214 | * | ||
215 | * m_sock_item indicates the message's presence on a socket's send or receive | ||
216 | * queue. m_rs will point to that socket. | ||
217 | * | ||
218 | * m_daddr is used by cancellation to prune messages to a given destination. | ||
219 | * | ||
220 | * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock | ||
221 | * nesting. As paths iterate over messages on a sock, or conn, they must | ||
222 | * also lock the conn, or sock, to remove the message from those lists too. | ||
223 | * Testing the flag to determine if the message is still on the lists lets | ||
224 | * us avoid testing the list_head directly. That means each path can use | ||
225 | * the message's list_head to keep it on a local list while juggling locks | ||
226 | * without confusing the other path. | ||
227 | * | ||
228 | * m_ack_seq is an optional field set by transports who need a different | ||
229 | * sequence number range to invalidate. They can use this in a callback | ||
230 | * that they pass to rds_send_drop_acked() to see if each message has been | ||
231 | * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't | ||
232 | * had ack_seq set yet. | ||
233 | */ | ||
234 | #define RDS_MSG_ON_SOCK 1 | ||
235 | #define RDS_MSG_ON_CONN 2 | ||
236 | #define RDS_MSG_HAS_ACK_SEQ 3 | ||
237 | #define RDS_MSG_ACK_REQUIRED 4 | ||
238 | #define RDS_MSG_RETRANSMITTED 5 | ||
239 | #define RDS_MSG_MAPPED 6 | ||
240 | #define RDS_MSG_PAGEVEC 7 | ||
241 | |||
242 | struct rds_message { | ||
243 | atomic_t m_refcount; | ||
244 | struct list_head m_sock_item; | ||
245 | struct list_head m_conn_item; | ||
246 | struct rds_incoming m_inc; | ||
247 | u64 m_ack_seq; | ||
248 | __be32 m_daddr; | ||
249 | unsigned long m_flags; | ||
250 | |||
251 | /* Never access m_rs without holding m_rs_lock. | ||
252 | * Lock nesting is | ||
253 | * rm->m_rs_lock | ||
254 | * -> rs->rs_lock | ||
255 | */ | ||
256 | spinlock_t m_rs_lock; | ||
257 | struct rds_sock *m_rs; | ||
258 | struct rds_rdma_op *m_rdma_op; | ||
259 | rds_rdma_cookie_t m_rdma_cookie; | ||
260 | struct rds_mr *m_rdma_mr; | ||
261 | unsigned int m_nents; | ||
262 | unsigned int m_count; | ||
263 | struct scatterlist m_sg[0]; | ||
264 | }; | ||
265 | |||
266 | /* | ||
267 | * The RDS notifier is used (optionally) to tell the application about | ||
268 | * completed RDMA operations. Rather than keeping the whole rds message | ||
269 | * around on the queue, we allocate a small notifier that is put on the | ||
270 | * socket's notifier_list. Notifications are delivered to the application | ||
271 | * through control messages. | ||
272 | */ | ||
273 | struct rds_notifier { | ||
274 | struct list_head n_list; | ||
275 | uint64_t n_user_token; | ||
276 | int n_status; | ||
277 | }; | ||
278 | |||
279 | /** | ||
280 | * struct rds_transport - transport specific behavioural hooks | ||
281 | * | ||
282 | * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send | ||
283 | * part of a message. The caller serializes on the send_sem so this | ||
284 | * doesn't need to be reentrant for a given conn. The header must be | ||
285 | * sent before the data payload. .xmit must be prepared to send a | ||
286 | * message with no data payload. .xmit should return the number of | ||
287 | * bytes that were sent down the connection, including header bytes. | ||
288 | * Returning 0 tells the caller that it doesn't need to perform any | ||
289 | * additional work now. This is usually the case when the transport has | ||
290 | * filled the sending queue for its connection and will handle | ||
291 | * triggering the rds thread to continue the send when space becomes | ||
292 | * available. Returning -EAGAIN tells the caller to retry the send | ||
293 | * immediately. Returning -ENOMEM tells the caller to retry the send at | ||
294 | * some point in the future. | ||
295 | * | ||
296 | * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once | ||
297 | * it returns the connection can not call rds_recv_incoming(). | ||
298 | * This will only be called once after conn_connect returns | ||
299 | * non-zero success and will The caller serializes this with | ||
300 | * the send and connecting paths (xmit_* and conn_*). The | ||
301 | * transport is responsible for other serialization, including | ||
302 | * rds_recv_incoming(). This is called in process context but | ||
303 | * should try hard not to block. | ||
304 | * | ||
305 | * @xmit_cong_map: This asks the transport to send the local bitmap down the | ||
306 | * given connection. XXX get a better story about the bitmap | ||
307 | * flag and header. | ||
308 | */ | ||
309 | |||
310 | struct rds_transport { | ||
311 | char t_name[TRANSNAMSIZ]; | ||
312 | struct list_head t_item; | ||
313 | struct module *t_owner; | ||
314 | unsigned int t_prefer_loopback:1; | ||
315 | |||
316 | int (*laddr_check)(__be32 addr); | ||
317 | int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); | ||
318 | void (*conn_free)(void *data); | ||
319 | int (*conn_connect)(struct rds_connection *conn); | ||
320 | void (*conn_shutdown)(struct rds_connection *conn); | ||
321 | void (*xmit_prepare)(struct rds_connection *conn); | ||
322 | void (*xmit_complete)(struct rds_connection *conn); | ||
323 | int (*xmit)(struct rds_connection *conn, struct rds_message *rm, | ||
324 | unsigned int hdr_off, unsigned int sg, unsigned int off); | ||
325 | int (*xmit_cong_map)(struct rds_connection *conn, | ||
326 | struct rds_cong_map *map, unsigned long offset); | ||
327 | int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); | ||
328 | int (*recv)(struct rds_connection *conn); | ||
329 | int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, | ||
330 | size_t size); | ||
331 | void (*inc_purge)(struct rds_incoming *inc); | ||
332 | void (*inc_free)(struct rds_incoming *inc); | ||
333 | |||
334 | int (*cm_handle_connect)(struct rdma_cm_id *cm_id, | ||
335 | struct rdma_cm_event *event); | ||
336 | int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); | ||
337 | void (*cm_connect_complete)(struct rds_connection *conn, | ||
338 | struct rdma_cm_event *event); | ||
339 | |||
340 | unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, | ||
341 | unsigned int avail); | ||
342 | void (*exit)(void); | ||
343 | void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, | ||
344 | struct rds_sock *rs, u32 *key_ret); | ||
345 | void (*sync_mr)(void *trans_private, int direction); | ||
346 | void (*free_mr)(void *trans_private, int invalidate); | ||
347 | void (*flush_mrs)(void); | ||
348 | }; | ||
349 | |||
350 | struct rds_sock { | ||
351 | struct sock rs_sk; | ||
352 | |||
353 | u64 rs_user_addr; | ||
354 | u64 rs_user_bytes; | ||
355 | |||
356 | /* | ||
357 | * bound_addr used for both incoming and outgoing, no INADDR_ANY | ||
358 | * support. | ||
359 | */ | ||
360 | struct rb_node rs_bound_node; | ||
361 | __be32 rs_bound_addr; | ||
362 | __be32 rs_conn_addr; | ||
363 | __be16 rs_bound_port; | ||
364 | __be16 rs_conn_port; | ||
365 | |||
366 | /* | ||
367 | * This is only used to communicate the transport between bind and | ||
368 | * initiating connections. All other trans use is referenced through | ||
369 | * the connection. | ||
370 | */ | ||
371 | struct rds_transport *rs_transport; | ||
372 | |||
373 | /* | ||
374 | * rds_sendmsg caches the conn it used the last time around. | ||
375 | * This helps avoid costly lookups. | ||
376 | */ | ||
377 | struct rds_connection *rs_conn; | ||
378 | |||
379 | /* flag indicating we were congested or not */ | ||
380 | int rs_congested; | ||
381 | |||
382 | /* rs_lock protects all these adjacent members before the newline */ | ||
383 | spinlock_t rs_lock; | ||
384 | struct list_head rs_send_queue; | ||
385 | u32 rs_snd_bytes; | ||
386 | int rs_rcv_bytes; | ||
387 | struct list_head rs_notify_queue; /* currently used for failed RDMAs */ | ||
388 | |||
389 | /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask | ||
390 | * to decide whether the application should be woken up. | ||
391 | * If not set, we use rs_cong_track to find out whether a cong map | ||
392 | * update arrived. | ||
393 | */ | ||
394 | uint64_t rs_cong_mask; | ||
395 | uint64_t rs_cong_notify; | ||
396 | struct list_head rs_cong_list; | ||
397 | unsigned long rs_cong_track; | ||
398 | |||
399 | /* | ||
400 | * rs_recv_lock protects the receive queue, and is | ||
401 | * used to serialize with rds_release. | ||
402 | */ | ||
403 | rwlock_t rs_recv_lock; | ||
404 | struct list_head rs_recv_queue; | ||
405 | |||
406 | /* just for stats reporting */ | ||
407 | struct list_head rs_item; | ||
408 | |||
409 | /* these have their own lock */ | ||
410 | spinlock_t rs_rdma_lock; | ||
411 | struct rb_root rs_rdma_keys; | ||
412 | |||
413 | /* Socket options - in case there will be more */ | ||
414 | unsigned char rs_recverr, | ||
415 | rs_cong_monitor; | ||
416 | }; | ||
417 | |||
418 | static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) | ||
419 | { | ||
420 | return container_of(sk, struct rds_sock, rs_sk); | ||
421 | } | ||
422 | static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) | ||
423 | { | ||
424 | return &rs->rs_sk; | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value | ||
429 | * to account for overhead. We don't account for overhead, we just apply | ||
430 | * the number of payload bytes to the specified value. | ||
431 | */ | ||
432 | static inline int rds_sk_sndbuf(struct rds_sock *rs) | ||
433 | { | ||
434 | return rds_rs_to_sk(rs)->sk_sndbuf / 2; | ||
435 | } | ||
436 | static inline int rds_sk_rcvbuf(struct rds_sock *rs) | ||
437 | { | ||
438 | return rds_rs_to_sk(rs)->sk_rcvbuf / 2; | ||
439 | } | ||
440 | |||
441 | struct rds_statistics { | ||
442 | uint64_t s_conn_reset; | ||
443 | uint64_t s_recv_drop_bad_checksum; | ||
444 | uint64_t s_recv_drop_old_seq; | ||
445 | uint64_t s_recv_drop_no_sock; | ||
446 | uint64_t s_recv_drop_dead_sock; | ||
447 | uint64_t s_recv_deliver_raced; | ||
448 | uint64_t s_recv_delivered; | ||
449 | uint64_t s_recv_queued; | ||
450 | uint64_t s_recv_immediate_retry; | ||
451 | uint64_t s_recv_delayed_retry; | ||
452 | uint64_t s_recv_ack_required; | ||
453 | uint64_t s_recv_rdma_bytes; | ||
454 | uint64_t s_recv_ping; | ||
455 | uint64_t s_send_queue_empty; | ||
456 | uint64_t s_send_queue_full; | ||
457 | uint64_t s_send_sem_contention; | ||
458 | uint64_t s_send_sem_queue_raced; | ||
459 | uint64_t s_send_immediate_retry; | ||
460 | uint64_t s_send_delayed_retry; | ||
461 | uint64_t s_send_drop_acked; | ||
462 | uint64_t s_send_ack_required; | ||
463 | uint64_t s_send_queued; | ||
464 | uint64_t s_send_rdma; | ||
465 | uint64_t s_send_rdma_bytes; | ||
466 | uint64_t s_send_pong; | ||
467 | uint64_t s_page_remainder_hit; | ||
468 | uint64_t s_page_remainder_miss; | ||
469 | uint64_t s_copy_to_user; | ||
470 | uint64_t s_copy_from_user; | ||
471 | uint64_t s_cong_update_queued; | ||
472 | uint64_t s_cong_update_received; | ||
473 | uint64_t s_cong_send_error; | ||
474 | uint64_t s_cong_send_blocked; | ||
475 | }; | ||
476 | |||
477 | /* af_rds.c */ | ||
478 | void rds_sock_addref(struct rds_sock *rs); | ||
479 | void rds_sock_put(struct rds_sock *rs); | ||
480 | void rds_wake_sk_sleep(struct rds_sock *rs); | ||
481 | static inline void __rds_wake_sk_sleep(struct sock *sk) | ||
482 | { | ||
483 | wait_queue_head_t *waitq = sk->sk_sleep; | ||
484 | |||
485 | if (!sock_flag(sk, SOCK_DEAD) && waitq) | ||
486 | wake_up(waitq); | ||
487 | } | ||
488 | extern wait_queue_head_t rds_poll_waitq; | ||
489 | |||
490 | |||
491 | /* bind.c */ | ||
492 | int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); | ||
493 | void rds_remove_bound(struct rds_sock *rs); | ||
494 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port); | ||
495 | |||
496 | /* cong.c */ | ||
497 | int rds_cong_get_maps(struct rds_connection *conn); | ||
498 | void rds_cong_add_conn(struct rds_connection *conn); | ||
499 | void rds_cong_remove_conn(struct rds_connection *conn); | ||
500 | void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); | ||
501 | void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); | ||
502 | int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); | ||
503 | void rds_cong_queue_updates(struct rds_cong_map *map); | ||
504 | void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); | ||
505 | int rds_cong_updated_since(unsigned long *recent); | ||
506 | void rds_cong_add_socket(struct rds_sock *); | ||
507 | void rds_cong_remove_socket(struct rds_sock *); | ||
508 | void rds_cong_exit(void); | ||
509 | struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); | ||
510 | |||
511 | /* conn.c */ | ||
512 | int __init rds_conn_init(void); | ||
513 | void rds_conn_exit(void); | ||
514 | struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, | ||
515 | struct rds_transport *trans, gfp_t gfp); | ||
516 | struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, | ||
517 | struct rds_transport *trans, gfp_t gfp); | ||
518 | void rds_conn_destroy(struct rds_connection *conn); | ||
519 | void rds_conn_reset(struct rds_connection *conn); | ||
520 | void rds_conn_drop(struct rds_connection *conn); | ||
521 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, | ||
522 | struct rds_info_iterator *iter, | ||
523 | struct rds_info_lengths *lens, | ||
524 | int (*visitor)(struct rds_connection *, void *), | ||
525 | size_t item_len); | ||
526 | void __rds_conn_error(struct rds_connection *conn, const char *, ...) | ||
527 | __attribute__ ((format (printf, 2, 3))); | ||
528 | #define rds_conn_error(conn, fmt...) \ | ||
529 | __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) | ||
530 | |||
531 | static inline int | ||
532 | rds_conn_transition(struct rds_connection *conn, int old, int new) | ||
533 | { | ||
534 | return atomic_cmpxchg(&conn->c_state, old, new) == old; | ||
535 | } | ||
536 | |||
537 | static inline int | ||
538 | rds_conn_state(struct rds_connection *conn) | ||
539 | { | ||
540 | return atomic_read(&conn->c_state); | ||
541 | } | ||
542 | |||
543 | static inline int | ||
544 | rds_conn_up(struct rds_connection *conn) | ||
545 | { | ||
546 | return atomic_read(&conn->c_state) == RDS_CONN_UP; | ||
547 | } | ||
548 | |||
549 | static inline int | ||
550 | rds_conn_connecting(struct rds_connection *conn) | ||
551 | { | ||
552 | return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; | ||
553 | } | ||
554 | |||
555 | /* message.c */ | ||
556 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); | ||
557 | struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | ||
558 | size_t total_len); | ||
559 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); | ||
560 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, | ||
561 | __be16 dport, u64 seq); | ||
562 | int rds_message_add_extension(struct rds_header *hdr, | ||
563 | unsigned int type, const void *data, unsigned int len); | ||
564 | int rds_message_next_extension(struct rds_header *hdr, | ||
565 | unsigned int *pos, void *buf, unsigned int *buflen); | ||
566 | int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version); | ||
567 | int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version); | ||
568 | int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); | ||
569 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, | ||
570 | struct iovec *first_iov, size_t size); | ||
571 | void rds_message_inc_purge(struct rds_incoming *inc); | ||
572 | void rds_message_inc_free(struct rds_incoming *inc); | ||
573 | void rds_message_addref(struct rds_message *rm); | ||
574 | void rds_message_put(struct rds_message *rm); | ||
575 | void rds_message_wait(struct rds_message *rm); | ||
576 | void rds_message_unmapped(struct rds_message *rm); | ||
577 | |||
578 | static inline void rds_message_make_checksum(struct rds_header *hdr) | ||
579 | { | ||
580 | hdr->h_csum = 0; | ||
581 | hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); | ||
582 | } | ||
583 | |||
584 | static inline int rds_message_verify_checksum(const struct rds_header *hdr) | ||
585 | { | ||
586 | return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; | ||
587 | } | ||
588 | |||
589 | |||
590 | /* page.c */ | ||
591 | int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, | ||
592 | gfp_t gfp); | ||
593 | int rds_page_copy_user(struct page *page, unsigned long offset, | ||
594 | void __user *ptr, unsigned long bytes, | ||
595 | int to_user); | ||
596 | #define rds_page_copy_to_user(page, offset, ptr, bytes) \ | ||
597 | rds_page_copy_user(page, offset, ptr, bytes, 1) | ||
598 | #define rds_page_copy_from_user(page, offset, ptr, bytes) \ | ||
599 | rds_page_copy_user(page, offset, ptr, bytes, 0) | ||
600 | void rds_page_exit(void); | ||
601 | |||
602 | /* recv.c */ | ||
603 | void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, | ||
604 | __be32 saddr); | ||
605 | void rds_inc_addref(struct rds_incoming *inc); | ||
606 | void rds_inc_put(struct rds_incoming *inc); | ||
607 | void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, | ||
608 | struct rds_incoming *inc, gfp_t gfp, enum km_type km); | ||
609 | int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | ||
610 | size_t size, int msg_flags); | ||
611 | void rds_clear_recv_queue(struct rds_sock *rs); | ||
612 | int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); | ||
613 | void rds_inc_info_copy(struct rds_incoming *inc, | ||
614 | struct rds_info_iterator *iter, | ||
615 | __be32 saddr, __be32 daddr, int flip); | ||
616 | |||
617 | /* send.c */ | ||
618 | int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | ||
619 | size_t payload_len); | ||
620 | void rds_send_reset(struct rds_connection *conn); | ||
621 | int rds_send_xmit(struct rds_connection *conn); | ||
622 | struct sockaddr_in; | ||
623 | void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); | ||
624 | typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); | ||
625 | void rds_send_drop_acked(struct rds_connection *conn, u64 ack, | ||
626 | is_acked_func is_acked); | ||
627 | int rds_send_acked_before(struct rds_connection *conn, u64 seq); | ||
628 | void rds_send_remove_from_sock(struct list_head *messages, int status); | ||
629 | int rds_send_pong(struct rds_connection *conn, __be16 dport); | ||
630 | struct rds_message *rds_send_get_message(struct rds_connection *, | ||
631 | struct rds_rdma_op *); | ||
632 | |||
633 | /* rdma.c */ | ||
634 | void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); | ||
635 | |||
636 | /* stats.c */ | ||
637 | DECLARE_PER_CPU(struct rds_statistics, rds_stats); | ||
638 | #define rds_stats_inc_which(which, member) do { \ | ||
639 | per_cpu(which, get_cpu()).member++; \ | ||
640 | put_cpu(); \ | ||
641 | } while (0) | ||
642 | #define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) | ||
643 | #define rds_stats_add_which(which, member, count) do { \ | ||
644 | per_cpu(which, get_cpu()).member += count; \ | ||
645 | put_cpu(); \ | ||
646 | } while (0) | ||
647 | #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) | ||
648 | int __init rds_stats_init(void); | ||
649 | void rds_stats_exit(void); | ||
650 | void rds_stats_info_copy(struct rds_info_iterator *iter, | ||
651 | uint64_t *values, char **names, size_t nr); | ||
652 | |||
653 | /* sysctl.c */ | ||
654 | int __init rds_sysctl_init(void); | ||
655 | void rds_sysctl_exit(void); | ||
656 | extern unsigned long rds_sysctl_sndbuf_min; | ||
657 | extern unsigned long rds_sysctl_sndbuf_default; | ||
658 | extern unsigned long rds_sysctl_sndbuf_max; | ||
659 | extern unsigned long rds_sysctl_reconnect_min_jiffies; | ||
660 | extern unsigned long rds_sysctl_reconnect_max_jiffies; | ||
661 | extern unsigned int rds_sysctl_max_unacked_packets; | ||
662 | extern unsigned int rds_sysctl_max_unacked_bytes; | ||
663 | extern unsigned int rds_sysctl_ping_enable; | ||
664 | extern unsigned long rds_sysctl_trace_flags; | ||
665 | extern unsigned int rds_sysctl_trace_level; | ||
666 | |||
667 | /* threads.c */ | ||
668 | int __init rds_threads_init(void); | ||
669 | void rds_threads_exit(void); | ||
670 | extern struct workqueue_struct *rds_wq; | ||
671 | void rds_connect_worker(struct work_struct *); | ||
672 | void rds_shutdown_worker(struct work_struct *); | ||
673 | void rds_send_worker(struct work_struct *); | ||
674 | void rds_recv_worker(struct work_struct *); | ||
675 | void rds_connect_complete(struct rds_connection *conn); | ||
676 | |||
677 | /* transport.c */ | ||
678 | int rds_trans_register(struct rds_transport *trans); | ||
679 | void rds_trans_unregister(struct rds_transport *trans); | ||
680 | struct rds_transport *rds_trans_get_preferred(__be32 addr); | ||
681 | unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, | ||
682 | unsigned int avail); | ||
683 | int __init rds_trans_init(void); | ||
684 | void rds_trans_exit(void); | ||
685 | |||
686 | #endif | ||
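A quick check (not part of the patch) of the version-macro encoding near the top of rds.h: the major number lives in the high byte and the minor in the low byte, so RDS_PROTOCOL(3, 1) yields 0x0301, i.e. RDS_PROTOCOL_3_1.

#include <stdio.h>

/* Copied from the definitions above for a standalone demonstration. */
#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | min)

int main(void)
{
	unsigned int v = RDS_PROTOCOL(3, 1);

	/* prints: version 0x0301 = 3.1 */
	printf("version 0x%04x = %u.%u\n", v,
	       RDS_PROTOCOL_MAJOR(v), RDS_PROTOCOL_MINOR(v));
	return 0;
}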
diff --git a/net/rds/recv.c b/net/rds/recv.c new file mode 100644 index 000000000000..f2118c51cfa3 --- /dev/null +++ b/net/rds/recv.c | |||
@@ -0,0 +1,542 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <net/sock.h> | ||
35 | #include <linux/in.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "rdma.h" | ||
39 | |||
40 | void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, | ||
41 | __be32 saddr) | ||
42 | { | ||
43 | atomic_set(&inc->i_refcount, 1); | ||
44 | INIT_LIST_HEAD(&inc->i_item); | ||
45 | inc->i_conn = conn; | ||
46 | inc->i_saddr = saddr; | ||
47 | inc->i_rdma_cookie = 0; | ||
48 | } | ||
49 | |||
50 | void rds_inc_addref(struct rds_incoming *inc) | ||
51 | { | ||
52 | rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); | ||
53 | atomic_inc(&inc->i_refcount); | ||
54 | } | ||
55 | |||
56 | void rds_inc_put(struct rds_incoming *inc) | ||
57 | { | ||
58 | rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); | ||
59 | if (atomic_dec_and_test(&inc->i_refcount)) { | ||
60 | BUG_ON(!list_empty(&inc->i_item)); | ||
61 | |||
62 | inc->i_conn->c_trans->inc_free(inc); | ||
63 | } | ||
64 | } | ||
65 | |||
66 | static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, | ||
67 | struct rds_cong_map *map, | ||
68 | int delta, __be16 port) | ||
69 | { | ||
70 | int now_congested; | ||
71 | |||
72 | if (delta == 0) | ||
73 | return; | ||
74 | |||
75 | rs->rs_rcv_bytes += delta; | ||
76 | now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); | ||
77 | |||
78 | rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " | ||
79 | "now_cong %d delta %d\n", | ||
80 | rs, &rs->rs_bound_addr, | ||
81 | ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, | ||
82 | rds_sk_rcvbuf(rs), now_congested, delta); | ||
83 | |||
84 | /* wasn't -> am congested */ | ||
85 | if (!rs->rs_congested && now_congested) { | ||
86 | rs->rs_congested = 1; | ||
87 | rds_cong_set_bit(map, port); | ||
88 | rds_cong_queue_updates(map); | ||
89 | } | ||
90 | /* was -> aren't congested */ | ||
91 | /* Require more free space before reporting uncongested to prevent | ||
92 | * bouncing cong/uncong state too often */ | ||
93 | else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { | ||
94 | rs->rs_congested = 0; | ||
95 | rds_cong_clear_bit(map, port); | ||
96 | rds_cong_queue_updates(map); | ||
97 | } | ||
98 | |||
99 | /* do nothing if no change in cong state */ | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * Process all extension headers that come with this message. | ||
104 | */ | ||
105 | static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) | ||
106 | { | ||
107 | struct rds_header *hdr = &inc->i_hdr; | ||
108 | unsigned int pos = 0, type, len; | ||
109 | union { | ||
110 | struct rds_ext_header_version version; | ||
111 | struct rds_ext_header_rdma rdma; | ||
112 | struct rds_ext_header_rdma_dest rdma_dest; | ||
113 | } buffer; | ||
114 | |||
115 | while (1) { | ||
116 | len = sizeof(buffer); | ||
117 | type = rds_message_next_extension(hdr, &pos, &buffer, &len); | ||
118 | if (type == RDS_EXTHDR_NONE) | ||
119 | break; | ||
120 | /* Process extension header here */ | ||
121 | switch (type) { | ||
122 | case RDS_EXTHDR_RDMA: | ||
123 | rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); | ||
124 | break; | ||
125 | |||
126 | case RDS_EXTHDR_RDMA_DEST: | ||
127 | /* We ignore the size for now. We could stash it | ||
128 | * somewhere and use it for error checking. */ | ||
129 | inc->i_rdma_cookie = rds_rdma_make_cookie( | ||
130 | be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), | ||
131 | be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); | ||
132 | |||
133 | break; | ||
134 | } | ||
135 | } | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * The transport must make sure that this is serialized against other | ||
140 | * rx and conn reset on this specific conn. | ||
141 | * | ||
142 | * We currently assert that only one fragmented message will be sent | ||
143 | * down a connection at a time. This lets us reassemble in the conn | ||
144 | * instead of per-flow which means that we don't have to go digging through | ||
145 | * flows to tear down partial reassembly progress on conn failure and | ||
146 | * we save flow lookup and locking for each frag arrival. It does mean | ||
147 | * that small messages will wait behind large ones. Fragmenting at all | ||
148 | * is only to reduce the memory consumption of pre-posted buffers. | ||
149 | * | ||
150 | * The caller passes in saddr and daddr instead of us getting it from the | ||
151 | * conn. This lets loopback, which only has one conn for both directions, | ||
152 | * tell us which roles the addrs in the conn are playing for this message. | ||
153 | */ | ||
154 | void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, | ||
155 | struct rds_incoming *inc, gfp_t gfp, enum km_type km) | ||
156 | { | ||
157 | struct rds_sock *rs = NULL; | ||
158 | struct sock *sk; | ||
159 | unsigned long flags; | ||
160 | |||
161 | inc->i_conn = conn; | ||
162 | inc->i_rx_jiffies = jiffies; | ||
163 | |||
164 | rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " | ||
165 | "flags 0x%x rx_jiffies %lu\n", conn, | ||
166 | (unsigned long long)conn->c_next_rx_seq, | ||
167 | inc, | ||
168 | (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), | ||
169 | be32_to_cpu(inc->i_hdr.h_len), | ||
170 | be16_to_cpu(inc->i_hdr.h_sport), | ||
171 | be16_to_cpu(inc->i_hdr.h_dport), | ||
172 | inc->i_hdr.h_flags, | ||
173 | inc->i_rx_jiffies); | ||
174 | |||
175 | /* | ||
176 | * Sequence numbers should only increase. Messages get their | ||
177 | * sequence number as they're queued in a sending conn. They | ||
178 | * can be dropped, though, if the sending socket is closed before | ||
179 | * they hit the wire. So sequence numbers can skip forward | ||
180 | * under normal operation. They can also drop back in the conn | ||
181 | * failover case as previously sent messages are resent down the | ||
182 | * new instance of a conn. We drop those, otherwise we have | ||
183 | * to assume that the next valid seq does not come after a | ||
184 | * hole in the fragment stream. | ||
185 | * | ||
186 | * The headers don't give us a way to realize if fragments of | ||
187 | * a message have been dropped. We assume that frags that arrive | ||
188 | * to a flow are part of the current message on the flow that is | ||
189 | * being reassembled. This means that senders can't drop messages | ||
190 | * from the sending conn until all their frags are sent. | ||
191 | * | ||
192 | * XXX we could spend more on the wire to get more robust failure | ||
193 | * detection, arguably worth it to avoid data corruption. | ||
194 | */ | ||
195 | if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq | ||
196 | && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { | ||
197 | rds_stats_inc(s_recv_drop_old_seq); | ||
198 | goto out; | ||
199 | } | ||
200 | conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; | ||
201 | |||
202 | if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { | ||
203 | rds_stats_inc(s_recv_ping); | ||
204 | rds_send_pong(conn, inc->i_hdr.h_sport); | ||
205 | goto out; | ||
206 | } | ||
207 | |||
208 | rs = rds_find_bound(daddr, inc->i_hdr.h_dport); | ||
209 | if (rs == NULL) { | ||
210 | rds_stats_inc(s_recv_drop_no_sock); | ||
211 | goto out; | ||
212 | } | ||
213 | |||
214 | /* Process extension headers */ | ||
215 | rds_recv_incoming_exthdrs(inc, rs); | ||
216 | |||
217 | /* We can be racing with rds_release() which marks the socket dead. */ | ||
218 | sk = rds_rs_to_sk(rs); | ||
219 | |||
220 | /* serialize with rds_release -> sock_orphan */ | ||
221 | write_lock_irqsave(&rs->rs_recv_lock, flags); | ||
222 | if (!sock_flag(sk, SOCK_DEAD)) { | ||
223 | rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); | ||
224 | rds_stats_inc(s_recv_queued); | ||
225 | rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, | ||
226 | be32_to_cpu(inc->i_hdr.h_len), | ||
227 | inc->i_hdr.h_dport); | ||
228 | rds_inc_addref(inc); | ||
229 | list_add_tail(&inc->i_item, &rs->rs_recv_queue); | ||
230 | __rds_wake_sk_sleep(sk); | ||
231 | } else { | ||
232 | rds_stats_inc(s_recv_drop_dead_sock); | ||
233 | } | ||
234 | write_unlock_irqrestore(&rs->rs_recv_lock, flags); | ||
235 | |||
236 | out: | ||
237 | if (rs) | ||
238 | rds_sock_put(rs); | ||
239 | } | ||
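The sequence handling above reduces to a small acceptance test: retransmitted messages older than c_next_rx_seq are dropped, everything else is delivered and advances the expected sequence (skipping forward over messages the sender gave up on). A standalone sketch, using plain integers in place of the wire-endian header fields:

#include <stdio.h>
#include <stdint.h>

/* Returns 1 if the message should be delivered, 0 if it is a stale
 * retransmission, advancing *next_rx_seq the same way rds_recv_incoming()
 * does. */
static int accept_seq(uint64_t *next_rx_seq, uint64_t seq, int retransmitted)
{
	if (seq < *next_rx_seq && retransmitted)
		return 0;		/* counted as s_recv_drop_old_seq */
	*next_rx_seq = seq + 1;
	return 1;
}

int main(void)
{
	uint64_t next = 0;

	printf("%d\n", accept_seq(&next, 0, 0));	/* 1: in order     */
	printf("%d\n", accept_seq(&next, 5, 0));	/* 1: skip forward */
	printf("%d\n", accept_seq(&next, 3, 1));	/* 0: old retrans  */
	printf("%llu\n", (unsigned long long)next);	/* 6               */
	return 0;
}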
240 | |||
241 | /* | ||
242 | * Be very careful here. This is being called as the condition in | ||
243 | * wait_event_*() and so needs to cope with being called many times. | ||
244 | */ | ||
245 | static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) | ||
246 | { | ||
247 | unsigned long flags; | ||
248 | |||
249 | if (*inc == NULL) { | ||
250 | read_lock_irqsave(&rs->rs_recv_lock, flags); | ||
251 | if (!list_empty(&rs->rs_recv_queue)) { | ||
252 | *inc = list_entry(rs->rs_recv_queue.next, | ||
253 | struct rds_incoming, | ||
254 | i_item); | ||
255 | rds_inc_addref(*inc); | ||
256 | } | ||
257 | read_unlock_irqrestore(&rs->rs_recv_lock, flags); | ||
258 | } | ||
259 | |||
260 | return *inc != NULL; | ||
261 | } | ||
262 | |||
263 | static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, | ||
264 | int drop) | ||
265 | { | ||
266 | struct sock *sk = rds_rs_to_sk(rs); | ||
267 | int ret = 0; | ||
268 | unsigned long flags; | ||
269 | |||
270 | write_lock_irqsave(&rs->rs_recv_lock, flags); | ||
271 | if (!list_empty(&inc->i_item)) { | ||
272 | ret = 1; | ||
273 | if (drop) { | ||
274 | /* XXX make sure this i_conn is reliable */ | ||
275 | rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, | ||
276 | -be32_to_cpu(inc->i_hdr.h_len), | ||
277 | inc->i_hdr.h_dport); | ||
278 | list_del_init(&inc->i_item); | ||
279 | rds_inc_put(inc); | ||
280 | } | ||
281 | } | ||
282 | write_unlock_irqrestore(&rs->rs_recv_lock, flags); | ||
283 | |||
284 | rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Pull errors off the error queue. | ||
290 | * If msghdr is NULL, we will just purge the error queue. | ||
291 | */ | ||
292 | int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) | ||
293 | { | ||
294 | struct rds_notifier *notifier; | ||
295 | struct rds_rdma_notify cmsg; | ||
296 | unsigned int count = 0, max_messages = ~0U; | ||
297 | unsigned long flags; | ||
298 | LIST_HEAD(copy); | ||
299 | int err = 0; | ||
300 | |||
301 | |||
302 | /* put_cmsg copies to user space and thus may sleep. We can't do this | ||
303 | * with rs_lock held, so first grab as many notifications as we can stuff | ||
304 | * in the user provided cmsg buffer. We don't try to copy more, to avoid | ||
305 | * losing notifications - except when the buffer is so small that it wouldn't | ||
306 | * even hold a single notification. Then we give the caller as much of this single | ||
307 | * msg as we can squeeze in, and set MSG_CTRUNC. | ||
308 | */ | ||
309 | if (msghdr) { | ||
310 | max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); | ||
311 | if (!max_messages) | ||
312 | max_messages = 1; | ||
313 | } | ||
314 | |||
315 | spin_lock_irqsave(&rs->rs_lock, flags); | ||
316 | while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { | ||
317 | notifier = list_entry(rs->rs_notify_queue.next, | ||
318 | struct rds_notifier, n_list); | ||
319 | list_move(¬ifier->n_list, ©); | ||
320 | count++; | ||
321 | } | ||
322 | spin_unlock_irqrestore(&rs->rs_lock, flags); | ||
323 | |||
324 | if (!count) | ||
325 | return 0; | ||
326 | |||
327 | while (!list_empty(©)) { | ||
328 | notifier = list_entry(copy.next, struct rds_notifier, n_list); | ||
329 | |||
330 | if (msghdr) { | ||
331 | cmsg.user_token = notifier->n_user_token; | ||
332 | cmsg.status = notifier->n_status; | ||
333 | |||
334 | err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, | ||
335 | sizeof(cmsg), &cmsg); | ||
336 | if (err) | ||
337 | break; | ||
338 | } | ||
339 | |||
340 | list_del_init(¬ifier->n_list); | ||
341 | kfree(notifier); | ||
342 | } | ||
343 | |||
344 | /* If we bailed out because of an error in put_cmsg, | ||
345 | * we may be left with one or more notifications that we | ||
346 | * didn't process. Return them to the head of the list. */ | ||
347 | if (!list_empty(©)) { | ||
348 | spin_lock_irqsave(&rs->rs_lock, flags); | ||
349 | list_splice(©, &rs->rs_notify_queue); | ||
350 | spin_unlock_irqrestore(&rs->rs_lock, flags); | ||
351 | } | ||
352 | |||
353 | return err; | ||
354 | } | ||
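From userspace these notifications arrive as RDS_CMSG_RDMA_STATUS control messages on a recvmsg() call, and msg_controllen bounds how many are returned per call exactly as described above. A hedged sketch of the draining loop; the constants and struct rds_rdma_notify (user_token, status) are assumed to come from an RDS userspace header, and the include path below is an assumption:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rds.h>	/* SOL_RDS, RDS_CMSG_RDMA_STATUS, struct rds_rdma_notify;
			 * assumed location of the userspace definitions */

/* Drain pending RDMA completion notifications from an already-open RDS
 * socket. A zero-length data buffer is fine: only the cmsgs matter here. */
static void drain_rdma_notifications(int fd)
{
	char cbuf[1024];
	struct msghdr msg;
	struct cmsghdr *cmsg;
	struct rds_rdma_notify note;

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);	/* bounds max_messages above */

	if (recvmsg(fd, &msg, MSG_DONTWAIT) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level != SOL_RDS ||
		    cmsg->cmsg_type != RDS_CMSG_RDMA_STATUS)
			continue;
		memcpy(&note, CMSG_DATA(cmsg), sizeof(note));
		printf("rdma op token %llu finished with status %d\n",
		       (unsigned long long)note.user_token, (int)note.status);
	}

	if (msg.msg_flags & MSG_CTRUNC)
		printf("control buffer too small, notifications truncated\n");
}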
355 | |||
356 | /* | ||
357 | * Queue a congestion notification | ||
358 | */ | ||
359 | static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) | ||
360 | { | ||
361 | uint64_t notify = rs->rs_cong_notify; | ||
362 | unsigned long flags; | ||
363 | int err; | ||
364 | |||
365 | err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, | ||
366 | sizeof(notify), ¬ify); | ||
367 | if (err) | ||
368 | return err; | ||
369 | |||
370 | spin_lock_irqsave(&rs->rs_lock, flags); | ||
371 | rs->rs_cong_notify &= ~notify; | ||
372 | spin_unlock_irqrestore(&rs->rs_lock, flags); | ||
373 | |||
374 | return 0; | ||
375 | } | ||
376 | |||
377 | /* | ||
378 | * Receive any control messages. | ||
379 | */ | ||
380 | static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) | ||
381 | { | ||
382 | int ret = 0; | ||
383 | |||
384 | if (inc->i_rdma_cookie) { | ||
385 | ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, | ||
386 | sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); | ||
387 | if (ret) | ||
388 | return ret; | ||
389 | } | ||
390 | |||
391 | return 0; | ||
392 | } | ||
393 | |||
394 | int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | ||
395 | size_t size, int msg_flags) | ||
396 | { | ||
397 | struct sock *sk = sock->sk; | ||
398 | struct rds_sock *rs = rds_sk_to_rs(sk); | ||
399 | long timeo; | ||
400 | int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; | ||
401 | struct sockaddr_in *sin; | ||
402 | struct rds_incoming *inc = NULL; | ||
403 | |||
404 | /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ | ||
405 | timeo = sock_rcvtimeo(sk, nonblock); | ||
406 | |||
407 | rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); | ||
408 | |||
409 | if (msg_flags & MSG_OOB) | ||
410 | goto out; | ||
411 | |||
412 | /* If there are pending notifications, do those - and nothing else */ | ||
413 | if (!list_empty(&rs->rs_notify_queue)) { | ||
414 | ret = rds_notify_queue_get(rs, msg); | ||
415 | goto out; | ||
416 | } | ||
417 | |||
418 | if (rs->rs_cong_notify) { | ||
419 | ret = rds_notify_cong(rs, msg); | ||
420 | goto out; | ||
421 | } | ||
422 | |||
423 | while (1) { | ||
424 | if (!rds_next_incoming(rs, &inc)) { | ||
425 | if (nonblock) { | ||
426 | ret = -EAGAIN; | ||
427 | break; | ||
428 | } | ||
429 | |||
430 | timeo = wait_event_interruptible_timeout(*sk->sk_sleep, | ||
431 | rds_next_incoming(rs, &inc), | ||
432 | timeo); | ||
433 | rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, | ||
434 | timeo); | ||
435 | if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) | ||
436 | continue; | ||
437 | |||
438 | ret = timeo; | ||
439 | if (ret == 0) | ||
440 | ret = -ETIMEDOUT; | ||
441 | break; | ||
442 | } | ||
443 | |||
444 | rdsdebug("copying inc %p from %pI4:%u to user\n", inc, | ||
445 | &inc->i_conn->c_faddr, | ||
446 | ntohs(inc->i_hdr.h_sport)); | ||
447 | ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov, | ||
448 | size); | ||
449 | if (ret < 0) | ||
450 | break; | ||
451 | |||
452 | /* | ||
453 | * if the message we just copied isn't at the head of the | ||
454 | * recv queue then someone else raced us to return it, try | ||
455 | * to get the next message. | ||
456 | */ | ||
457 | if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { | ||
458 | rds_inc_put(inc); | ||
459 | inc = NULL; | ||
460 | rds_stats_inc(s_recv_deliver_raced); | ||
461 | continue; | ||
462 | } | ||
463 | |||
464 | if (ret < be32_to_cpu(inc->i_hdr.h_len)) { | ||
465 | if (msg_flags & MSG_TRUNC) | ||
466 | ret = be32_to_cpu(inc->i_hdr.h_len); | ||
467 | msg->msg_flags |= MSG_TRUNC; | ||
468 | } | ||
469 | |||
470 | if (rds_cmsg_recv(inc, msg)) { | ||
471 | ret = -EFAULT; | ||
472 | goto out; | ||
473 | } | ||
474 | |||
475 | rds_stats_inc(s_recv_delivered); | ||
476 | |||
477 | sin = (struct sockaddr_in *)msg->msg_name; | ||
478 | if (sin) { | ||
479 | sin->sin_family = AF_INET; | ||
480 | sin->sin_port = inc->i_hdr.h_sport; | ||
481 | sin->sin_addr.s_addr = inc->i_saddr; | ||
482 | memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); | ||
483 | } | ||
484 | break; | ||
485 | } | ||
486 | |||
487 | if (inc) | ||
488 | rds_inc_put(inc); | ||
489 | |||
490 | out: | ||
491 | return ret; | ||
492 | } | ||
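Seen from the application side: RDS sockets are datagram-style, the sender's address is returned through msg_name just as the code above fills in the sockaddr_in, MSG_PEEK leaves the message queued (rds_still_queued() with drop == 0), and MSG_DONTWAIT turns an empty queue into -EAGAIN. A hedged userspace sketch; PF_RDS with SOCK_SEQPACKET and the addresses are assumptions, not something established in this file:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>

#ifndef PF_RDS
#define PF_RDS 21	/* AF_RDS; value assumed from linux/socket.h */
#endif

int main(void)
{
	struct sockaddr_in laddr, from;
	char buf[8192];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg;
	ssize_t n;
	int fd;

	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);	/* socket type assumed */
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&laddr, 0, sizeof(laddr));
	laddr.sin_family = AF_INET;
	laddr.sin_addr.s_addr = inet_addr("192.168.0.1");	/* local RDS addr */
	laddr.sin_port = htons(4000);
	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0) {
		perror("bind");
		return 1;
	}

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &from;			/* filled with the sender */
	msg.msg_namelen = sizeof(from);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		perror("recvmsg");
	else
		printf("%zd bytes from %s:%u%s\n", n,
		       inet_ntoa(from.sin_addr), (unsigned)ntohs(from.sin_port),
		       (msg.msg_flags & MSG_TRUNC) ? " (truncated)" : "");

	close(fd);
	return 0;
}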
493 | |||
494 | /* | ||
495 | * The socket is being shut down and we're asked to drop messages that were | ||
496 | * queued for recvmsg. The caller has unbound the socket so the receive path | ||
497 | * won't queue any more incoming fragments or messages on the socket. | ||
498 | */ | ||
499 | void rds_clear_recv_queue(struct rds_sock *rs) | ||
500 | { | ||
501 | struct sock *sk = rds_rs_to_sk(rs); | ||
502 | struct rds_incoming *inc, *tmp; | ||
503 | unsigned long flags; | ||
504 | |||
505 | write_lock_irqsave(&rs->rs_recv_lock, flags); | ||
506 | list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { | ||
507 | rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, | ||
508 | -be32_to_cpu(inc->i_hdr.h_len), | ||
509 | inc->i_hdr.h_dport); | ||
510 | list_del_init(&inc->i_item); | ||
511 | rds_inc_put(inc); | ||
512 | } | ||
513 | write_unlock_irqrestore(&rs->rs_recv_lock, flags); | ||
514 | } | ||
515 | |||
516 | /* | ||
517 | * inc->i_saddr isn't used here because it is only set in the receive | ||
518 | * path. | ||
519 | */ | ||
520 | void rds_inc_info_copy(struct rds_incoming *inc, | ||
521 | struct rds_info_iterator *iter, | ||
522 | __be32 saddr, __be32 daddr, int flip) | ||
523 | { | ||
524 | struct rds_info_message minfo; | ||
525 | |||
526 | minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); | ||
527 | minfo.len = be32_to_cpu(inc->i_hdr.h_len); | ||
528 | |||
529 | if (flip) { | ||
530 | minfo.laddr = daddr; | ||
531 | minfo.faddr = saddr; | ||
532 | minfo.lport = inc->i_hdr.h_dport; | ||
533 | minfo.fport = inc->i_hdr.h_sport; | ||
534 | } else { | ||
535 | minfo.laddr = saddr; | ||
536 | minfo.faddr = daddr; | ||
537 | minfo.lport = inc->i_hdr.h_sport; | ||
538 | minfo.fport = inc->i_hdr.h_dport; | ||
539 | } | ||
540 | |||
541 | rds_info_copy(iter, &minfo, sizeof(minfo)); | ||
542 | } | ||
diff --git a/net/rds/send.c b/net/rds/send.c new file mode 100644 index 000000000000..1b37364656f0 --- /dev/null +++ b/net/rds/send.c | |||
@@ -0,0 +1,1003 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <net/sock.h> | ||
35 | #include <linux/in.h> | ||
36 | #include <linux/list.h> | ||
37 | |||
38 | #include "rds.h" | ||
39 | #include "rdma.h" | ||
40 | |||
41 | /* When transmitting messages in rds_send_xmit, we need to emerge from | ||
42 | * time to time and briefly release the CPU. Otherwise the softlockup watchdog | ||
43 | * will kick our shin. | ||
44 | * Also, it seems fairer to not let one busy connection stall all the | ||
45 | * others. | ||
46 | * | ||
47 | * send_batch_count is the number of times we'll loop in send_xmit. Setting | ||
48 | * it to 0 will restore the old behavior (where we looped until we had | ||
49 | * drained the queue). | ||
50 | */ | ||
51 | static int send_batch_count = 64; | ||
52 | module_param(send_batch_count, int, 0444); | ||
53 | MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); | ||
54 | |||
55 | /* | ||
56 | * Reset the send state. Caller must hold c_send_lock when calling here. | ||
57 | */ | ||
58 | void rds_send_reset(struct rds_connection *conn) | ||
59 | { | ||
60 | struct rds_message *rm, *tmp; | ||
61 | unsigned long flags; | ||
62 | |||
63 | if (conn->c_xmit_rm) { | ||
64 | /* Tell the user the RDMA op is no longer mapped by the | ||
65 | * transport. This isn't entirely true (it's flushed out | ||
66 | * independently) but as the connection is down, there's | ||
67 | * no ongoing RDMA to/from that memory */ | ||
68 | rds_message_unmapped(conn->c_xmit_rm); | ||
69 | rds_message_put(conn->c_xmit_rm); | ||
70 | conn->c_xmit_rm = NULL; | ||
71 | } | ||
72 | conn->c_xmit_sg = 0; | ||
73 | conn->c_xmit_hdr_off = 0; | ||
74 | conn->c_xmit_data_off = 0; | ||
75 | conn->c_xmit_rdma_sent = 0; | ||
76 | |||
77 | conn->c_map_queued = 0; | ||
78 | |||
79 | conn->c_unacked_packets = rds_sysctl_max_unacked_packets; | ||
80 | conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; | ||
81 | |||
82 | /* Mark messages as retransmissions, and move them to the send q */ | ||
83 | spin_lock_irqsave(&conn->c_lock, flags); | ||
84 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | ||
85 | set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); | ||
86 | set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); | ||
87 | } | ||
88 | list_splice_init(&conn->c_retrans, &conn->c_send_queue); | ||
89 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * We're making the conscious trade-off here to only send one message | ||
94 | * down the connection at a time. | ||
95 | * Pro: | ||
96 | * - tx queueing is a simple fifo list | ||
97 | * - reassembly is optional and easily done by transports per conn | ||
98 | * - no per flow rx lookup at all, straight to the socket | ||
99 | * - less per-frag memory and wire overhead | ||
100 | * Con: | ||
101 | * - queued acks can be delayed behind large messages | ||
102 | * Depends: | ||
103 | * - small message latency is higher behind queued large messages | ||
104 | * - large message latency isn't starved by intervening small sends | ||
105 | */ | ||
106 | int rds_send_xmit(struct rds_connection *conn) | ||
107 | { | ||
108 | struct rds_message *rm; | ||
109 | unsigned long flags; | ||
110 | unsigned int tmp; | ||
111 | unsigned int send_quota = send_batch_count; | ||
112 | struct scatterlist *sg; | ||
113 | int ret = 0; | ||
114 | int was_empty = 0; | ||
115 | LIST_HEAD(to_be_dropped); | ||
116 | |||
117 | /* | ||
118 | * sendmsg calls here after having queued its message on the send | ||
119 | * queue. We only have one task feeding the connection at a time. If | ||
120 | * another thread is already feeding the queue then we back off. This | ||
121 | * avoids blocking the caller and trading per-connection data between | ||
122 | * caches per message. | ||
123 | * | ||
124 | * The sem holder will issue a retry if they notice that someone queued | ||
125 | * a message after they stopped walking the send queue but before they | ||
126 | * dropped the sem. | ||
127 | */ | ||
128 | if (!mutex_trylock(&conn->c_send_lock)) { | ||
129 | rds_stats_inc(s_send_sem_contention); | ||
130 | ret = -ENOMEM; | ||
131 | goto out; | ||
132 | } | ||
133 | |||
134 | if (conn->c_trans->xmit_prepare) | ||
135 | conn->c_trans->xmit_prepare(conn); | ||
136 | |||
137 | /* | ||
138 | * spin trying to push headers and data down the connection until | ||
139 | * the connection doesn't make forward progress. | ||
140 | */ | ||
141 | while (--send_quota) { | ||
142 | /* | ||
143 | * See if we need to send a congestion map update if we're | ||
144 | * between sending messages. The send_sem protects our sole | ||
145 | * use of c_map_offset and _bytes. | ||
146 | * Note this is used only by transports that define a special | ||
147 | * xmit_cong_map function. For all others, we allocate | ||
148 | * a cong_map message and treat it just like any other send. | ||
149 | */ | ||
150 | if (conn->c_map_bytes) { | ||
151 | ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, | ||
152 | conn->c_map_offset); | ||
153 | if (ret <= 0) | ||
154 | break; | ||
155 | |||
156 | conn->c_map_offset += ret; | ||
157 | conn->c_map_bytes -= ret; | ||
158 | if (conn->c_map_bytes) | ||
159 | continue; | ||
160 | } | ||
161 | |||
162 | /* If we're done sending the current message, clear the | ||
163 | * offset and S/G temporaries. | ||
164 | */ | ||
165 | rm = conn->c_xmit_rm; | ||
166 | if (rm != NULL && | ||
167 | conn->c_xmit_hdr_off == sizeof(struct rds_header) && | ||
168 | conn->c_xmit_sg == rm->m_nents) { | ||
169 | conn->c_xmit_rm = NULL; | ||
170 | conn->c_xmit_sg = 0; | ||
171 | conn->c_xmit_hdr_off = 0; | ||
172 | conn->c_xmit_data_off = 0; | ||
173 | conn->c_xmit_rdma_sent = 0; | ||
174 | |||
175 | /* Release the reference to the previous message. */ | ||
176 | rds_message_put(rm); | ||
177 | rm = NULL; | ||
178 | } | ||
179 | |||
180 | /* If we're asked to send a cong map update, do so. | ||
181 | */ | ||
182 | if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { | ||
183 | if (conn->c_trans->xmit_cong_map != NULL) { | ||
184 | conn->c_map_offset = 0; | ||
185 | conn->c_map_bytes = sizeof(struct rds_header) + | ||
186 | RDS_CONG_MAP_BYTES; | ||
187 | continue; | ||
188 | } | ||
189 | |||
190 | rm = rds_cong_update_alloc(conn); | ||
191 | if (IS_ERR(rm)) { | ||
192 | ret = PTR_ERR(rm); | ||
193 | break; | ||
194 | } | ||
195 | |||
196 | conn->c_xmit_rm = rm; | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * Grab the next message from the send queue, if there is one. | ||
201 | * | ||
202 | * c_xmit_rm holds a ref while we're sending this message down | ||
203 | * the connection. We can use this ref while holding the | ||
204 | * send_sem; rds_send_reset() is serialized with it. | ||
205 | */ | ||
206 | if (rm == NULL) { | ||
207 | unsigned int len; | ||
208 | |||
209 | spin_lock_irqsave(&conn->c_lock, flags); | ||
210 | |||
211 | if (!list_empty(&conn->c_send_queue)) { | ||
212 | rm = list_entry(conn->c_send_queue.next, | ||
213 | struct rds_message, | ||
214 | m_conn_item); | ||
215 | rds_message_addref(rm); | ||
216 | |||
217 | /* | ||
218 | * Move the message from the send queue to the retransmit | ||
219 | * list right away. | ||
220 | */ | ||
221 | list_move_tail(&rm->m_conn_item, &conn->c_retrans); | ||
222 | } | ||
223 | |||
224 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
225 | |||
226 | if (rm == NULL) { | ||
227 | was_empty = 1; | ||
228 | break; | ||
229 | } | ||
230 | |||
231 | /* Unfortunately, the way Infiniband deals with | ||
232 | * RDMA to a bad MR key is by moving the entire | ||
233 | * queue pair to error state. We could possibly | ||
234 | * recover from that, but right now we drop the | ||
235 | * connection. | ||
236 | * Therefore, we never retransmit messages with RDMA ops. | ||
237 | */ | ||
238 | if (rm->m_rdma_op | ||
239 | && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { | ||
240 | spin_lock_irqsave(&conn->c_lock, flags); | ||
241 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) | ||
242 | list_move(&rm->m_conn_item, &to_be_dropped); | ||
243 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
244 | rds_message_put(rm); | ||
245 | continue; | ||
246 | } | ||
247 | |||
248 | /* Require an ACK every once in a while */ | ||
249 | len = ntohl(rm->m_inc.i_hdr.h_len); | ||
250 | if (conn->c_unacked_packets == 0 | ||
251 | || conn->c_unacked_bytes < len) { | ||
252 | __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); | ||
253 | |||
254 | conn->c_unacked_packets = rds_sysctl_max_unacked_packets; | ||
255 | conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; | ||
256 | rds_stats_inc(s_send_ack_required); | ||
257 | } else { | ||
258 | conn->c_unacked_bytes -= len; | ||
259 | conn->c_unacked_packets--; | ||
260 | } | ||
261 | |||
262 | conn->c_xmit_rm = rm; | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * Try and send an rdma message. Let's see if we can | ||
267 | * keep this simple and require that the transport either | ||
268 | * send the whole rdma or none of it. | ||
269 | */ | ||
270 | if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { | ||
271 | ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); | ||
272 | if (ret) | ||
273 | break; | ||
274 | conn->c_xmit_rdma_sent = 1; | ||
275 | /* The transport owns the mapped memory for now. | ||
276 | * You can't unmap it while it's on the send queue */ | ||
277 | set_bit(RDS_MSG_MAPPED, &rm->m_flags); | ||
278 | } | ||
279 | |||
280 | if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || | ||
281 | conn->c_xmit_sg < rm->m_nents) { | ||
282 | ret = conn->c_trans->xmit(conn, rm, | ||
283 | conn->c_xmit_hdr_off, | ||
284 | conn->c_xmit_sg, | ||
285 | conn->c_xmit_data_off); | ||
286 | if (ret <= 0) | ||
287 | break; | ||
288 | |||
289 | if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) { | ||
290 | tmp = min_t(int, ret, | ||
291 | sizeof(struct rds_header) - | ||
292 | conn->c_xmit_hdr_off); | ||
293 | conn->c_xmit_hdr_off += tmp; | ||
294 | ret -= tmp; | ||
295 | } | ||
296 | |||
297 | sg = &rm->m_sg[conn->c_xmit_sg]; | ||
298 | while (ret) { | ||
299 | tmp = min_t(int, ret, sg->length - | ||
300 | conn->c_xmit_data_off); | ||
301 | conn->c_xmit_data_off += tmp; | ||
302 | ret -= tmp; | ||
303 | if (conn->c_xmit_data_off == sg->length) { | ||
304 | conn->c_xmit_data_off = 0; | ||
305 | sg++; | ||
306 | conn->c_xmit_sg++; | ||
307 | BUG_ON(ret != 0 && | ||
308 | conn->c_xmit_sg == rm->m_nents); | ||
309 | } | ||
310 | } | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* Nuke any messages we decided not to retransmit. */ | ||
315 | if (!list_empty(&to_be_dropped)) | ||
316 | rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); | ||
317 | |||
318 | if (conn->c_trans->xmit_complete) | ||
319 | conn->c_trans->xmit_complete(conn); | ||
320 | |||
321 | /* | ||
322 | * We might be racing with another sender who queued a message but | ||
323 | * backed off on noticing that we held the c_send_lock. If we check | ||
324 | * for queued messages after dropping the sem then either we'll | ||
325 | * see the queued message or the queuer will get the sem. If we | ||
326 | * notice the queued message then we trigger an immediate retry. | ||
327 | * | ||
328 | * We need to be careful only to do this when we stopped processing | ||
329 | * the send queue because it was empty. It's the only way we | ||
330 | * stop processing the loop when the transport hasn't taken | ||
331 | * responsibility for forward progress. | ||
332 | */ | ||
333 | mutex_unlock(&conn->c_send_lock); | ||
334 | |||
335 | if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { | ||
336 | /* We exhausted the send quota, but there's work left to | ||
337 | * do. Return and (re-)schedule the send worker. | ||
338 | */ | ||
339 | ret = -EAGAIN; | ||
340 | } | ||
341 | |||
342 | if (ret == 0 && was_empty) { | ||
343 | /* A simple bit test would be way faster than taking the | ||
344 | * spin lock */ | ||
345 | spin_lock_irqsave(&conn->c_lock, flags); | ||
346 | if (!list_empty(&conn->c_send_queue)) { | ||
347 | rds_stats_inc(s_send_sem_queue_raced); | ||
348 | ret = -EAGAIN; | ||
349 | } | ||
350 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
351 | } | ||
352 | out: | ||
353 | return ret; | ||
354 | } | ||
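The bookkeeping at the bottom of the loop is the subtle part: the transport's ->xmit() may accept any number of bytes, and its return value is consumed first by whatever is left of the header (c_xmit_hdr_off) and then walked across the scatterlist, bumping c_xmit_sg whenever a fragment fills up. A self-contained sketch of that split, using a plain length array in place of the scatterlist and an assumed stand-in for the header size:

#include <stdio.h>

#define HDR_LEN 48	/* stand-in for sizeof(struct rds_header); assumed */

struct xmit_state {
	unsigned int hdr_off;	/* c_xmit_hdr_off  */
	unsigned int sg;	/* c_xmit_sg       */
	unsigned int data_off;	/* c_xmit_data_off */
};

/* Consume 'sent' bytes reported by the transport: header first, then the
 * current fragment, moving to the next fragment whenever one completes. */
static void advance(struct xmit_state *st, unsigned int sent,
		    const unsigned int *frag_len, unsigned int nents)
{
	unsigned int tmp;

	if (st->hdr_off < HDR_LEN) {
		tmp = HDR_LEN - st->hdr_off;
		if (tmp > sent)
			tmp = sent;
		st->hdr_off += tmp;
		sent -= tmp;
	}
	while (sent && st->sg < nents) {
		tmp = frag_len[st->sg] - st->data_off;
		if (tmp > sent)
			tmp = sent;
		st->data_off += tmp;
		sent -= tmp;
		if (st->data_off == frag_len[st->sg]) {
			st->data_off = 0;
			st->sg++;
		}
	}
}

int main(void)
{
	struct xmit_state st = { 0, 0, 0 };
	unsigned int frags[2] = { 4096, 100 };

	advance(&st, 60, frags, 2);	/* header + 12 bytes of frag 0 */
	advance(&st, 4184, frags, 2);	/* rest of frag 0 + all of frag 1 */
	printf("hdr_off %u sg %u data_off %u\n", st.hdr_off, st.sg, st.data_off);
	return 0;
}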
355 | |||
356 | static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) | ||
357 | { | ||
358 | u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); | ||
359 | |||
360 | assert_spin_locked(&rs->rs_lock); | ||
361 | |||
362 | BUG_ON(rs->rs_snd_bytes < len); | ||
363 | rs->rs_snd_bytes -= len; | ||
364 | |||
365 | if (rs->rs_snd_bytes == 0) | ||
366 | rds_stats_inc(s_send_queue_empty); | ||
367 | } | ||
368 | |||
369 | static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, | ||
370 | is_acked_func is_acked) | ||
371 | { | ||
372 | if (is_acked) | ||
373 | return is_acked(rm, ack); | ||
374 | return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; | ||
375 | } | ||
376 | |||
377 | /* | ||
378 | * Returns true if there are no messages on the send and retransmit queues | ||
379 | * which have a sequence number greater than or equal to the given sequence | ||
380 | * number. | ||
381 | */ | ||
382 | int rds_send_acked_before(struct rds_connection *conn, u64 seq) | ||
383 | { | ||
384 | struct rds_message *rm, *tmp; | ||
385 | int ret = 1; | ||
386 | |||
387 | spin_lock(&conn->c_lock); | ||
388 | |||
389 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | ||
390 | if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) | ||
391 | ret = 0; | ||
392 | break; | ||
393 | } | ||
394 | |||
395 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { | ||
396 | if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) | ||
397 | ret = 0; | ||
398 | break; | ||
399 | } | ||
400 | |||
401 | spin_unlock(&conn->c_lock); | ||
402 | |||
403 | return ret; | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * This is pretty similar to what happens below in the ACK | ||
408 | * handling code - except that we call here as soon as we get | ||
409 | * the IB send completion on the RDMA op and the accompanying | ||
410 | * message. | ||
411 | */ | ||
412 | void rds_rdma_send_complete(struct rds_message *rm, int status) | ||
413 | { | ||
414 | struct rds_sock *rs = NULL; | ||
415 | struct rds_rdma_op *ro; | ||
416 | struct rds_notifier *notifier; | ||
417 | |||
418 | spin_lock(&rm->m_rs_lock); | ||
419 | |||
420 | ro = rm->m_rdma_op; | ||
421 | if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) | ||
422 | && ro && ro->r_notify && ro->r_notifier) { | ||
423 | notifier = ro->r_notifier; | ||
424 | rs = rm->m_rs; | ||
425 | sock_hold(rds_rs_to_sk(rs)); | ||
426 | |||
427 | notifier->n_status = status; | ||
428 | spin_lock(&rs->rs_lock); | ||
429 | list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); | ||
430 | spin_unlock(&rs->rs_lock); | ||
431 | |||
432 | ro->r_notifier = NULL; | ||
433 | } | ||
434 | |||
435 | spin_unlock(&rm->m_rs_lock); | ||
436 | |||
437 | if (rs) { | ||
438 | rds_wake_sk_sleep(rs); | ||
439 | sock_put(rds_rs_to_sk(rs)); | ||
440 | } | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * This is the same as rds_rdma_send_complete except we | ||
445 | * don't do any locking - we have all the ingredients (message, | ||
446 | * socket, socket lock) and can just move the notifier. | ||
447 | */ | ||
448 | static inline void | ||
449 | __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) | ||
450 | { | ||
451 | struct rds_rdma_op *ro; | ||
452 | |||
453 | ro = rm->m_rdma_op; | ||
454 | if (ro && ro->r_notify && ro->r_notifier) { | ||
455 | ro->r_notifier->n_status = status; | ||
456 | list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); | ||
457 | ro->r_notifier = NULL; | ||
458 | } | ||
459 | |||
460 | /* No need to wake the app - caller does this */ | ||
461 | } | ||
462 | |||
463 | /* | ||
464 | * This is called from the IB send completion when we detect | ||
465 | * a RDMA operation that failed with remote access error. | ||
466 | * So speed is not an issue here. | ||
467 | */ | ||
468 | struct rds_message *rds_send_get_message(struct rds_connection *conn, | ||
469 | struct rds_rdma_op *op) | ||
470 | { | ||
471 | struct rds_message *rm, *tmp, *found = NULL; | ||
472 | unsigned long flags; | ||
473 | |||
474 | spin_lock_irqsave(&conn->c_lock, flags); | ||
475 | |||
476 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | ||
477 | if (rm->m_rdma_op == op) { | ||
478 | atomic_inc(&rm->m_refcount); | ||
479 | found = rm; | ||
480 | goto out; | ||
481 | } | ||
482 | } | ||
483 | |||
484 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { | ||
485 | if (rm->m_rdma_op == op) { | ||
486 | atomic_inc(&rm->m_refcount); | ||
487 | found = rm; | ||
488 | break; | ||
489 | } | ||
490 | } | ||
491 | |||
492 | out: | ||
493 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
494 | |||
495 | return found; | ||
496 | } | ||
497 | |||
498 | /* | ||
499 | * This removes messages from the socket's list if they're on it. The list | ||
500 | * argument must be private to the caller, we must be able to modify it | ||
501 | * without locks. The messages must have a reference held for their | ||
502 | * position on the list. This function will drop that reference after | ||
503 | * removing the messages from the 'messages' list regardless of whether it found | ||
504 | * the messages on the socket list or not. | ||
505 | */ | ||
506 | void rds_send_remove_from_sock(struct list_head *messages, int status) | ||
507 | { | ||
508 | unsigned long flags = 0; /* silence gcc :P */ | ||
509 | struct rds_sock *rs = NULL; | ||
510 | struct rds_message *rm; | ||
511 | |||
512 | local_irq_save(flags); | ||
513 | while (!list_empty(messages)) { | ||
514 | rm = list_entry(messages->next, struct rds_message, | ||
515 | m_conn_item); | ||
516 | list_del_init(&rm->m_conn_item); | ||
517 | |||
518 | /* | ||
519 | * If we see this flag cleared then we're *sure* that someone | ||
520 | * else beat us to removing it from the sock. If we race | ||
521 | * with their flag update we'll get the lock and then really | ||
522 | * see that the flag has been cleared. | ||
523 | * | ||
524 | * The message spinlock makes sure nobody clears rm->m_rs | ||
525 | * while we're messing with it. It does not prevent the | ||
526 | * message from being removed from the socket, though. | ||
527 | */ | ||
528 | spin_lock(&rm->m_rs_lock); | ||
529 | if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) | ||
530 | goto unlock_and_drop; | ||
531 | |||
532 | if (rs != rm->m_rs) { | ||
533 | if (rs) { | ||
534 | spin_unlock(&rs->rs_lock); | ||
535 | rds_wake_sk_sleep(rs); | ||
536 | sock_put(rds_rs_to_sk(rs)); | ||
537 | } | ||
538 | rs = rm->m_rs; | ||
539 | spin_lock(&rs->rs_lock); | ||
540 | sock_hold(rds_rs_to_sk(rs)); | ||
541 | } | ||
542 | |||
543 | if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { | ||
544 | struct rds_rdma_op *ro = rm->m_rdma_op; | ||
545 | struct rds_notifier *notifier; | ||
546 | |||
547 | list_del_init(&rm->m_sock_item); | ||
548 | rds_send_sndbuf_remove(rs, rm); | ||
549 | |||
550 | if (ro && ro->r_notifier | ||
551 | && (status || ro->r_notify)) { | ||
552 | notifier = ro->r_notifier; | ||
553 | list_add_tail(¬ifier->n_list, | ||
554 | &rs->rs_notify_queue); | ||
555 | if (!notifier->n_status) | ||
556 | notifier->n_status = status; | ||
557 | rm->m_rdma_op->r_notifier = NULL; | ||
558 | } | ||
559 | rds_message_put(rm); | ||
560 | rm->m_rs = NULL; | ||
561 | } | ||
562 | |||
563 | unlock_and_drop: | ||
564 | spin_unlock(&rm->m_rs_lock); | ||
565 | rds_message_put(rm); | ||
566 | } | ||
567 | |||
568 | if (rs) { | ||
569 | spin_unlock(&rs->rs_lock); | ||
570 | rds_wake_sk_sleep(rs); | ||
571 | sock_put(rds_rs_to_sk(rs)); | ||
572 | } | ||
573 | local_irq_restore(flags); | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * Transports call here when they've determined that the receiver queued | ||
578 | * messages up to, and including, the given sequence number. Messages are | ||
579 | * moved to the retrans queue when rds_send_xmit picks them off the send | ||
580 | * queue. This means that in the TCP case, the message may not have been | ||
581 | * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked | ||
582 | * checks the RDS_MSG_HAS_ACK_SEQ bit. | ||
583 | * | ||
584 | * XXX It's not clear to me how this is safely serialized with socket | ||
585 | * destruction. Maybe it should bail if it sees SOCK_DEAD. | ||
586 | */ | ||
587 | void rds_send_drop_acked(struct rds_connection *conn, u64 ack, | ||
588 | is_acked_func is_acked) | ||
589 | { | ||
590 | struct rds_message *rm, *tmp; | ||
591 | unsigned long flags; | ||
592 | LIST_HEAD(list); | ||
593 | |||
594 | spin_lock_irqsave(&conn->c_lock, flags); | ||
595 | |||
596 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | ||
597 | if (!rds_send_is_acked(rm, ack, is_acked)) | ||
598 | break; | ||
599 | |||
600 | list_move(&rm->m_conn_item, &list); | ||
601 | clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); | ||
602 | } | ||
603 | |||
604 | /* order flag updates with spin locks */ | ||
605 | if (!list_empty(&list)) | ||
606 | smp_mb__after_clear_bit(); | ||
607 | |||
608 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
609 | |||
610 | /* now remove the messages from the sock list as needed */ | ||
611 | rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); | ||
612 | } | ||
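Because messages sit on c_retrans in transmit order, a cumulative ack only needs to walk from the head of the list and stop at the first message it does not cover. The same idea in a small standalone sketch over an array of sequence numbers:

#include <stdio.h>
#include <stdint.h>

/* Count how many in-order messages a cumulative ack releases, mirroring
 * the break-on-first-unacked walk in rds_send_drop_acked() with the
 * default "sequence <= ack" test from rds_send_is_acked(). */
static int drop_acked(const uint64_t *retrans_seq, int n, uint64_t ack)
{
	int dropped = 0;

	while (dropped < n && retrans_seq[dropped] <= ack)
		dropped++;
	return dropped;
}

int main(void)
{
	uint64_t retrans[] = { 10, 11, 12, 15, 16 };

	printf("%d\n", drop_acked(retrans, 5, 12));	/* 3 */
	printf("%d\n", drop_acked(retrans, 5, 14));	/* still 3 */
	return 0;
}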
613 | |||
614 | void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) | ||
615 | { | ||
616 | struct rds_message *rm, *tmp; | ||
617 | struct rds_connection *conn; | ||
618 | unsigned long flags; | ||
619 | LIST_HEAD(list); | ||
620 | int wake = 0; | ||
621 | |||
622 | /* get all the messages we're dropping under the rs lock */ | ||
623 | spin_lock_irqsave(&rs->rs_lock, flags); | ||
624 | |||
625 | list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { | ||
626 | if (dest && (dest->sin_addr.s_addr != rm->m_daddr || | ||
627 | dest->sin_port != rm->m_inc.i_hdr.h_dport)) | ||
628 | continue; | ||
629 | |||
630 | wake = 1; | ||
631 | list_move(&rm->m_sock_item, &list); | ||
632 | rds_send_sndbuf_remove(rs, rm); | ||
633 | clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); | ||
634 | |||
635 | /* If this is a RDMA operation, notify the app. */ | ||
636 | __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); | ||
637 | } | ||
638 | |||
639 | /* order flag updates with the rs lock */ | ||
640 | if (wake) | ||
641 | smp_mb__after_clear_bit(); | ||
642 | |||
643 | spin_unlock_irqrestore(&rs->rs_lock, flags); | ||
644 | |||
645 | if (wake) | ||
646 | rds_wake_sk_sleep(rs); | ||
647 | |||
648 | conn = NULL; | ||
649 | |||
650 | /* now remove the messages from the conn list as needed */ | ||
651 | list_for_each_entry(rm, &list, m_sock_item) { | ||
652 | /* We do this here rather than in the loop above, so that | ||
653 | * we don't have to nest m_rs_lock under rs->rs_lock */ | ||
654 | spin_lock(&rm->m_rs_lock); | ||
655 | rm->m_rs = NULL; | ||
656 | spin_unlock(&rm->m_rs_lock); | ||
657 | |||
658 | /* | ||
659 | * If we see this flag cleared then we're *sure* that someone | ||
660 | * else beat us to removing it from the conn. If we race | ||
661 | * with their flag update we'll get the lock and then really | ||
662 | * see that the flag has been cleared. | ||
663 | */ | ||
664 | if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) | ||
665 | continue; | ||
666 | |||
667 | if (conn != rm->m_inc.i_conn) { | ||
668 | if (conn) | ||
669 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
670 | conn = rm->m_inc.i_conn; | ||
671 | spin_lock_irqsave(&conn->c_lock, flags); | ||
672 | } | ||
673 | |||
674 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { | ||
675 | list_del_init(&rm->m_conn_item); | ||
676 | rds_message_put(rm); | ||
677 | } | ||
678 | } | ||
679 | |||
680 | if (conn) | ||
681 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
682 | |||
683 | while (!list_empty(&list)) { | ||
684 | rm = list_entry(list.next, struct rds_message, m_sock_item); | ||
685 | list_del_init(&rm->m_sock_item); | ||
686 | |||
687 | rds_message_wait(rm); | ||
688 | rds_message_put(rm); | ||
689 | } | ||
690 | } | ||
691 | |||
692 | /* | ||
693 | * we only want this to fire once so we use the caller's 'queued'. It's | ||
694 | * possible that another thread can race with us and remove the | ||
695 | * message from the flow with RDS_CANCEL_SENT_TO. | ||
696 | */ | ||
697 | static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, | ||
698 | struct rds_message *rm, __be16 sport, | ||
699 | __be16 dport, int *queued) | ||
700 | { | ||
701 | unsigned long flags; | ||
702 | u32 len; | ||
703 | |||
704 | if (*queued) | ||
705 | goto out; | ||
706 | |||
707 | len = be32_to_cpu(rm->m_inc.i_hdr.h_len); | ||
708 | |||
709 | /* this is the only place which holds both the socket's rs_lock | ||
710 | * and the connection's c_lock */ | ||
711 | spin_lock_irqsave(&rs->rs_lock, flags); | ||
712 | |||
713 | /* | ||
714 | * If there is only a little space left in sndbuf, we don't queue anything, | ||
715 | * and userspace gets -EAGAIN. But poll() indicates there's send | ||
716 | * room. This can lead to bad behavior (spinning) if snd_bytes isn't | ||
717 | * freed up by incoming acks. So we check the *old* value of | ||
718 | * rs_snd_bytes here to allow the last msg to exceed the buffer, | ||
719 | * and poll() now knows no more data can be sent. | ||
720 | */ | ||
721 | if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { | ||
722 | rs->rs_snd_bytes += len; | ||
723 | |||
724 | /* let recv side know we are close to send space exhaustion. | ||
725 | * This is probably not the optimal way to do it, as this | ||
726 | * means we set the flag on *all* messages as soon as our | ||
727 | * throughput hits a certain threshold. | ||
728 | */ | ||
729 | if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) | ||
730 | __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); | ||
731 | |||
732 | list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); | ||
733 | set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); | ||
734 | rds_message_addref(rm); | ||
735 | rm->m_rs = rs; | ||
736 | |||
737 | /* The code ordering is a little weird, but we're | ||
738 | trying to minimize the time we hold c_lock */ | ||
739 | rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); | ||
740 | rm->m_inc.i_conn = conn; | ||
741 | rds_message_addref(rm); | ||
742 | |||
743 | spin_lock(&conn->c_lock); | ||
744 | rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); | ||
745 | list_add_tail(&rm->m_conn_item, &conn->c_send_queue); | ||
746 | set_bit(RDS_MSG_ON_CONN, &rm->m_flags); | ||
747 | spin_unlock(&conn->c_lock); | ||
748 | |||
749 | rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", | ||
750 | rm, len, rs, rs->rs_snd_bytes, | ||
751 | (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); | ||
752 | |||
753 | *queued = 1; | ||
754 | } | ||
755 | |||
756 | spin_unlock_irqrestore(&rs->rs_lock, flags); | ||
757 | out: | ||
758 | return *queued; | ||
759 | } | ||
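The *old*-value test above means the send buffer may be overshot by exactly one message: a sender is refused only once rs_snd_bytes has already reached the limit, and messages start requesting acks once the backlog passes the halfway mark. In isolation, as a sketch:

#include <stdio.h>

struct sndbuf {
	unsigned int snd_bytes;	/* rs_snd_bytes        */
	unsigned int limit;	/* rds_sk_sndbuf(rs)   */
};

/* Returns 1 if the message was queued (possibly overshooting the limit),
 * 0 if the caller should wait, like rds_send_queue_rm(). Sets
 * *ack_required when the backlog crosses half of the buffer. */
static int queue_msg(struct sndbuf *b, unsigned int len, int *ack_required)
{
	if (b->snd_bytes >= b->limit)
		return 0;		/* -EAGAIN / wait for acks */
	b->snd_bytes += len;
	*ack_required = b->snd_bytes >= b->limit / 2;
	return 1;
}

int main(void)
{
	struct sndbuf b = { 0, 1000 };
	int ack = 0, i;

	for (i = 0; i < 4; i++)
		printf("queued %d snd_bytes %u ack %d\n",
		       queue_msg(&b, 400, &ack), b.snd_bytes, ack);
	return 0;
}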
760 | |||
761 | static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, | ||
762 | struct msghdr *msg, int *allocated_mr) | ||
763 | { | ||
764 | struct cmsghdr *cmsg; | ||
765 | int ret = 0; | ||
766 | |||
767 | for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { | ||
768 | if (!CMSG_OK(msg, cmsg)) | ||
769 | return -EINVAL; | ||
770 | |||
771 | if (cmsg->cmsg_level != SOL_RDS) | ||
772 | continue; | ||
773 | |||
774 | /* As a side effect, RDMA_DEST and RDMA_MAP will set | ||
775 | * rm->m_rdma_cookie and rm->m_rdma_mr. | ||
776 | */ | ||
777 | switch (cmsg->cmsg_type) { | ||
778 | case RDS_CMSG_RDMA_ARGS: | ||
779 | ret = rds_cmsg_rdma_args(rs, rm, cmsg); | ||
780 | break; | ||
781 | |||
782 | case RDS_CMSG_RDMA_DEST: | ||
783 | ret = rds_cmsg_rdma_dest(rs, rm, cmsg); | ||
784 | break; | ||
785 | |||
786 | case RDS_CMSG_RDMA_MAP: | ||
787 | ret = rds_cmsg_rdma_map(rs, rm, cmsg); | ||
788 | if (!ret) | ||
789 | *allocated_mr = 1; | ||
790 | break; | ||
791 | |||
792 | default: | ||
793 | return -EINVAL; | ||
794 | } | ||
795 | |||
796 | if (ret) | ||
797 | break; | ||
798 | } | ||
799 | |||
800 | return ret; | ||
801 | } | ||
802 | |||
803 | int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | ||
804 | size_t payload_len) | ||
805 | { | ||
806 | struct sock *sk = sock->sk; | ||
807 | struct rds_sock *rs = rds_sk_to_rs(sk); | ||
808 | struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; | ||
809 | __be32 daddr; | ||
810 | __be16 dport; | ||
811 | struct rds_message *rm = NULL; | ||
812 | struct rds_connection *conn; | ||
813 | int ret = 0; | ||
814 | int queued = 0, allocated_mr = 0; | ||
815 | int nonblock = msg->msg_flags & MSG_DONTWAIT; | ||
816 | long timeo = sock_rcvtimeo(sk, nonblock); | ||
817 | |||
818 | /* Mirror how Linux UDP mirrors BSD error message compatibility */ | ||
819 | /* XXX: Perhaps MSG_MORE someday */ | ||
820 | if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { | ||
821 | printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags); | ||
822 | ret = -EOPNOTSUPP; | ||
823 | goto out; | ||
824 | } | ||
825 | |||
826 | if (msg->msg_namelen) { | ||
827 | /* XXX fail non-unicast destination IPs? */ | ||
828 | if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { | ||
829 | ret = -EINVAL; | ||
830 | goto out; | ||
831 | } | ||
832 | daddr = usin->sin_addr.s_addr; | ||
833 | dport = usin->sin_port; | ||
834 | } else { | ||
835 | /* We only care about consistency with ->connect() */ | ||
836 | lock_sock(sk); | ||
837 | daddr = rs->rs_conn_addr; | ||
838 | dport = rs->rs_conn_port; | ||
839 | release_sock(sk); | ||
840 | } | ||
841 | |||
842 | /* racing with another thread binding seems ok here */ | ||
843 | if (daddr == 0 || rs->rs_bound_addr == 0) { | ||
844 | ret = -ENOTCONN; /* XXX not a great errno */ | ||
845 | goto out; | ||
846 | } | ||
847 | |||
848 | rm = rds_message_copy_from_user(msg->msg_iov, payload_len); | ||
849 | if (IS_ERR(rm)) { | ||
850 | ret = PTR_ERR(rm); | ||
851 | rm = NULL; | ||
852 | goto out; | ||
853 | } | ||
854 | |||
855 | rm->m_daddr = daddr; | ||
856 | |||
857 | /* Parse any control messages the user may have included. */ | ||
858 | ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); | ||
859 | if (ret) | ||
860 | goto out; | ||
861 | |||
862 | /* rds_conn_create has a spinlock that runs with IRQ off. | ||
863 | * Caching the conn in the socket helps a lot. */ | ||
864 | if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) | ||
865 | conn = rs->rs_conn; | ||
866 | else { | ||
867 | conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, | ||
868 | rs->rs_transport, | ||
869 | sock->sk->sk_allocation); | ||
870 | if (IS_ERR(conn)) { | ||
871 | ret = PTR_ERR(conn); | ||
872 | goto out; | ||
873 | } | ||
874 | rs->rs_conn = conn; | ||
875 | } | ||
876 | |||
877 | if ((rm->m_rdma_cookie || rm->m_rdma_op) | ||
878 | && conn->c_trans->xmit_rdma == NULL) { | ||
879 | if (printk_ratelimit()) | ||
880 | printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", | ||
881 | rm->m_rdma_op, conn->c_trans->xmit_rdma); | ||
882 | ret = -EOPNOTSUPP; | ||
883 | goto out; | ||
884 | } | ||
885 | |||
886 | /* If the connection is down, trigger a connect. We may | ||
887 | * have scheduled a delayed reconnect however - in this case | ||
888 | * we should not interfere. | ||
889 | */ | ||
890 | if (rds_conn_state(conn) == RDS_CONN_DOWN | ||
891 | && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | ||
892 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | ||
893 | |||
894 | ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); | ||
895 | if (ret) | ||
896 | goto out; | ||
897 | |||
898 | while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, | ||
899 | dport, &queued)) { | ||
900 | rds_stats_inc(s_send_queue_full); | ||
901 | /* XXX make sure this is reasonable */ | ||
902 | if (payload_len > rds_sk_sndbuf(rs)) { | ||
903 | ret = -EMSGSIZE; | ||
904 | goto out; | ||
905 | } | ||
906 | if (nonblock) { | ||
907 | ret = -EAGAIN; | ||
908 | goto out; | ||
909 | } | ||
910 | |||
911 | timeo = wait_event_interruptible_timeout(*sk->sk_sleep, | ||
912 | rds_send_queue_rm(rs, conn, rm, | ||
913 | rs->rs_bound_port, | ||
914 | dport, | ||
915 | &queued), | ||
916 | timeo); | ||
917 | rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); | ||
918 | if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) | ||
919 | continue; | ||
920 | |||
921 | ret = timeo; | ||
922 | if (ret == 0) | ||
923 | ret = -ETIMEDOUT; | ||
924 | goto out; | ||
925 | } | ||
926 | |||
927 | /* | ||
928 | * By now we've committed to the send. We reuse rds_send_worker() | ||
929 | * to retry sends in the rds thread if the transport asks us to. | ||
930 | */ | ||
931 | rds_stats_inc(s_send_queued); | ||
932 | |||
933 | if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) | ||
934 | rds_send_worker(&conn->c_send_w.work); | ||
935 | |||
936 | rds_message_put(rm); | ||
937 | return payload_len; | ||
938 | |||
939 | out: | ||
940 | /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. | ||
941 | * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN | ||
942 | * or in any other way, we need to destroy the MR again */ | ||
943 | if (allocated_mr) | ||
944 | rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); | ||
945 | |||
946 | if (rm) | ||
947 | rds_message_put(rm); | ||
948 | return ret; | ||
949 | } | ||
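The corresponding send from userspace is an ordinary sendmsg()/sendto() to a sockaddr_in destination after binding the local RDS address; per the checks above, a payload larger than the socket's send buffer fails with EMSGSIZE, an unbound socket gets ENOTCONN, and MSG_DONTWAIT turns a full send queue into EAGAIN instead of blocking. A hedged sketch, with the same PF_RDS/SOCK_SEQPACKET and address assumptions as the receive example:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef PF_RDS
#define PF_RDS 21	/* AF_RDS; value assumed from linux/socket.h */
#endif

int main(void)
{
	struct sockaddr_in laddr, daddr;
	const char payload[] = "hello over rds";
	int fd;

	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);	/* socket type assumed */
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&laddr, 0, sizeof(laddr));
	laddr.sin_family = AF_INET;
	laddr.sin_addr.s_addr = inet_addr("192.168.0.1");	/* local addr  */
	laddr.sin_port = htons(4001);
	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0) {
		perror("bind");		/* unbound sockets fail with ENOTCONN */
		return 1;
	}

	memset(&daddr, 0, sizeof(daddr));
	daddr.sin_family = AF_INET;
	daddr.sin_addr.s_addr = inet_addr("192.168.0.2");	/* remote addr */
	daddr.sin_port = htons(4000);

	if (sendto(fd, payload, sizeof(payload), MSG_DONTWAIT,
		   (struct sockaddr *)&daddr, sizeof(daddr)) < 0) {
		if (errno == EAGAIN)
			fprintf(stderr, "send queue full, try again\n");
		else
			perror("sendto");
	}

	close(fd);
	return 0;
}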
950 | |||
951 | /* | ||
952 | * Reply to a ping packet. | ||
953 | */ | ||
954 | int | ||
955 | rds_send_pong(struct rds_connection *conn, __be16 dport) | ||
956 | { | ||
957 | struct rds_message *rm; | ||
958 | unsigned long flags; | ||
959 | int ret = 0; | ||
960 | |||
961 | rm = rds_message_alloc(0, GFP_ATOMIC); | ||
962 | if (rm == NULL) { | ||
963 | ret = -ENOMEM; | ||
964 | goto out; | ||
965 | } | ||
966 | |||
967 | rm->m_daddr = conn->c_faddr; | ||
968 | |||
969 | /* If the connection is down, trigger a connect. We may | ||
970 | * have scheduled a delayed reconnect however - in this case | ||
971 | * we should not interfere. | ||
972 | */ | ||
973 | if (rds_conn_state(conn) == RDS_CONN_DOWN | ||
974 | && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | ||
975 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | ||
976 | |||
977 | ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); | ||
978 | if (ret) | ||
979 | goto out; | ||
980 | |||
981 | spin_lock_irqsave(&conn->c_lock, flags); | ||
982 | list_add_tail(&rm->m_conn_item, &conn->c_send_queue); | ||
983 | set_bit(RDS_MSG_ON_CONN, &rm->m_flags); | ||
984 | rds_message_addref(rm); | ||
985 | rm->m_inc.i_conn = conn; | ||
986 | |||
987 | rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport, | ||
988 | conn->c_next_tx_seq); | ||
989 | conn->c_next_tx_seq++; | ||
990 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
991 | |||
992 | rds_stats_inc(s_send_queued); | ||
993 | rds_stats_inc(s_send_pong); | ||
994 | |||
995 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
996 | rds_message_put(rm); | ||
997 | return 0; | ||
998 | |||
999 | out: | ||
1000 | if (rm) | ||
1001 | rds_message_put(rm); | ||
1002 | return ret; | ||
1003 | } | ||
diff --git a/net/rds/stats.c b/net/rds/stats.c new file mode 100644 index 000000000000..637146893cf3 --- /dev/null +++ b/net/rds/stats.c | |||
@@ -0,0 +1,148 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/percpu.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | |||
39 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); | ||
40 | |||
41 | /* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */ | ||
42 | |||
43 | static char *rds_stat_names[] = { | ||
44 | "conn_reset", | ||
45 | "recv_drop_bad_checksum", | ||
46 | "recv_drop_old_seq", | ||
47 | "recv_drop_no_sock", | ||
48 | "recv_drop_dead_sock", | ||
49 | "recv_deliver_raced", | ||
50 | "recv_delivered", | ||
51 | "recv_queued", | ||
52 | "recv_immediate_retry", | ||
53 | "recv_delayed_retry", | ||
54 | "recv_ack_required", | ||
55 | "recv_rdma_bytes", | ||
56 | "recv_ping", | ||
57 | "send_queue_empty", | ||
58 | "send_queue_full", | ||
59 | "send_sem_contention", | ||
60 | "send_sem_queue_raced", | ||
61 | "send_immediate_retry", | ||
62 | "send_delayed_retry", | ||
63 | "send_drop_acked", | ||
64 | "send_ack_required", | ||
65 | "send_queued", | ||
66 | "send_rdma", | ||
67 | "send_rdma_bytes", | ||
68 | "send_pong", | ||
69 | "page_remainder_hit", | ||
70 | "page_remainder_miss", | ||
71 | "copy_to_user", | ||
72 | "copy_from_user", | ||
73 | "cong_update_queued", | ||
74 | "cong_update_received", | ||
75 | "cong_send_error", | ||
76 | "cong_send_blocked", | ||
77 | }; | ||
78 | |||
79 | void rds_stats_info_copy(struct rds_info_iterator *iter, | ||
80 | uint64_t *values, char **names, size_t nr) | ||
81 | { | ||
82 | struct rds_info_counter ctr; | ||
83 | size_t i; | ||
84 | |||
85 | for (i = 0; i < nr; i++) { | ||
86 | BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); | ||
87 | strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); | ||
88 | ctr.value = values[i]; | ||
89 | |||
90 | rds_info_copy(iter, &ctr, sizeof(ctr)); | ||
91 | } | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * This gives global counters across all the transports. The strings | ||
96 | * are copied in so that the tool doesn't need knowledge of the specific | ||
97 | * stats that we're exporting. Some are pretty implementation dependent | ||
98 | * and may change over time. That doesn't stop them from being useful. | ||
99 | * | ||
100 | * This is the only function in the chain that knows about the byte granular | ||
101 | * length in userspace. It converts it to the number of stat entries that the | ||
102 | * rest of the functions operate on. | ||
103 | */ | ||
104 | static void rds_stats_info(struct socket *sock, unsigned int len, | ||
105 | struct rds_info_iterator *iter, | ||
106 | struct rds_info_lengths *lens) | ||
107 | { | ||
108 | struct rds_statistics stats = {0, }; | ||
109 | uint64_t *src; | ||
110 | uint64_t *sum; | ||
111 | size_t i; | ||
112 | int cpu; | ||
113 | unsigned int avail; | ||
114 | |||
115 | avail = len / sizeof(struct rds_info_counter); | ||
116 | |||
117 | if (avail < ARRAY_SIZE(rds_stat_names)) { | ||
118 | avail = 0; | ||
119 | goto trans; | ||
120 | } | ||
121 | |||
122 | for_each_online_cpu(cpu) { | ||
123 | src = (uint64_t *)&(per_cpu(rds_stats, cpu)); | ||
124 | sum = (uint64_t *)&stats; | ||
125 | for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) | ||
126 | *(sum++) += *(src++); | ||
127 | } | ||
128 | |||
129 | rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, | ||
130 | ARRAY_SIZE(rds_stat_names)); | ||
131 | avail -= ARRAY_SIZE(rds_stat_names); | ||
132 | |||
133 | trans: | ||
134 | lens->each = sizeof(struct rds_info_counter); | ||
135 | lens->nr = rds_trans_stats_info_copy(iter, avail) + | ||
136 | ARRAY_SIZE(rds_stat_names); | ||
137 | } | ||
138 | |||
139 | void rds_stats_exit(void) | ||
140 | { | ||
141 | rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); | ||
142 | } | ||
143 | |||
144 | int __init rds_stats_init(void) | ||
145 | { | ||
146 | rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); | ||
147 | return 0; | ||
148 | } | ||
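
A note on the per-CPU summation in rds_stats_info() above: every online CPU keeps its own struct rds_statistics, and the snapshot is built by walking each per-CPU struct as a flat array of uint64_t counters and accumulating element by element. A minimal user-space sketch of that folding idiom (plain C, with a made-up NCPUS and a stand-in three-counter struct, not the kernel's per-CPU API):

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4				/* hypothetical CPU count */

struct stats {				/* stand-in for struct rds_statistics */
	uint64_t s_a, s_b, s_c;
};

int main(void)
{
	struct stats percpu[NCPUS] = {
		{ 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 }, { 10, 11, 12 }
	};
	struct stats sum = { 0, 0, 0 };
	uint64_t *dst, *src;
	size_t i;
	int cpu;

	/* Same trick as the for_each_online_cpu() loop above: treat both
	 * structs as arrays of uint64_t and add them slot by slot. */
	for (cpu = 0; cpu < NCPUS; cpu++) {
		src = (uint64_t *)&percpu[cpu];
		dst = (uint64_t *)&sum;
		for (i = 0; i < sizeof(sum) / sizeof(uint64_t); i++)
			*(dst++) += *(src++);
	}

	printf("s_a=%llu s_b=%llu s_c=%llu\n",
	       (unsigned long long)sum.s_a,
	       (unsigned long long)sum.s_b,
	       (unsigned long long)sum.s_c);
	return 0;
}

The idiom only works because every member of the statistics struct is a 64-bit counter, which is also why rds_stat_names[] has to stay in lockstep with the struct layout (hence the vim substitution noted at the top of the file).
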
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c new file mode 100644 index 000000000000..307dc5c1be15 --- /dev/null +++ b/net/rds/sysctl.c | |||
@@ -0,0 +1,122 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/sysctl.h> | ||
35 | #include <linux/proc_fs.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | |||
39 | static struct ctl_table_header *rds_sysctl_reg_table; | ||
40 | |||
41 | static unsigned long rds_sysctl_reconnect_min = 1; | ||
42 | static unsigned long rds_sysctl_reconnect_max = ~0UL; | ||
43 | |||
44 | unsigned long rds_sysctl_reconnect_min_jiffies; | ||
45 | unsigned long rds_sysctl_reconnect_max_jiffies = HZ; | ||
46 | |||
47 | unsigned int rds_sysctl_max_unacked_packets = 8; | ||
48 | unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); | ||
49 | |||
50 | unsigned int rds_sysctl_ping_enable = 1; | ||
51 | |||
52 | static ctl_table rds_sysctl_rds_table[] = { | ||
53 | { | ||
54 | .ctl_name = CTL_UNNUMBERED, | ||
55 | .procname = "reconnect_min_delay_ms", | ||
56 | .data = &rds_sysctl_reconnect_min_jiffies, | ||
57 | .maxlen = sizeof(unsigned long), | ||
58 | .mode = 0644, | ||
59 | .proc_handler = &proc_doulongvec_ms_jiffies_minmax, | ||
60 | .extra1 = &rds_sysctl_reconnect_min, | ||
61 | .extra2 = &rds_sysctl_reconnect_max_jiffies, | ||
62 | }, | ||
63 | { | ||
64 | .ctl_name = CTL_UNNUMBERED, | ||
65 | .procname = "reconnect_max_delay_ms", | ||
66 | .data = &rds_sysctl_reconnect_max_jiffies, | ||
67 | .maxlen = sizeof(unsigned long), | ||
68 | .mode = 0644, | ||
69 | .proc_handler = &proc_doulongvec_ms_jiffies_minmax, | ||
70 | .extra1 = &rds_sysctl_reconnect_min_jiffies, | ||
71 | .extra2 = &rds_sysctl_reconnect_max, | ||
72 | }, | ||
73 | { | ||
74 | .ctl_name = CTL_UNNUMBERED, | ||
75 | .procname = "max_unacked_packets", | ||
76 | .data = &rds_sysctl_max_unacked_packets, | ||
77 | .maxlen = sizeof(int), | ||
78 | .mode = 0644, | ||
79 | .proc_handler = &proc_dointvec, | ||
80 | }, | ||
81 | { | ||
82 | .ctl_name = CTL_UNNUMBERED, | ||
83 | .procname = "max_unacked_bytes", | ||
84 | .data = &rds_sysctl_max_unacked_bytes, | ||
85 | .maxlen = sizeof(int), | ||
86 | .mode = 0644, | ||
87 | .proc_handler = &proc_dointvec, | ||
88 | }, | ||
89 | { | ||
90 | .ctl_name = CTL_UNNUMBERED, | ||
91 | .procname = "ping_enable", | ||
92 | .data = &rds_sysctl_ping_enable, | ||
93 | .maxlen = sizeof(int), | ||
94 | .mode = 0644, | ||
95 | .proc_handler = &proc_dointvec, | ||
96 | }, | ||
97 | { .ctl_name = 0} | ||
98 | }; | ||
99 | |||
100 | static struct ctl_path rds_sysctl_path[] = { | ||
101 | { .procname = "net", .ctl_name = CTL_NET, }, | ||
102 | { .procname = "rds", .ctl_name = CTL_UNNUMBERED, }, | ||
103 | { } | ||
104 | }; | ||
105 | |||
106 | |||
107 | void rds_sysctl_exit(void) | ||
108 | { | ||
109 | if (rds_sysctl_reg_table) | ||
110 | unregister_sysctl_table(rds_sysctl_reg_table); | ||
111 | } | ||
112 | |||
113 | int __init rds_sysctl_init(void) | ||
114 | { | ||
115 | rds_sysctl_reconnect_min = msecs_to_jiffies(1); | ||
116 | rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; | ||
117 | |||
118 | rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); | ||
119 | if (rds_sysctl_reg_table == NULL) | ||
120 | return -ENOMEM; | ||
121 | return 0; | ||
122 | } | ||
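
Because rds_sysctl_path registers "net" and then "rds", the table above shows up as files under /proc/sys/net/rds/ once the module is loaded. A small, hypothetical reader in plain C (file names taken from the .procname fields above; error handling kept minimal):

#include <stdio.h>

int main(void)
{
	static const char *knobs[] = {
		"reconnect_min_delay_ms", "reconnect_max_delay_ms",
		"max_unacked_packets", "max_unacked_bytes", "ping_enable",
	};
	char path[128], value[64];
	size_t i;
	FILE *f;

	for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
		snprintf(path, sizeof(path), "/proc/sys/net/rds/%s", knobs[i]);
		f = fopen(path, "r");
		if (!f) {
			perror(path);	/* e.g. the RDS module isn't loaded */
			continue;
		}
		if (fgets(value, sizeof(value), f))
			printf("%s = %s", knobs[i], value);
		fclose(f);
	}
	return 0;
}

Note that the two reconnect knobs are exposed in milliseconds but stored in jiffies; proc_doulongvec_ms_jiffies_minmax converts in both directions and clamps writes to the extra1/extra2 bounds.
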
diff --git a/net/rds/threads.c b/net/rds/threads.c new file mode 100644 index 000000000000..828a1bf9ea92 --- /dev/null +++ b/net/rds/threads.c | |||
@@ -0,0 +1,265 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/random.h> | ||
35 | |||
36 | #include "rds.h" | ||
37 | |||
38 | /* | ||
39 | * All of connection management is simplified by serializing it through | ||
40 | * work queues that execute in a connection managing thread. | ||
41 | * | ||
42 | * TCP wants to send acks through sendpage() in response to data_ready(), | ||
43 | * but it needs a process context to do so. | ||
44 | * | ||
45 | * The receive paths need to allocate but can't drop packets (!), so we have | ||
46 | * a thread around that can block on allocation whenever the receive fast | ||
47 | * path sees an allocation failure. | ||
48 | */ | ||
49 | |||
50 | /* Grand Unified Theory of connection life cycle: | ||
51 | * At any point in time, the connection can be in one of these states: | ||
52 | * DOWN, CONNECTING, UP, DISCONNECTING, ERROR | ||
53 | * | ||
54 | * The following transitions are possible: | ||
55 | * ANY -> ERROR | ||
56 | * UP -> DISCONNECTING | ||
57 | * ERROR -> DISCONNECTING | ||
58 | * DISCONNECTING -> DOWN | ||
59 | * DOWN -> CONNECTING | ||
60 | * CONNECTING -> UP | ||
61 | * | ||
62 | * Transition to state DISCONNECTING/DOWN: | ||
63 | * - Inside the shutdown worker; synchronizes with xmit path | ||
64 | * through c_send_lock, and with connection management callbacks | ||
65 | * via c_cm_lock. | ||
66 | * | ||
67 | * For receive callbacks, we rely on the underlying transport | ||
68 | * (TCP, IB/RDMA) to provide the necessary synchronisation. | ||
69 | */ | ||
70 | struct workqueue_struct *rds_wq; | ||
71 | |||
72 | void rds_connect_complete(struct rds_connection *conn) | ||
73 | { | ||
74 | if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) { | ||
75 | printk(KERN_WARNING "%s: Cannot transition to state UP, " | ||
76 | "current state is %d\n", | ||
77 | __func__, | ||
78 | atomic_read(&conn->c_state)); | ||
79 | atomic_set(&conn->c_state, RDS_CONN_ERROR); | ||
80 | queue_work(rds_wq, &conn->c_down_w); | ||
81 | return; | ||
82 | } | ||
83 | |||
84 | rdsdebug("conn %p for %pI4 to %pI4 complete\n", | ||
85 | conn, &conn->c_laddr, &conn->c_faddr); | ||
86 | |||
87 | conn->c_reconnect_jiffies = 0; | ||
88 | set_bit(0, &conn->c_map_queued); | ||
89 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
90 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | ||
91 | } | ||
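
rds_conn_transition(), defined in rds.h earlier in this commit, is used here and below as "move the connection from OLD to NEW atomically and report whether that worked". As an illustration of the pattern only (not the kernel's implementation), the state machine from the comment above boils down to a compare-and-swap on the state word; a user-space sketch with C11 atomics and hypothetical state constants:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum conn_state { S_DOWN, S_CONNECTING, S_UP, S_DISCONNECTING, S_ERROR };

/* Illustrative only: succeeds iff the state currently equals 'old'. */
static bool conn_transition(atomic_int *state, int old, int next)
{
	return atomic_compare_exchange_strong(state, &old, next);
}

int main(void)
{
	atomic_int state = S_DOWN;

	/* DOWN -> CONNECTING -> UP is an allowed path... */
	printf("connect: %d\n", conn_transition(&state, S_DOWN, S_CONNECTING));
	printf("up:      %d\n", conn_transition(&state, S_CONNECTING, S_UP));
	/* ...but a racing second DOWN -> CONNECTING attempt now fails. */
	printf("racer:   %d\n", conn_transition(&state, S_DOWN, S_CONNECTING));
	return 0;
}

This is why rds_connect_complete() can simply fall into the error path when its CONNECTING -> UP transition fails: some other path has already moved the connection to a different state.
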
92 | |||
93 | /* | ||
94 | * This random exponential backoff is relied on to eventually resolve racing | ||
95 | * connects. | ||
96 | * | ||
97 | * If connect attempts race then both parties drop both connections and come | ||
98 | * here to wait for a random amount of time before trying again. Eventually | ||
99 | * the backoff range will be so much greater than the time it takes to | ||
100 | * establish a connection that one of the pair will establish the connection | ||
101 | * before the other's random delay fires. | ||
102 | * | ||
103 | * Connection attempts that arrive while a connection is already established | ||
104 | * are also considered to be racing connects. This lets a connection from | ||
105 | * a rebooted machine replace an existing stale connection before the transport | ||
106 | * notices that the connection has failed. | ||
107 | * | ||
108 | * We should *always* start with a random backoff; otherwise a broken connection | ||
109 | * will always take several iterations to be re-established. | ||
110 | */ | ||
111 | static void rds_queue_reconnect(struct rds_connection *conn) | ||
112 | { | ||
113 | unsigned long rand; | ||
114 | |||
115 | rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", | ||
116 | conn, &conn->c_laddr, &conn->c_faddr, | ||
117 | conn->c_reconnect_jiffies); | ||
118 | |||
119 | set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); | ||
120 | if (conn->c_reconnect_jiffies == 0) { | ||
121 | conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; | ||
122 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | ||
123 | return; | ||
124 | } | ||
125 | |||
126 | get_random_bytes(&rand, sizeof(rand)); | ||
127 | rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", | ||
128 | rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, | ||
129 | conn, &conn->c_laddr, &conn->c_faddr); | ||
130 | queue_delayed_work(rds_wq, &conn->c_conn_w, | ||
131 | rand % conn->c_reconnect_jiffies); | ||
132 | |||
133 | conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, | ||
134 | rds_sysctl_reconnect_max_jiffies); | ||
135 | } | ||
136 | |||
137 | void rds_connect_worker(struct work_struct *work) | ||
138 | { | ||
139 | struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); | ||
140 | int ret; | ||
141 | |||
142 | clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); | ||
143 | if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { | ||
144 | ret = conn->c_trans->conn_connect(conn); | ||
145 | rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", | ||
146 | conn, &conn->c_laddr, &conn->c_faddr, ret); | ||
147 | |||
148 | if (ret) { | ||
149 | if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) | ||
150 | rds_queue_reconnect(conn); | ||
151 | else | ||
152 | rds_conn_error(conn, "RDS: connect failed\n"); | ||
153 | } | ||
154 | } | ||
155 | } | ||
156 | |||
157 | void rds_shutdown_worker(struct work_struct *work) | ||
158 | { | ||
159 | struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); | ||
160 | |||
161 | /* shut it down unless it's down already */ | ||
162 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { | ||
163 | /* | ||
164 | * Quiesce the connection mgmt handlers before we start tearing | ||
165 | * things down. We don't hold the mutex for the entire | ||
166 | * duration of the shutdown operation, otherwise we could | ||
167 | * deadlock with the CM handler. Instead, the CM event | ||
168 | * handler is supposed to check for state DISCONNECTING. | ||
169 | */ | ||
170 | mutex_lock(&conn->c_cm_lock); | ||
171 | if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) | ||
172 | && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { | ||
173 | rds_conn_error(conn, "shutdown called in state %d\n", | ||
174 | atomic_read(&conn->c_state)); | ||
175 | mutex_unlock(&conn->c_cm_lock); | ||
176 | return; | ||
177 | } | ||
178 | mutex_unlock(&conn->c_cm_lock); | ||
179 | |||
180 | mutex_lock(&conn->c_send_lock); | ||
181 | conn->c_trans->conn_shutdown(conn); | ||
182 | rds_conn_reset(conn); | ||
183 | mutex_unlock(&conn->c_send_lock); | ||
184 | |||
185 | if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { | ||
186 | /* This can happen - eg when we're in the middle of tearing | ||
187 | * down the connection, and someone unloads the rds module. | ||
188 | * Quite reproducible with loopback connections. | ||
189 | * Mostly harmless. | ||
190 | */ | ||
191 | rds_conn_error(conn, | ||
192 | "%s: failed to transition to state DOWN, " | ||
193 | "current state is %d\n", | ||
194 | __func__, | ||
195 | atomic_read(&conn->c_state)); | ||
196 | return; | ||
197 | } | ||
198 | } | ||
199 | |||
200 | /* Then reconnect if it's still live. | ||
201 | * The passive side of an IB loopback connection is never added | ||
202 | * to the conn hash, so we never trigger a reconnect on this | ||
203 | * conn - the reconnect is always triggered by the active peer. */ | ||
204 | cancel_delayed_work(&conn->c_conn_w); | ||
205 | if (!hlist_unhashed(&conn->c_hash_node)) | ||
206 | rds_queue_reconnect(conn); | ||
207 | } | ||
208 | |||
209 | void rds_send_worker(struct work_struct *work) | ||
210 | { | ||
211 | struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); | ||
212 | int ret; | ||
213 | |||
214 | if (rds_conn_state(conn) == RDS_CONN_UP) { | ||
215 | ret = rds_send_xmit(conn); | ||
216 | rdsdebug("conn %p ret %d\n", conn, ret); | ||
217 | switch (ret) { | ||
218 | case -EAGAIN: | ||
219 | rds_stats_inc(s_send_immediate_retry); | ||
220 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | ||
221 | break; | ||
222 | case -ENOMEM: | ||
223 | rds_stats_inc(s_send_delayed_retry); | ||
224 | queue_delayed_work(rds_wq, &conn->c_send_w, 2); | ||
225 | default: | ||
226 | break; | ||
227 | } | ||
228 | } | ||
229 | } | ||
230 | |||
231 | void rds_recv_worker(struct work_struct *work) | ||
232 | { | ||
233 | struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work); | ||
234 | int ret; | ||
235 | |||
236 | if (rds_conn_state(conn) == RDS_CONN_UP) { | ||
237 | ret = conn->c_trans->recv(conn); | ||
238 | rdsdebug("conn %p ret %d\n", conn, ret); | ||
239 | switch (ret) { | ||
240 | case -EAGAIN: | ||
241 | rds_stats_inc(s_recv_immediate_retry); | ||
242 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | ||
243 | break; | ||
244 | case -ENOMEM: | ||
245 | rds_stats_inc(s_recv_delayed_retry); | ||
246 | queue_delayed_work(rds_wq, &conn->c_recv_w, 2); | ||
247 | default: | ||
248 | break; | ||
249 | } | ||
250 | } | ||
251 | } | ||
252 | |||
253 | void rds_threads_exit(void) | ||
254 | { | ||
255 | destroy_workqueue(rds_wq); | ||
256 | } | ||
257 | |||
258 | int __init rds_threads_init(void) | ||
259 | { | ||
260 | rds_wq = create_singlethread_workqueue("krdsd"); | ||
261 | if (rds_wq == NULL) | ||
262 | return -ENOMEM; | ||
263 | |||
264 | return 0; | ||
265 | } | ||
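
rds_queue_reconnect() above implements the randomised exponential backoff described in its comment: the first attempt after a connection was up fires immediately, every later attempt waits a random number of jiffies below the current ceiling, and the ceiling doubles until it hits the maximum. A self-contained user-space sketch of the same schedule (plain C; the min/max values are made up and milliseconds stand in for jiffies):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	const unsigned long min_ms = 100;	/* stand-in for the min delay */
	const unsigned long max_ms = 30000;	/* stand-in for the max delay */
	unsigned long ceiling = 0;
	int attempt;

	srand((unsigned int)time(NULL));

	for (attempt = 1; attempt <= 8; attempt++) {
		unsigned long delay;

		if (ceiling == 0) {
			/* Fresh failure after an established connection:
			 * reconnect immediately, then arm the backoff. */
			delay = 0;
			ceiling = min_ms;
		} else {
			/* Random delay below the ceiling, then double the
			 * ceiling, capped at the maximum. */
			delay = (unsigned long)rand() % ceiling;
			ceiling = ceiling * 2 > max_ms ? max_ms : ceiling * 2;
		}
		printf("attempt %d: delay %lu ms (next ceiling %lu ms)\n",
		       attempt, delay, ceiling);
	}
	return 0;
}

In the real code the delay is handed to queue_delayed_work() in jiffies, and the minimum and maximum ceilings come from the reconnect_*_delay_ms sysctls registered in sysctl.c above.
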
diff --git a/net/rds/transport.c b/net/rds/transport.c new file mode 100644 index 000000000000..767da61ad2f3 --- /dev/null +++ b/net/rds/transport.c | |||
@@ -0,0 +1,117 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/module.h> | ||
35 | #include <linux/in.h> | ||
36 | |||
37 | #include "rds.h" | ||
38 | #include "loop.h" | ||
39 | |||
40 | static LIST_HEAD(rds_transports); | ||
41 | static DECLARE_RWSEM(rds_trans_sem); | ||
42 | |||
43 | int rds_trans_register(struct rds_transport *trans) | ||
44 | { | ||
45 | BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); | ||
46 | |||
47 | down_write(&rds_trans_sem); | ||
48 | |||
49 | list_add_tail(&trans->t_item, &rds_transports); | ||
50 | printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); | ||
51 | |||
52 | up_write(&rds_trans_sem); | ||
53 | |||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | void rds_trans_unregister(struct rds_transport *trans) | ||
58 | { | ||
59 | down_write(&rds_trans_sem); | ||
60 | |||
61 | list_del_init(&trans->t_item); | ||
62 | printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); | ||
63 | |||
64 | up_write(&rds_trans_sem); | ||
65 | } | ||
66 | |||
67 | struct rds_transport *rds_trans_get_preferred(__be32 addr) | ||
68 | { | ||
69 | struct rds_transport *trans; | ||
70 | struct rds_transport *ret = NULL; | ||
71 | |||
72 | if (IN_LOOPBACK(ntohl(addr))) | ||
73 | return &rds_loop_transport; | ||
74 | |||
75 | down_read(&rds_trans_sem); | ||
76 | list_for_each_entry(trans, &rds_transports, t_item) { | ||
77 | if (trans->laddr_check(addr) == 0) { | ||
78 | ret = trans; | ||
79 | break; | ||
80 | } | ||
81 | } | ||
82 | up_read(&rds_trans_sem); | ||
83 | |||
84 | return ret; | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * This returns the number of stats entries in the snapshot and only | ||
89 | * copies them using the iter if there is enough space for them. The | ||
90 | * caller passes in how much space is available so that each transport can | ||
91 | * size and copy its counters while we hold rds_trans_sem. | ||
92 | */ | ||
93 | unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, | ||
94 | unsigned int avail) | ||
95 | |||
96 | { | ||
97 | struct rds_transport *trans; | ||
98 | unsigned int total = 0; | ||
99 | unsigned int part; | ||
100 | |||
101 | rds_info_iter_unmap(iter); | ||
102 | down_read(&rds_trans_sem); | ||
103 | |||
104 | list_for_each_entry(trans, &rds_transports, t_item) { | ||
105 | if (trans->stats_info_copy == NULL) | ||
106 | continue; | ||
107 | |||
108 | part = trans->stats_info_copy(iter, avail); | ||
109 | avail -= min(avail, part); | ||
110 | total += part; | ||
111 | } | ||
112 | |||
113 | up_read(&rds_trans_sem); | ||
114 | |||
115 | return total; | ||
116 | } | ||
117 | |||
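
Tying the stats and transport pieces together: rds_stats_info() and rds_trans_stats_info_copy() feed the RDS_INFO_COUNTERS info source, which userspace reads through getsockopt() on an RDS socket rather than through procfs. Below is a hypothetical rds-info style dump; SOL_RDS, RDS_INFO_COUNTERS and struct rds_info_counter are taken from <linux/rds.h>, the PF_RDS/SOL_RDS fallback values and the buffer-sizing behaviour are assumptions rather than something shown in this hunk, and a real tool would retry with a larger buffer when the kernel reports it needs more room:

#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/rds.h>

#ifndef PF_RDS
#define PF_RDS 21		/* assumed AF_RDS value for older libcs */
#endif
#ifndef SOL_RDS
#define SOL_RDS 276		/* assumed socket level for RDS */
#endif

int main(void)
{
	struct rds_info_counter *ctrs;
	socklen_t len = 256 * sizeof(*ctrs);	/* assumed to be plenty */
	int fd, i, n;

	fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
	if (fd < 0) {
		perror("socket(PF_RDS)");	/* RDS not loaded/supported */
		return 1;
	}

	ctrs = malloc(len);
	if (!ctrs)
		return 1;

	/* One struct rds_info_counter per name/value pair; on return, len is
	 * expected to hold the number of bytes actually written. */
	if (getsockopt(fd, SOL_RDS, RDS_INFO_COUNTERS, ctrs, &len) < 0) {
		perror("getsockopt(RDS_INFO_COUNTERS)");
		return 1;
	}

	n = len / sizeof(*ctrs);
	for (i = 0; i < n; i++)
		printf("%-32.32s %llu\n", (const char *)ctrs[i].name,
		       (unsigned long long)ctrs[i].value);

	free(ctrs);
	return 0;
}
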