aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/sunrpc/debug.h5
-rw-r--r--net/sunrpc/Makefile1
-rw-r--r--net/sunrpc/xprtrdma/Makefile3
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c9
-rw-r--r--net/sunrpc/xprtrdma/transport.c800
-rw-r--r--net/sunrpc/xprtrdma/verbs.c37
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h330
7 files changed, 1185 insertions, 0 deletions
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 3912cf16361e..3347c72b848a 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -88,6 +88,11 @@ enum {
88 CTL_SLOTTABLE_TCP, 88 CTL_SLOTTABLE_TCP,
89 CTL_MIN_RESVPORT, 89 CTL_MIN_RESVPORT,
90 CTL_MAX_RESVPORT, 90 CTL_MAX_RESVPORT,
91 CTL_SLOTTABLE_RDMA,
92 CTL_RDMA_MAXINLINEREAD,
93 CTL_RDMA_MAXINLINEWRITE,
94 CTL_RDMA_WRITEPADDING,
95 CTL_RDMA_MEMREG,
91}; 96};
92 97
93#endif /* _LINUX_SUNRPC_DEBUG_H_ */ 98#endif /* _LINUX_SUNRPC_DEBUG_H_ */
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8ebfc4db7f51..5c69a725e530 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,6 +5,7 @@
5 5
6obj-$(CONFIG_SUNRPC) += sunrpc.o 6obj-$(CONFIG_SUNRPC) += sunrpc.o
7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ 7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
8obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
8 9
9sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ 10sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
10 auth.o auth_null.o auth_unix.o \ 11 auth.o auth_null.o auth_unix.o \
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 000000000000..264f0feeb513
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 000000000000..b0587f3a5d77
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,9 @@
1/*
2 * Placeholders for subsequent patches
3 */
4
5#include "xprt_rdma.h"
6
7void rpcrdma_conn_func(struct rpcrdma_ep *a) { }
8void rpcrdma_reply_handler(struct rpcrdma_rep *a) { }
9int rpcrdma_marshal_req(struct rpc_rqst *a) { return EINVAL; }
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 000000000000..dc55cc974c90
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,800 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * transport.c
42 *
43 * This file contains the top-level implementation of an RPC RDMA
44 * transport.
45 *
46 * Naming convention: functions beginning with xprt_ are part of the
47 * transport switch. All others are RPC RDMA internal.
48 */
49
50#include <linux/module.h>
51#include <linux/init.h>
52#include <linux/seq_file.h>
53
54#include "xprt_rdma.h"
55
56#ifdef RPC_DEBUG
57# define RPCDBG_FACILITY RPCDBG_TRANS
58#endif
59
60MODULE_LICENSE("Dual BSD/GPL");
61
62MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
63MODULE_AUTHOR("Network Appliance, Inc.");
64
65/*
66 * tunables
67 */
68
69static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
70static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
71static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
72static unsigned int xprt_rdma_inline_write_padding;
73#if !RPCRDMA_PERSISTENT_REGISTRATION
74static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
75#else
76static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
77#endif
78
79#ifdef RPC_DEBUG
80
81static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
82static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
83static unsigned int zero;
84static unsigned int max_padding = PAGE_SIZE;
85static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
86static unsigned int max_memreg = RPCRDMA_LAST - 1;
87
88static struct ctl_table_header *sunrpc_table_header;
89
90static ctl_table xr_tunables_table[] = {
91 {
92 .ctl_name = CTL_SLOTTABLE_RDMA,
93 .procname = "rdma_slot_table_entries",
94 .data = &xprt_rdma_slot_table_entries,
95 .maxlen = sizeof(unsigned int),
96 .mode = 0644,
97 .proc_handler = &proc_dointvec_minmax,
98 .strategy = &sysctl_intvec,
99 .extra1 = &min_slot_table_size,
100 .extra2 = &max_slot_table_size
101 },
102 {
103 .ctl_name = CTL_RDMA_MAXINLINEREAD,
104 .procname = "rdma_max_inline_read",
105 .data = &xprt_rdma_max_inline_read,
106 .maxlen = sizeof(unsigned int),
107 .mode = 0644,
108 .proc_handler = &proc_dointvec,
109 .strategy = &sysctl_intvec,
110 },
111 {
112 .ctl_name = CTL_RDMA_MAXINLINEWRITE,
113 .procname = "rdma_max_inline_write",
114 .data = &xprt_rdma_max_inline_write,
115 .maxlen = sizeof(unsigned int),
116 .mode = 0644,
117 .proc_handler = &proc_dointvec,
118 .strategy = &sysctl_intvec,
119 },
120 {
121 .ctl_name = CTL_RDMA_WRITEPADDING,
122 .procname = "rdma_inline_write_padding",
123 .data = &xprt_rdma_inline_write_padding,
124 .maxlen = sizeof(unsigned int),
125 .mode = 0644,
126 .proc_handler = &proc_dointvec_minmax,
127 .strategy = &sysctl_intvec,
128 .extra1 = &zero,
129 .extra2 = &max_padding,
130 },
131 {
132 .ctl_name = CTL_RDMA_MEMREG,
133 .procname = "rdma_memreg_strategy",
134 .data = &xprt_rdma_memreg_strategy,
135 .maxlen = sizeof(unsigned int),
136 .mode = 0644,
137 .proc_handler = &proc_dointvec_minmax,
138 .strategy = &sysctl_intvec,
139 .extra1 = &min_memreg,
140 .extra2 = &max_memreg,
141 },
142 {
143 .ctl_name = 0,
144 },
145};
146
147static ctl_table sunrpc_table[] = {
148 {
149 .ctl_name = CTL_SUNRPC,
150 .procname = "sunrpc",
151 .mode = 0555,
152 .child = xr_tunables_table
153 },
154 {
155 .ctl_name = 0,
156 },
157};
158
159#endif
160
161static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
162
163static void
164xprt_rdma_format_addresses(struct rpc_xprt *xprt)
165{
166 struct sockaddr_in *addr = (struct sockaddr_in *)
167 &rpcx_to_rdmad(xprt).addr;
168 char *buf;
169
170 buf = kzalloc(20, GFP_KERNEL);
171 if (buf)
172 snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr));
173 xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
174
175 buf = kzalloc(8, GFP_KERNEL);
176 if (buf)
177 snprintf(buf, 8, "%u", ntohs(addr->sin_port));
178 xprt->address_strings[RPC_DISPLAY_PORT] = buf;
179
180 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
181
182 buf = kzalloc(48, GFP_KERNEL);
183 if (buf)
184 snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
185 NIPQUAD(addr->sin_addr.s_addr),
186 ntohs(addr->sin_port), "rdma");
187 xprt->address_strings[RPC_DISPLAY_ALL] = buf;
188
189 buf = kzalloc(10, GFP_KERNEL);
190 if (buf)
191 snprintf(buf, 10, "%02x%02x%02x%02x",
192 NIPQUAD(addr->sin_addr.s_addr));
193 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
194
195 buf = kzalloc(8, GFP_KERNEL);
196 if (buf)
197 snprintf(buf, 8, "%4hx", ntohs(addr->sin_port));
198 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
199
200 buf = kzalloc(30, GFP_KERNEL);
201 if (buf)
202 snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
203 NIPQUAD(addr->sin_addr.s_addr),
204 ntohs(addr->sin_port) >> 8,
205 ntohs(addr->sin_port) & 0xff);
206 xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
207
208 /* netid */
209 xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
210}
211
212static void
213xprt_rdma_free_addresses(struct rpc_xprt *xprt)
214{
215 kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
216 kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
217 kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
218 kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]);
219 kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
220 kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]);
221}
222
223static void
224xprt_rdma_connect_worker(struct work_struct *work)
225{
226 struct rpcrdma_xprt *r_xprt =
227 container_of(work, struct rpcrdma_xprt, rdma_connect.work);
228 struct rpc_xprt *xprt = &r_xprt->xprt;
229 int rc = 0;
230
231 if (!xprt->shutdown) {
232 xprt_clear_connected(xprt);
233
234 dprintk("RPC: %s: %sconnect\n", __func__,
235 r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
236 rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
237 if (rc)
238 goto out;
239 }
240 goto out_clear;
241
242out:
243 xprt_wake_pending_tasks(xprt, rc);
244
245out_clear:
246 dprintk("RPC: %s: exit\n", __func__);
247 xprt_clear_connecting(xprt);
248}
249
250/*
251 * xprt_rdma_destroy
252 *
253 * Destroy the xprt.
254 * Free all memory associated with the object, including its own.
255 * NOTE: none of the *destroy methods free memory for their top-level
256 * objects, even though they may have allocated it (they do free
257 * private memory). It's up to the caller to handle it. In this
258 * case (RDMA transport), all structure memory is inlined with the
259 * struct rpcrdma_xprt.
260 */
261static void
262xprt_rdma_destroy(struct rpc_xprt *xprt)
263{
264 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
265 int rc;
266
267 dprintk("RPC: %s: called\n", __func__);
268
269 cancel_delayed_work(&r_xprt->rdma_connect);
270 flush_scheduled_work();
271
272 xprt_clear_connected(xprt);
273
274 rpcrdma_buffer_destroy(&r_xprt->rx_buf);
275 rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
276 if (rc)
277 dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
278 __func__, rc);
279 rpcrdma_ia_close(&r_xprt->rx_ia);
280
281 xprt_rdma_free_addresses(xprt);
282
283 kfree(xprt->slot);
284 xprt->slot = NULL;
285 kfree(xprt);
286
287 dprintk("RPC: %s: returning\n", __func__);
288
289 module_put(THIS_MODULE);
290}
291
292/**
293 * xprt_setup_rdma - Set up transport to use RDMA
294 *
295 * @args: rpc transport arguments
296 */
297static struct rpc_xprt *
298xprt_setup_rdma(struct xprt_create *args)
299{
300 struct rpcrdma_create_data_internal cdata;
301 struct rpc_xprt *xprt;
302 struct rpcrdma_xprt *new_xprt;
303 struct rpcrdma_ep *new_ep;
304 struct sockaddr_in *sin;
305 int rc;
306
307 if (args->addrlen > sizeof(xprt->addr)) {
308 dprintk("RPC: %s: address too large\n", __func__);
309 return ERR_PTR(-EBADF);
310 }
311
312 xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL);
313 if (xprt == NULL) {
314 dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
315 __func__);
316 return ERR_PTR(-ENOMEM);
317 }
318
319 xprt->max_reqs = xprt_rdma_slot_table_entries;
320 xprt->slot = kcalloc(xprt->max_reqs,
321 sizeof(struct rpc_rqst), GFP_KERNEL);
322 if (xprt->slot == NULL) {
323 kfree(xprt);
324 dprintk("RPC: %s: couldn't allocate %d slots\n",
325 __func__, xprt->max_reqs);
326 return ERR_PTR(-ENOMEM);
327 }
328
329 /* 60 second timeout, no retries */
330 xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ);
331 xprt->bind_timeout = (60U * HZ);
332 xprt->connect_timeout = (60U * HZ);
333 xprt->reestablish_timeout = (5U * HZ);
334 xprt->idle_timeout = (5U * 60 * HZ);
335
336 xprt->resvport = 0; /* privileged port not needed */
337 xprt->tsh_size = 0; /* RPC-RDMA handles framing */
338 xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
339 xprt->ops = &xprt_rdma_procs;
340
341 /*
342 * Set up RDMA-specific connect data.
343 */
344
345 /* Put server RDMA address in local cdata */
346 memcpy(&cdata.addr, args->dstaddr, args->addrlen);
347
348 /* Ensure xprt->addr holds valid server TCP (not RDMA)
349 * address, for any side protocols which peek at it */
350 xprt->prot = IPPROTO_TCP;
351 xprt->addrlen = args->addrlen;
352 memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
353
354 sin = (struct sockaddr_in *)&cdata.addr;
355 if (ntohs(sin->sin_port) != 0)
356 xprt_set_bound(xprt);
357
358 dprintk("RPC: %s: %u.%u.%u.%u:%u\n", __func__,
359 NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
360
361 /* Set max requests */
362 cdata.max_requests = xprt->max_reqs;
363
364 /* Set some length limits */
365 cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
366 cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
367
368 cdata.inline_wsize = xprt_rdma_max_inline_write;
369 if (cdata.inline_wsize > cdata.wsize)
370 cdata.inline_wsize = cdata.wsize;
371
372 cdata.inline_rsize = xprt_rdma_max_inline_read;
373 if (cdata.inline_rsize > cdata.rsize)
374 cdata.inline_rsize = cdata.rsize;
375
376 cdata.padding = xprt_rdma_inline_write_padding;
377
378 /*
379 * Create new transport instance, which includes initialized
380 * o ia
381 * o endpoint
382 * o buffers
383 */
384
385 new_xprt = rpcx_to_rdmax(xprt);
386
387 rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
388 xprt_rdma_memreg_strategy);
389 if (rc)
390 goto out1;
391
392 /*
393 * initialize and create ep
394 */
395 new_xprt->rx_data = cdata;
396 new_ep = &new_xprt->rx_ep;
397 new_ep->rep_remote_addr = cdata.addr;
398
399 rc = rpcrdma_ep_create(&new_xprt->rx_ep,
400 &new_xprt->rx_ia, &new_xprt->rx_data);
401 if (rc)
402 goto out2;
403
404 /*
405 * Allocate pre-registered send and receive buffers for headers and
406 * any inline data. Also specify any padding which will be provided
407 * from a preregistered zero buffer.
408 */
409 rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
410 &new_xprt->rx_data);
411 if (rc)
412 goto out3;
413
414 /*
415 * Register a callback for connection events. This is necessary because
416 * connection loss notification is async. We also catch connection loss
417 * when reaping receives.
418 */
419 INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
420 new_ep->rep_func = rpcrdma_conn_func;
421 new_ep->rep_xprt = xprt;
422
423 xprt_rdma_format_addresses(xprt);
424
425 if (!try_module_get(THIS_MODULE))
426 goto out4;
427
428 return xprt;
429
430out4:
431 xprt_rdma_free_addresses(xprt);
432 rc = -EINVAL;
433out3:
434 (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
435out2:
436 rpcrdma_ia_close(&new_xprt->rx_ia);
437out1:
438 kfree(xprt->slot);
439 kfree(xprt);
440 return ERR_PTR(rc);
441}
442
443/*
444 * Close a connection, during shutdown or timeout/reconnect
445 */
446static void
447xprt_rdma_close(struct rpc_xprt *xprt)
448{
449 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
450
451 dprintk("RPC: %s: closing\n", __func__);
452 xprt_disconnect(xprt);
453 (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
454}
455
456static void
457xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
458{
459 struct sockaddr_in *sap;
460
461 sap = (struct sockaddr_in *)&xprt->addr;
462 sap->sin_port = htons(port);
463 sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
464 sap->sin_port = htons(port);
465 dprintk("RPC: %s: %u\n", __func__, port);
466}
467
468static void
469xprt_rdma_connect(struct rpc_task *task)
470{
471 struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt;
472 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
473
474 if (!xprt_test_and_set_connecting(xprt)) {
475 if (r_xprt->rx_ep.rep_connected != 0) {
476 /* Reconnect */
477 schedule_delayed_work(&r_xprt->rdma_connect,
478 xprt->reestablish_timeout);
479 } else {
480 schedule_delayed_work(&r_xprt->rdma_connect, 0);
481 if (!RPC_IS_ASYNC(task))
482 flush_scheduled_work();
483 }
484 }
485}
486
487static int
488xprt_rdma_reserve_xprt(struct rpc_task *task)
489{
490 struct rpc_xprt *xprt = task->tk_xprt;
491 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
492 int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
493
494 /* == RPC_CWNDSCALE @ init, but *after* setup */
495 if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
496 r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
497 dprintk("RPC: %s: cwndscale %lu\n", __func__,
498 r_xprt->rx_buf.rb_cwndscale);
499 BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
500 }
501 xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
502 return xprt_reserve_xprt_cong(task);
503}
504
505/*
506 * The RDMA allocate/free functions need the task structure as a place
507 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
508 * sequence. For this reason, the recv buffers are attached to send
509 * buffers for portions of the RPC. Note that the RPC layer allocates
510 * both send and receive buffers in the same call. We may register
511 * the receive buffer portion when using reply chunks.
512 */
513static void *
514xprt_rdma_allocate(struct rpc_task *task, size_t size)
515{
516 struct rpc_xprt *xprt = task->tk_xprt;
517 struct rpcrdma_req *req, *nreq;
518
519 req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
520 BUG_ON(NULL == req);
521
522 if (size > req->rl_size) {
523 dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
524 "prog %d vers %d proc %d\n",
525 __func__, size, req->rl_size,
526 task->tk_client->cl_prog, task->tk_client->cl_vers,
527 task->tk_msg.rpc_proc->p_proc);
528 /*
529 * Outgoing length shortage. Our inline write max must have
530 * been configured to perform direct i/o.
531 *
532 * This is therefore a large metadata operation, and the
533 * allocate call was made on the maximum possible message,
534 * e.g. containing long filename(s) or symlink data. In
535 * fact, while these metadata operations *might* carry
536 * large outgoing payloads, they rarely *do*. However, we
537 * have to commit to the request here, so reallocate and
538 * register it now. The data path will never require this
539 * reallocation.
540 *
541 * If the allocation or registration fails, the RPC framework
542 * will (doggedly) retry.
543 */
544 if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
545 RPCRDMA_BOUNCEBUFFERS) {
546 /* forced to "pure inline" */
547 dprintk("RPC: %s: too much data (%zd) for inline "
548 "(r/w max %d/%d)\n", __func__, size,
549 rpcx_to_rdmad(xprt).inline_rsize,
550 rpcx_to_rdmad(xprt).inline_wsize);
551 size = req->rl_size;
552 rpc_exit(task, -EIO); /* fail the operation */
553 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
554 goto out;
555 }
556 if (task->tk_flags & RPC_TASK_SWAPPER)
557 nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
558 else
559 nreq = kmalloc(sizeof *req + size, GFP_NOFS);
560 if (nreq == NULL)
561 goto outfail;
562
563 if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
564 nreq->rl_base, size + sizeof(struct rpcrdma_req)
565 - offsetof(struct rpcrdma_req, rl_base),
566 &nreq->rl_handle, &nreq->rl_iov)) {
567 kfree(nreq);
568 goto outfail;
569 }
570 rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
571 nreq->rl_size = size;
572 nreq->rl_niovs = 0;
573 nreq->rl_nchunks = 0;
574 nreq->rl_buffer = (struct rpcrdma_buffer *)req;
575 nreq->rl_reply = req->rl_reply;
576 memcpy(nreq->rl_segments,
577 req->rl_segments, sizeof nreq->rl_segments);
578 /* flag the swap with an unused field */
579 nreq->rl_iov.length = 0;
580 req->rl_reply = NULL;
581 req = nreq;
582 }
583 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
584out:
585 return req->rl_xdr_buf;
586
587outfail:
588 rpcrdma_buffer_put(req);
589 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
590 return NULL;
591}
592
593/*
594 * This function returns all RDMA resources to the pool.
595 */
596static void
597xprt_rdma_free(void *buffer)
598{
599 struct rpcrdma_req *req;
600 struct rpcrdma_xprt *r_xprt;
601 struct rpcrdma_rep *rep;
602 int i;
603
604 if (buffer == NULL)
605 return;
606
607 req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
608 r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
609 rep = req->rl_reply;
610
611 dprintk("RPC: %s: called on 0x%p%s\n",
612 __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
613
614 /*
615 * Finish the deregistration. When using mw bind, this was
616 * begun in rpcrdma_reply_handler(). In all other modes, we
617 * do it here, in thread context. The process is considered
618 * complete when the rr_func vector becomes NULL - this
619 * was put in place during rpcrdma_reply_handler() - the wait
620 * call below will not block if the dereg is "done". If
621 * interrupted, our framework will clean up.
622 */
623 for (i = 0; req->rl_nchunks;) {
624 --req->rl_nchunks;
625 i += rpcrdma_deregister_external(
626 &req->rl_segments[i], r_xprt, NULL);
627 }
628
629 if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
630 rep->rr_func = NULL; /* abandon the callback */
631 req->rl_reply = NULL;
632 }
633
634 if (req->rl_iov.length == 0) { /* see allocate above */
635 struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
636 oreq->rl_reply = req->rl_reply;
637 (void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
638 req->rl_handle,
639 &req->rl_iov);
640 kfree(req);
641 req = oreq;
642 }
643
644 /* Put back request+reply buffers */
645 rpcrdma_buffer_put(req);
646}
647
648/*
649 * send_request invokes the meat of RPC RDMA. It must do the following:
650 * 1. Marshal the RPC request into an RPC RDMA request, which means
651 * putting a header in front of data, and creating IOVs for RDMA
652 * from those in the request.
653 * 2. In marshaling, detect opportunities for RDMA, and use them.
654 * 3. Post a recv message to set up asynch completion, then send
655 * the request (rpcrdma_ep_post).
656 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
657 */
658
659static int
660xprt_rdma_send_request(struct rpc_task *task)
661{
662 struct rpc_rqst *rqst = task->tk_rqstp;
663 struct rpc_xprt *xprt = task->tk_xprt;
664 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
665 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
666
667 /* marshal the send itself */
668 if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
669 r_xprt->rx_stats.failed_marshal_count++;
670 dprintk("RPC: %s: rpcrdma_marshal_req failed\n",
671 __func__);
672 return -EIO;
673 }
674
675 if (req->rl_reply == NULL) /* e.g. reconnection */
676 rpcrdma_recv_buffer_get(req);
677
678 if (req->rl_reply) {
679 req->rl_reply->rr_func = rpcrdma_reply_handler;
680 /* this need only be done once, but... */
681 req->rl_reply->rr_xprt = xprt;
682 }
683
684 if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
685 xprt_disconnect(xprt);
686 return -ENOTCONN; /* implies disconnect */
687 }
688
689 rqst->rq_bytes_sent = 0;
690 return 0;
691}
692
693static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
694{
695 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
696 long idle_time = 0;
697
698 if (xprt_connected(xprt))
699 idle_time = (long)(jiffies - xprt->last_used) / HZ;
700
701 seq_printf(seq,
702 "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
703 "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
704
705 0, /* need a local port? */
706 xprt->stat.bind_count,
707 xprt->stat.connect_count,
708 xprt->stat.connect_time,
709 idle_time,
710 xprt->stat.sends,
711 xprt->stat.recvs,
712 xprt->stat.bad_xids,
713 xprt->stat.req_u,
714 xprt->stat.bklog_u,
715
716 r_xprt->rx_stats.read_chunk_count,
717 r_xprt->rx_stats.write_chunk_count,
718 r_xprt->rx_stats.reply_chunk_count,
719 r_xprt->rx_stats.total_rdma_request,
720 r_xprt->rx_stats.total_rdma_reply,
721 r_xprt->rx_stats.pullup_copy_count,
722 r_xprt->rx_stats.fixup_copy_count,
723 r_xprt->rx_stats.hardway_register_count,
724 r_xprt->rx_stats.failed_marshal_count,
725 r_xprt->rx_stats.bad_reply_count);
726}
727
728/*
729 * Plumbing for rpc transport switch and kernel module
730 */
731
732static struct rpc_xprt_ops xprt_rdma_procs = {
733 .reserve_xprt = xprt_rdma_reserve_xprt,
734 .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
735 .release_request = xprt_release_rqst_cong, /* ditto */
736 .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
737 .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
738 .set_port = xprt_rdma_set_port,
739 .connect = xprt_rdma_connect,
740 .buf_alloc = xprt_rdma_allocate,
741 .buf_free = xprt_rdma_free,
742 .send_request = xprt_rdma_send_request,
743 .close = xprt_rdma_close,
744 .destroy = xprt_rdma_destroy,
745 .print_stats = xprt_rdma_print_stats
746};
747
748static struct xprt_class xprt_rdma = {
749 .list = LIST_HEAD_INIT(xprt_rdma.list),
750 .name = "rdma",
751 .owner = THIS_MODULE,
752 .ident = XPRT_TRANSPORT_RDMA,
753 .setup = xprt_setup_rdma,
754};
755
756static void __exit xprt_rdma_cleanup(void)
757{
758 int rc;
759
760 dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
761#ifdef RPC_DEBUG
762 if (sunrpc_table_header) {
763 unregister_sysctl_table(sunrpc_table_header);
764 sunrpc_table_header = NULL;
765 }
766#endif
767 rc = xprt_unregister_transport(&xprt_rdma);
768 if (rc)
769 dprintk("RPC: %s: xprt_unregister returned %i\n",
770 __func__, rc);
771}
772
773static int __init xprt_rdma_init(void)
774{
775 int rc;
776
777 rc = xprt_register_transport(&xprt_rdma);
778
779 if (rc)
780 return rc;
781
782 dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
783
784 dprintk(KERN_INFO "Defaults:\n");
785 dprintk(KERN_INFO "\tSlots %d\n"
786 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
787 xprt_rdma_slot_table_entries,
788 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
789 dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
790 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
791
792#ifdef RPC_DEBUG
793 if (!sunrpc_table_header)
794 sunrpc_table_header = register_sysctl_table(sunrpc_table);
795#endif
796 return 0;
797}
798
799module_init(xprt_rdma_init);
800module_exit(xprt_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 000000000000..0baf53381987
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,37 @@
1/*
2 * Placeholders for subsequent patches
3 */
4
5#include "xprt_rdma.h"
6
7int rpcrdma_ia_open(struct rpcrdma_xprt *a, struct sockaddr *b, int c)
8{ return EINVAL; }
9void rpcrdma_ia_close(struct rpcrdma_ia *a) { }
10int rpcrdma_ep_create(struct rpcrdma_ep *a, struct rpcrdma_ia *b,
11struct rpcrdma_create_data_internal *c) { return EINVAL; }
12int rpcrdma_ep_destroy(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
13{ return EINVAL; }
14int rpcrdma_ep_connect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
15{ return EINVAL; }
16int rpcrdma_ep_disconnect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
17{ return EINVAL; }
18int rpcrdma_ep_post(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
19struct rpcrdma_req *c) { return EINVAL; }
20int rpcrdma_ep_post_recv(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
21struct rpcrdma_rep *c) { return EINVAL; }
22int rpcrdma_buffer_create(struct rpcrdma_buffer *a, struct rpcrdma_ep *b,
23struct rpcrdma_ia *c, struct rpcrdma_create_data_internal *d) { return EINVAL; }
24void rpcrdma_buffer_destroy(struct rpcrdma_buffer *a) { }
25struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *a)
26{ return NULL; }
27void rpcrdma_buffer_put(struct rpcrdma_req *a) { }
28void rpcrdma_recv_buffer_get(struct rpcrdma_req *a) { }
29void rpcrdma_recv_buffer_put(struct rpcrdma_rep *a) { }
30int rpcrdma_register_internal(struct rpcrdma_ia *a, void *b, int c,
31struct ib_mr **d, struct ib_sge *e) { return EINVAL; }
32int rpcrdma_deregister_internal(struct rpcrdma_ia *a, struct ib_mr *b,
33struct ib_sge *c) { return EINVAL; }
34int rpcrdma_register_external(struct rpcrdma_mr_seg *a, int b, int c,
35struct rpcrdma_xprt *d) { return EINVAL; }
36int rpcrdma_deregister_external(struct rpcrdma_mr_seg *a,
37struct rpcrdma_xprt *b, void *c) { return EINVAL; }
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 000000000000..2427822f8bd4
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,330 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
41#define _LINUX_SUNRPC_XPRT_RDMA_H
42
43#include <linux/wait.h> /* wait_queue_head_t, etc */
44#include <linux/spinlock.h> /* spinlock_t, etc */
45#include <asm/atomic.h> /* atomic_t, etc */
46
47#include <rdma/rdma_cm.h> /* RDMA connection api */
48#include <rdma/ib_verbs.h> /* RDMA verbs api */
49
50#include <linux/sunrpc/clnt.h> /* rpc_xprt */
51#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
52#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
53
54/*
55 * Interface Adapter -- one per transport instance
56 */
57struct rpcrdma_ia {
58 struct rdma_cm_id *ri_id;
59 struct ib_pd *ri_pd;
60 struct ib_mr *ri_bind_mem;
61 struct completion ri_done;
62 int ri_async_rc;
63 enum rpcrdma_memreg ri_memreg_strategy;
64};
65
66/*
67 * RDMA Endpoint -- one per transport instance
68 */
69
70struct rpcrdma_ep {
71 atomic_t rep_cqcount;
72 int rep_cqinit;
73 int rep_connected;
74 struct rpcrdma_ia *rep_ia;
75 struct ib_cq *rep_cq;
76 struct ib_qp_init_attr rep_attr;
77 wait_queue_head_t rep_connect_wait;
78 struct ib_sge rep_pad; /* holds zeroed pad */
79 struct ib_mr *rep_pad_mr; /* holds zeroed pad */
80 void (*rep_func)(struct rpcrdma_ep *);
81 struct rpc_xprt *rep_xprt; /* for rep_func */
82 struct rdma_conn_param rep_remote_cma;
83 struct sockaddr_storage rep_remote_addr;
84};
85
86#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
87#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
88
89/*
90 * struct rpcrdma_rep -- this structure encapsulates state required to recv
91 * and complete a reply, asychronously. It needs several pieces of
92 * state:
93 * o recv buffer (posted to provider)
94 * o ib_sge (also donated to provider)
95 * o status of reply (length, success or not)
96 * o bookkeeping state to get run by tasklet (list, etc)
97 *
98 * These are allocated during initialization, per-transport instance;
99 * however, the tasklet execution list itself is global, as it should
100 * always be pretty short.
101 *
102 * N of these are associated with a transport instance, and stored in
103 * struct rpcrdma_buffer. N is the max number of outstanding requests.
104 */
105
106/* temporary static scatter/gather max */
107#define RPCRDMA_MAX_DATA_SEGS (8) /* max scatter/gather */
108#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
109#define MAX_RPCRDMAHDR (\
110 /* max supported RPC/RDMA header */ \
111 sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
112 (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
113
114struct rpcrdma_buffer;
115
116struct rpcrdma_rep {
117 unsigned int rr_len; /* actual received reply length */
118 struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
119 struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
120 void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
121 struct list_head rr_list; /* tasklet list */
122 wait_queue_head_t rr_unbind; /* optional unbind wait */
123 struct ib_sge rr_iov; /* for posting */
124 struct ib_mr *rr_handle; /* handle for mem in rr_iov */
125 char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
126};
127
128/*
129 * struct rpcrdma_req -- structure central to the request/reply sequence.
130 *
131 * N of these are associated with a transport instance, and stored in
132 * struct rpcrdma_buffer. N is the max number of outstanding requests.
133 *
134 * It includes pre-registered buffer memory for send AND recv.
135 * The recv buffer, however, is not owned by this structure, and
136 * is "donated" to the hardware when a recv is posted. When a
137 * reply is handled, the recv buffer used is given back to the
138 * struct rpcrdma_req associated with the request.
139 *
140 * In addition to the basic memory, this structure includes an array
141 * of iovs for send operations. The reason is that the iovs passed to
142 * ib_post_{send,recv} must not be modified until the work request
143 * completes.
144 *
145 * NOTES:
146 * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
147 * marshal. The number needed varies depending on the iov lists that
148 * are passed to us, the memory registration mode we are in, and if
149 * physical addressing is used, the layout.
150 */
151
152struct rpcrdma_mr_seg { /* chunk descriptors */
153 union { /* chunk memory handles */
154 struct ib_mr *rl_mr; /* if registered directly */
155 struct rpcrdma_mw { /* if registered from region */
156 union {
157 struct ib_mw *mw;
158 struct ib_fmr *fmr;
159 } r;
160 struct list_head mw_list;
161 } *rl_mw;
162 } mr_chunk;
163 u64 mr_base; /* registration result */
164 u32 mr_rkey; /* registration result */
165 u32 mr_len; /* length of chunk or segment */
166 int mr_nsegs; /* number of segments in chunk or 0 */
167 enum dma_data_direction mr_dir; /* segment mapping direction */
168 dma_addr_t mr_dma; /* segment mapping address */
169 size_t mr_dmalen; /* segment mapping length */
170 struct page *mr_page; /* owning page, if any */
171 char *mr_offset; /* kva if no page, else offset */
172};
173
174struct rpcrdma_req {
175 size_t rl_size; /* actual length of buffer */
176 unsigned int rl_niovs; /* 0, 2 or 4 */
177 unsigned int rl_nchunks; /* non-zero if chunks */
178 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
179 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
180 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
181 struct ib_sge rl_send_iov[4]; /* for active requests */
182 struct ib_sge rl_iov; /* for posting */
183 struct ib_mr *rl_handle; /* handle for mem in rl_iov */
184 char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
185 __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */
186};
187#define rpcr_to_rdmar(r) \
188 container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
189
190/*
191 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
192 * inline requests/replies, and client/server credits.
193 *
194 * One of these is associated with a transport instance
195 */
196struct rpcrdma_buffer {
197 spinlock_t rb_lock; /* protects indexes */
198 atomic_t rb_credits; /* most recent server credits */
199 unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
200 int rb_max_requests;/* client max requests */
201 struct list_head rb_mws; /* optional memory windows/fmrs */
202 int rb_send_index;
203 struct rpcrdma_req **rb_send_bufs;
204 int rb_recv_index;
205 struct rpcrdma_rep **rb_recv_bufs;
206 char *rb_pool;
207};
208#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
209
210/*
211 * Internal structure for transport instance creation. This
212 * exists primarily for modularity.
213 *
214 * This data should be set with mount options
215 */
216struct rpcrdma_create_data_internal {
217 struct sockaddr_storage addr; /* RDMA server address */
218 unsigned int max_requests; /* max requests (slots) in flight */
219 unsigned int rsize; /* mount rsize - max read hdr+data */
220 unsigned int wsize; /* mount wsize - max write hdr+data */
221 unsigned int inline_rsize; /* max non-rdma read data payload */
222 unsigned int inline_wsize; /* max non-rdma write data payload */
223 unsigned int padding; /* non-rdma write header padding */
224};
225
226#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
227 (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)
228
229#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
230 (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)
231
232#define RPCRDMA_INLINE_PAD_VALUE(rq)\
233 rpcx_to_rdmad(rq->rq_task->tk_xprt).padding
234
235/*
236 * Statistics for RPCRDMA
237 */
238struct rpcrdma_stats {
239 unsigned long read_chunk_count;
240 unsigned long write_chunk_count;
241 unsigned long reply_chunk_count;
242
243 unsigned long long total_rdma_request;
244 unsigned long long total_rdma_reply;
245
246 unsigned long long pullup_copy_count;
247 unsigned long long fixup_copy_count;
248 unsigned long hardway_register_count;
249 unsigned long failed_marshal_count;
250 unsigned long bad_reply_count;
251};
252
253/*
254 * RPCRDMA transport -- encapsulates the structures above for
255 * integration with RPC.
256 *
257 * The contained structures are embedded, not pointers,
258 * for convenience. This structure need not be visible externally.
259 *
260 * It is allocated and initialized during mount, and released
261 * during unmount.
262 */
263struct rpcrdma_xprt {
264 struct rpc_xprt xprt;
265 struct rpcrdma_ia rx_ia;
266 struct rpcrdma_ep rx_ep;
267 struct rpcrdma_buffer rx_buf;
268 struct rpcrdma_create_data_internal rx_data;
269 struct delayed_work rdma_connect;
270 struct rpcrdma_stats rx_stats;
271};
272
273#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
274#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
275
276/*
277 * Interface Adapter calls - xprtrdma/verbs.c
278 */
279int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
280void rpcrdma_ia_close(struct rpcrdma_ia *);
281
282/*
283 * Endpoint calls - xprtrdma/verbs.c
284 */
285int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
286 struct rpcrdma_create_data_internal *);
287int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
288int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
289int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
290
291int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
292 struct rpcrdma_req *);
293int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
294 struct rpcrdma_rep *);
295
296/*
297 * Buffer calls - xprtrdma/verbs.c
298 */
299int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
300 struct rpcrdma_ia *,
301 struct rpcrdma_create_data_internal *);
302void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
303
304struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
305void rpcrdma_buffer_put(struct rpcrdma_req *);
306void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
307void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
308
309int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
310 struct ib_mr **, struct ib_sge *);
311int rpcrdma_deregister_internal(struct rpcrdma_ia *,
312 struct ib_mr *, struct ib_sge *);
313
314int rpcrdma_register_external(struct rpcrdma_mr_seg *,
315 int, int, struct rpcrdma_xprt *);
316int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
317 struct rpcrdma_xprt *, void *);
318
319/*
320 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
321 */
322void rpcrdma_conn_func(struct rpcrdma_ep *);
323void rpcrdma_reply_handler(struct rpcrdma_rep *);
324
325/*
326 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
327 */
328int rpcrdma_marshal_req(struct rpc_rqst *);
329
330#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */