aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author"Talpey, Thomas" <Thomas.Talpey@netapp.com>2007-09-10 13:50:12 -0400
committerTrond Myklebust <Trond.Myklebust@netapp.com>2007-10-09 17:18:03 -0400
commitf58851e6b0f148fb4b2a1c6f70beb2f125863c0f (patch)
tree816604d59b5de0ee19ca69b19a6f34548dbb82c5
parent2cf7ff7a37cc149bd59c4f3bad432f686a4619c8 (diff)
RPCRDMA: rpc rdma transport switch
This implements the configuration and building of the core transport switch implementation of the rpcrdma transport. Stubs are provided for the rpcrdma protocol handling, and the infiniband/iwarp verbs interface. These are provided in following patches. Signed-off-by: Tom Talpey <talpey@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
-rw-r--r--include/linux/sunrpc/debug.h5
-rw-r--r--net/sunrpc/Makefile1
-rw-r--r--net/sunrpc/xprtrdma/Makefile3
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c9
-rw-r--r--net/sunrpc/xprtrdma/transport.c800
-rw-r--r--net/sunrpc/xprtrdma/verbs.c37
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h330
7 files changed, 1185 insertions, 0 deletions
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 3912cf16361e..3347c72b848a 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -88,6 +88,11 @@ enum {
88 CTL_SLOTTABLE_TCP, 88 CTL_SLOTTABLE_TCP,
89 CTL_MIN_RESVPORT, 89 CTL_MIN_RESVPORT,
90 CTL_MAX_RESVPORT, 90 CTL_MAX_RESVPORT,
91 CTL_SLOTTABLE_RDMA,
92 CTL_RDMA_MAXINLINEREAD,
93 CTL_RDMA_MAXINLINEWRITE,
94 CTL_RDMA_WRITEPADDING,
95 CTL_RDMA_MEMREG,
91}; 96};
92 97
93#endif /* _LINUX_SUNRPC_DEBUG_H_ */ 98#endif /* _LINUX_SUNRPC_DEBUG_H_ */
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8ebfc4db7f51..5c69a725e530 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,6 +5,7 @@
5 5
6obj-$(CONFIG_SUNRPC) += sunrpc.o 6obj-$(CONFIG_SUNRPC) += sunrpc.o
7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ 7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
8obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
8 9
9sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ 10sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
10 auth.o auth_null.o auth_unix.o \ 11 auth.o auth_null.o auth_unix.o \
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 000000000000..264f0feeb513
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 000000000000..b0587f3a5d77
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,9 @@
1/*
2 * Placeholders for subsequent patches
3 */
4
5#include "xprt_rdma.h"
6
/* Connection-event callback; placeholder that does nothing. */
void rpcrdma_conn_func(struct rpcrdma_ep *a) { }

/* Reply-completion callback; placeholder that does nothing. */
void rpcrdma_reply_handler(struct rpcrdma_rep *a) { }

/*
 * Marshal an RPC request; placeholder that always fails.
 *
 * Returns -EINVAL.  The value is negative, following the kernel's
 * errno convention, so callers that forward it see a real error code.
 */
int rpcrdma_marshal_req(struct rpc_rqst *a) { return -EINVAL; }
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 000000000000..dc55cc974c90
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,800 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * transport.c
42 *
43 * This file contains the top-level implementation of an RPC RDMA
44 * transport.
45 *
46 * Naming convention: functions beginning with xprt_ are part of the
47 * transport switch. All others are RPC RDMA internal.
48 */
49
50#include <linux/module.h>
51#include <linux/init.h>
52#include <linux/seq_file.h>
53
54#include "xprt_rdma.h"
55
56#ifdef RPC_DEBUG
57# define RPCDBG_FACILITY RPCDBG_TRANS
58#endif
59
60MODULE_LICENSE("Dual BSD/GPL");
61
62MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
63MODULE_AUTHOR("Network Appliance, Inc.");
64
65/*
66 * tunables
67 */
68
69static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
70static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
71static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
72static unsigned int xprt_rdma_inline_write_padding;
73#if !RPCRDMA_PERSISTENT_REGISTRATION
74static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */
75#else
76static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL;
77#endif
78
#ifdef RPC_DEBUG

/* Bounds referenced by the .extra1/.extra2 fields of the entries below. */
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
static unsigned int zero;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
static unsigned int max_memreg = RPCRDMA_LAST - 1;

/* Handle returned by register_sysctl_table(); NULL while unregistered. */
static struct ctl_table_header *sunrpc_table_header;

/* One sysctl entry per tunable; the list ends with a zero ctl_name. */
static ctl_table xr_tunables_table[] = {
	{
		.procname	= "rdma_slot_table_entries",
		.ctl_name	= CTL_SLOTTABLE_RDMA,
		.mode		= 0644,
		.data		= &xprt_rdma_slot_table_entries,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &min_slot_table_size,
		.extra2		= &max_slot_table_size,
	},
	{
		.procname	= "rdma_max_inline_read",
		.ctl_name	= CTL_RDMA_MAXINLINEREAD,
		.mode		= 0644,
		.data		= &xprt_rdma_max_inline_read,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= &proc_dointvec,
		.strategy	= &sysctl_intvec,
	},
	{
		.procname	= "rdma_max_inline_write",
		.ctl_name	= CTL_RDMA_MAXINLINEWRITE,
		.mode		= 0644,
		.data		= &xprt_rdma_max_inline_write,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= &proc_dointvec,
		.strategy	= &sysctl_intvec,
	},
	{
		.procname	= "rdma_inline_write_padding",
		.ctl_name	= CTL_RDMA_WRITEPADDING,
		.mode		= 0644,
		.data		= &xprt_rdma_inline_write_padding,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &zero,
		.extra2		= &max_padding,
	},
	{
		.procname	= "rdma_memreg_strategy",
		.ctl_name	= CTL_RDMA_MEMREG,
		.mode		= 0644,
		.data		= &xprt_rdma_memreg_strategy,
		.maxlen		= sizeof(unsigned int),
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &min_memreg,
		.extra2		= &max_memreg,
	},
	{ .ctl_name = 0 },
};

/* Parent "sunrpc" directory entry that holds the tunables above. */
static ctl_table sunrpc_table[] = {
	{
		.procname	= "sunrpc",
		.ctl_name	= CTL_SUNRPC,
		.mode		= 0555,
		.child		= xr_tunables_table,
	},
	{ .ctl_name = 0 },
};

#endif
160
161static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
162
163static void
164xprt_rdma_format_addresses(struct rpc_xprt *xprt)
165{
166 struct sockaddr_in *addr = (struct sockaddr_in *)
167 &rpcx_to_rdmad(xprt).addr;
168 char *buf;
169
170 buf = kzalloc(20, GFP_KERNEL);
171 if (buf)
172 snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr));
173 xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
174
175 buf = kzalloc(8, GFP_KERNEL);
176 if (buf)
177 snprintf(buf, 8, "%u", ntohs(addr->sin_port));
178 xprt->address_strings[RPC_DISPLAY_PORT] = buf;
179
180 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
181
182 buf = kzalloc(48, GFP_KERNEL);
183 if (buf)
184 snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
185 NIPQUAD(addr->sin_addr.s_addr),
186 ntohs(addr->sin_port), "rdma");
187 xprt->address_strings[RPC_DISPLAY_ALL] = buf;
188
189 buf = kzalloc(10, GFP_KERNEL);
190 if (buf)
191 snprintf(buf, 10, "%02x%02x%02x%02x",
192 NIPQUAD(addr->sin_addr.s_addr));
193 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf;
194
195 buf = kzalloc(8, GFP_KERNEL);
196 if (buf)
197 snprintf(buf, 8, "%4hx", ntohs(addr->sin_port));
198 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf;
199
200 buf = kzalloc(30, GFP_KERNEL);
201 if (buf)
202 snprintf(buf, 30, NIPQUAD_FMT".%u.%u",
203 NIPQUAD(addr->sin_addr.s_addr),
204 ntohs(addr->sin_port) >> 8,
205 ntohs(addr->sin_port) & 0xff);
206 xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
207
208 /* netid */
209 xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
210}
211
212static void
213xprt_rdma_free_addresses(struct rpc_xprt *xprt)
214{
215 kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
216 kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
217 kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
218 kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]);
219 kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
220 kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]);
221}
222
223static void
224xprt_rdma_connect_worker(struct work_struct *work)
225{
226 struct rpcrdma_xprt *r_xprt =
227 container_of(work, struct rpcrdma_xprt, rdma_connect.work);
228 struct rpc_xprt *xprt = &r_xprt->xprt;
229 int rc = 0;
230
231 if (!xprt->shutdown) {
232 xprt_clear_connected(xprt);
233
234 dprintk("RPC: %s: %sconnect\n", __func__,
235 r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
236 rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
237 if (rc)
238 goto out;
239 }
240 goto out_clear;
241
242out:
243 xprt_wake_pending_tasks(xprt, rc);
244
245out_clear:
246 dprintk("RPC: %s: exit\n", __func__);
247 xprt_clear_connecting(xprt);
248}
249
250/*
251 * xprt_rdma_destroy
252 *
253 * Destroy the xprt.
254 * Free all memory associated with the object, including its own.
255 * NOTE: none of the *destroy methods free memory for their top-level
256 * objects, even though they may have allocated it (they do free
257 * private memory). It's up to the caller to handle it. In this
258 * case (RDMA transport), all structure memory is inlined with the
259 * struct rpcrdma_xprt.
260 */
261static void
262xprt_rdma_destroy(struct rpc_xprt *xprt)
263{
264 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
265 int rc;
266
267 dprintk("RPC: %s: called\n", __func__);
268
269 cancel_delayed_work(&r_xprt->rdma_connect);
270 flush_scheduled_work();
271
272 xprt_clear_connected(xprt);
273
274 rpcrdma_buffer_destroy(&r_xprt->rx_buf);
275 rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
276 if (rc)
277 dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
278 __func__, rc);
279 rpcrdma_ia_close(&r_xprt->rx_ia);
280
281 xprt_rdma_free_addresses(xprt);
282
283 kfree(xprt->slot);
284 xprt->slot = NULL;
285 kfree(xprt);
286
287 dprintk("RPC: %s: returning\n", __func__);
288
289 module_put(THIS_MODULE);
290}
291
292/**
293 * xprt_setup_rdma - Set up transport to use RDMA
294 *
295 * @args: rpc transport arguments
296 */
297static struct rpc_xprt *
298xprt_setup_rdma(struct xprt_create *args)
299{
300 struct rpcrdma_create_data_internal cdata;
301 struct rpc_xprt *xprt;
302 struct rpcrdma_xprt *new_xprt;
303 struct rpcrdma_ep *new_ep;
304 struct sockaddr_in *sin;
305 int rc;
306
307 if (args->addrlen > sizeof(xprt->addr)) {
308 dprintk("RPC: %s: address too large\n", __func__);
309 return ERR_PTR(-EBADF);
310 }
311
312 xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL);
313 if (xprt == NULL) {
314 dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
315 __func__);
316 return ERR_PTR(-ENOMEM);
317 }
318
319 xprt->max_reqs = xprt_rdma_slot_table_entries;
320 xprt->slot = kcalloc(xprt->max_reqs,
321 sizeof(struct rpc_rqst), GFP_KERNEL);
322 if (xprt->slot == NULL) {
323 kfree(xprt);
324 dprintk("RPC: %s: couldn't allocate %d slots\n",
325 __func__, xprt->max_reqs);
326 return ERR_PTR(-ENOMEM);
327 }
328
329 /* 60 second timeout, no retries */
330 xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ);
331 xprt->bind_timeout = (60U * HZ);
332 xprt->connect_timeout = (60U * HZ);
333 xprt->reestablish_timeout = (5U * HZ);
334 xprt->idle_timeout = (5U * 60 * HZ);
335
336 xprt->resvport = 0; /* privileged port not needed */
337 xprt->tsh_size = 0; /* RPC-RDMA handles framing */
338 xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
339 xprt->ops = &xprt_rdma_procs;
340
341 /*
342 * Set up RDMA-specific connect data.
343 */
344
345 /* Put server RDMA address in local cdata */
346 memcpy(&cdata.addr, args->dstaddr, args->addrlen);
347
348 /* Ensure xprt->addr holds valid server TCP (not RDMA)
349 * address, for any side protocols which peek at it */
350 xprt->prot = IPPROTO_TCP;
351 xprt->addrlen = args->addrlen;
352 memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
353
354 sin = (struct sockaddr_in *)&cdata.addr;
355 if (ntohs(sin->sin_port) != 0)
356 xprt_set_bound(xprt);
357
358 dprintk("RPC: %s: %u.%u.%u.%u:%u\n", __func__,
359 NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
360
361 /* Set max requests */
362 cdata.max_requests = xprt->max_reqs;
363
364 /* Set some length limits */
365 cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
366 cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
367
368 cdata.inline_wsize = xprt_rdma_max_inline_write;
369 if (cdata.inline_wsize > cdata.wsize)
370 cdata.inline_wsize = cdata.wsize;
371
372 cdata.inline_rsize = xprt_rdma_max_inline_read;
373 if (cdata.inline_rsize > cdata.rsize)
374 cdata.inline_rsize = cdata.rsize;
375
376 cdata.padding = xprt_rdma_inline_write_padding;
377
378 /*
379 * Create new transport instance, which includes initialized
380 * o ia
381 * o endpoint
382 * o buffers
383 */
384
385 new_xprt = rpcx_to_rdmax(xprt);
386
387 rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
388 xprt_rdma_memreg_strategy);
389 if (rc)
390 goto out1;
391
392 /*
393 * initialize and create ep
394 */
395 new_xprt->rx_data = cdata;
396 new_ep = &new_xprt->rx_ep;
397 new_ep->rep_remote_addr = cdata.addr;
398
399 rc = rpcrdma_ep_create(&new_xprt->rx_ep,
400 &new_xprt->rx_ia, &new_xprt->rx_data);
401 if (rc)
402 goto out2;
403
404 /*
405 * Allocate pre-registered send and receive buffers for headers and
406 * any inline data. Also specify any padding which will be provided
407 * from a preregistered zero buffer.
408 */
409 rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
410 &new_xprt->rx_data);
411 if (rc)
412 goto out3;
413
414 /*
415 * Register a callback for connection events. This is necessary because
416 * connection loss notification is async. We also catch connection loss
417 * when reaping receives.
418 */
419 INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
420 new_ep->rep_func = rpcrdma_conn_func;
421 new_ep->rep_xprt = xprt;
422
423 xprt_rdma_format_addresses(xprt);
424
425 if (!try_module_get(THIS_MODULE))
426 goto out4;
427
428 return xprt;
429
430out4:
431 xprt_rdma_free_addresses(xprt);
432 rc = -EINVAL;
433out3:
434 (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
435out2:
436 rpcrdma_ia_close(&new_xprt->rx_ia);
437out1:
438 kfree(xprt->slot);
439 kfree(xprt);
440 return ERR_PTR(rc);
441}
442
443/*
444 * Close a connection, during shutdown or timeout/reconnect
445 */
446static void
447xprt_rdma_close(struct rpc_xprt *xprt)
448{
449 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
450
451 dprintk("RPC: %s: closing\n", __func__);
452 xprt_disconnect(xprt);
453 (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
454}
455
456static void
457xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
458{
459 struct sockaddr_in *sap;
460
461 sap = (struct sockaddr_in *)&xprt->addr;
462 sap->sin_port = htons(port);
463 sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
464 sap->sin_port = htons(port);
465 dprintk("RPC: %s: %u\n", __func__, port);
466}
467
468static void
469xprt_rdma_connect(struct rpc_task *task)
470{
471 struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt;
472 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
473
474 if (!xprt_test_and_set_connecting(xprt)) {
475 if (r_xprt->rx_ep.rep_connected != 0) {
476 /* Reconnect */
477 schedule_delayed_work(&r_xprt->rdma_connect,
478 xprt->reestablish_timeout);
479 } else {
480 schedule_delayed_work(&r_xprt->rdma_connect, 0);
481 if (!RPC_IS_ASYNC(task))
482 flush_scheduled_work();
483 }
484 }
485}
486
487static int
488xprt_rdma_reserve_xprt(struct rpc_task *task)
489{
490 struct rpc_xprt *xprt = task->tk_xprt;
491 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
492 int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
493
494 /* == RPC_CWNDSCALE @ init, but *after* setup */
495 if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
496 r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
497 dprintk("RPC: %s: cwndscale %lu\n", __func__,
498 r_xprt->rx_buf.rb_cwndscale);
499 BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
500 }
501 xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
502 return xprt_reserve_xprt_cong(task);
503}
504
505/*
506 * The RDMA allocate/free functions need the task structure as a place
507 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
508 * sequence. For this reason, the recv buffers are attached to send
509 * buffers for portions of the RPC. Note that the RPC layer allocates
510 * both send and receive buffers in the same call. We may register
511 * the receive buffer portion when using reply chunks.
512 */
513static void *
514xprt_rdma_allocate(struct rpc_task *task, size_t size)
515{
516 struct rpc_xprt *xprt = task->tk_xprt;
517 struct rpcrdma_req *req, *nreq;
518
519 req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
520 BUG_ON(NULL == req);
521
522 if (size > req->rl_size) {
523 dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
524 "prog %d vers %d proc %d\n",
525 __func__, size, req->rl_size,
526 task->tk_client->cl_prog, task->tk_client->cl_vers,
527 task->tk_msg.rpc_proc->p_proc);
528 /*
529 * Outgoing length shortage. Our inline write max must have
530 * been configured to perform direct i/o.
531 *
532 * This is therefore a large metadata operation, and the
533 * allocate call was made on the maximum possible message,
534 * e.g. containing long filename(s) or symlink data. In
535 * fact, while these metadata operations *might* carry
536 * large outgoing payloads, they rarely *do*. However, we
537 * have to commit to the request here, so reallocate and
538 * register it now. The data path will never require this
539 * reallocation.
540 *
541 * If the allocation or registration fails, the RPC framework
542 * will (doggedly) retry.
543 */
544 if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
545 RPCRDMA_BOUNCEBUFFERS) {
546 /* forced to "pure inline" */
547 dprintk("RPC: %s: too much data (%zd) for inline "
548 "(r/w max %d/%d)\n", __func__, size,
549 rpcx_to_rdmad(xprt).inline_rsize,
550 rpcx_to_rdmad(xprt).inline_wsize);
551 size = req->rl_size;
552 rpc_exit(task, -EIO); /* fail the operation */
553 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
554 goto out;
555 }
556 if (task->tk_flags & RPC_TASK_SWAPPER)
557 nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
558 else
559 nreq = kmalloc(sizeof *req + size, GFP_NOFS);
560 if (nreq == NULL)
561 goto outfail;
562
563 if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
564 nreq->rl_base, size + sizeof(struct rpcrdma_req)
565 - offsetof(struct rpcrdma_req, rl_base),
566 &nreq->rl_handle, &nreq->rl_iov)) {
567 kfree(nreq);
568 goto outfail;
569 }
570 rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
571 nreq->rl_size = size;
572 nreq->rl_niovs = 0;
573 nreq->rl_nchunks = 0;
574 nreq->rl_buffer = (struct rpcrdma_buffer *)req;
575 nreq->rl_reply = req->rl_reply;
576 memcpy(nreq->rl_segments,
577 req->rl_segments, sizeof nreq->rl_segments);
578 /* flag the swap with an unused field */
579 nreq->rl_iov.length = 0;
580 req->rl_reply = NULL;
581 req = nreq;
582 }
583 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
584out:
585 return req->rl_xdr_buf;
586
587outfail:
588 rpcrdma_buffer_put(req);
589 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
590 return NULL;
591}
592
593/*
594 * This function returns all RDMA resources to the pool.
595 */
596static void
597xprt_rdma_free(void *buffer)
598{
599 struct rpcrdma_req *req;
600 struct rpcrdma_xprt *r_xprt;
601 struct rpcrdma_rep *rep;
602 int i;
603
604 if (buffer == NULL)
605 return;
606
607 req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
608 r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
609 rep = req->rl_reply;
610
611 dprintk("RPC: %s: called on 0x%p%s\n",
612 __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
613
614 /*
615 * Finish the deregistration. When using mw bind, this was
616 * begun in rpcrdma_reply_handler(). In all other modes, we
617 * do it here, in thread context. The process is considered
618 * complete when the rr_func vector becomes NULL - this
619 * was put in place during rpcrdma_reply_handler() - the wait
620 * call below will not block if the dereg is "done". If
621 * interrupted, our framework will clean up.
622 */
623 for (i = 0; req->rl_nchunks;) {
624 --req->rl_nchunks;
625 i += rpcrdma_deregister_external(
626 &req->rl_segments[i], r_xprt, NULL);
627 }
628
629 if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
630 rep->rr_func = NULL; /* abandon the callback */
631 req->rl_reply = NULL;
632 }
633
634 if (req->rl_iov.length == 0) { /* see allocate above */
635 struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
636 oreq->rl_reply = req->rl_reply;
637 (void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
638 req->rl_handle,
639 &req->rl_iov);
640 kfree(req);
641 req = oreq;
642 }
643
644 /* Put back request+reply buffers */
645 rpcrdma_buffer_put(req);
646}
647
648/*
649 * send_request invokes the meat of RPC RDMA. It must do the following:
650 * 1. Marshal the RPC request into an RPC RDMA request, which means
651 * putting a header in front of data, and creating IOVs for RDMA
652 * from those in the request.
653 * 2. In marshaling, detect opportunities for RDMA, and use them.
654 * 3. Post a recv message to set up asynch completion, then send
655 * the request (rpcrdma_ep_post).
656 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
657 */
658
659static int
660xprt_rdma_send_request(struct rpc_task *task)
661{
662 struct rpc_rqst *rqst = task->tk_rqstp;
663 struct rpc_xprt *xprt = task->tk_xprt;
664 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
665 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
666
667 /* marshal the send itself */
668 if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
669 r_xprt->rx_stats.failed_marshal_count++;
670 dprintk("RPC: %s: rpcrdma_marshal_req failed\n",
671 __func__);
672 return -EIO;
673 }
674
675 if (req->rl_reply == NULL) /* e.g. reconnection */
676 rpcrdma_recv_buffer_get(req);
677
678 if (req->rl_reply) {
679 req->rl_reply->rr_func = rpcrdma_reply_handler;
680 /* this need only be done once, but... */
681 req->rl_reply->rr_xprt = xprt;
682 }
683
684 if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
685 xprt_disconnect(xprt);
686 return -ENOTCONN; /* implies disconnect */
687 }
688
689 rqst->rq_bytes_sent = 0;
690 return 0;
691}
692
693static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
694{
695 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
696 long idle_time = 0;
697
698 if (xprt_connected(xprt))
699 idle_time = (long)(jiffies - xprt->last_used) / HZ;
700
701 seq_printf(seq,
702 "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
703 "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
704
705 0, /* need a local port? */
706 xprt->stat.bind_count,
707 xprt->stat.connect_count,
708 xprt->stat.connect_time,
709 idle_time,
710 xprt->stat.sends,
711 xprt->stat.recvs,
712 xprt->stat.bad_xids,
713 xprt->stat.req_u,
714 xprt->stat.bklog_u,
715
716 r_xprt->rx_stats.read_chunk_count,
717 r_xprt->rx_stats.write_chunk_count,
718 r_xprt->rx_stats.reply_chunk_count,
719 r_xprt->rx_stats.total_rdma_request,
720 r_xprt->rx_stats.total_rdma_reply,
721 r_xprt->rx_stats.pullup_copy_count,
722 r_xprt->rx_stats.fixup_copy_count,
723 r_xprt->rx_stats.hardway_register_count,
724 r_xprt->rx_stats.failed_marshal_count,
725 r_xprt->rx_stats.bad_reply_count);
726}
727
728/*
729 * Plumbing for rpc transport switch and kernel module
730 */
731
732static struct rpc_xprt_ops xprt_rdma_procs = {
733 .reserve_xprt = xprt_rdma_reserve_xprt,
734 .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
735 .release_request = xprt_release_rqst_cong, /* ditto */
736 .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
737 .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
738 .set_port = xprt_rdma_set_port,
739 .connect = xprt_rdma_connect,
740 .buf_alloc = xprt_rdma_allocate,
741 .buf_free = xprt_rdma_free,
742 .send_request = xprt_rdma_send_request,
743 .close = xprt_rdma_close,
744 .destroy = xprt_rdma_destroy,
745 .print_stats = xprt_rdma_print_stats
746};
747
748static struct xprt_class xprt_rdma = {
749 .list = LIST_HEAD_INIT(xprt_rdma.list),
750 .name = "rdma",
751 .owner = THIS_MODULE,
752 .ident = XPRT_TRANSPORT_RDMA,
753 .setup = xprt_setup_rdma,
754};
755
756static void __exit xprt_rdma_cleanup(void)
757{
758 int rc;
759
760 dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
761#ifdef RPC_DEBUG
762 if (sunrpc_table_header) {
763 unregister_sysctl_table(sunrpc_table_header);
764 sunrpc_table_header = NULL;
765 }
766#endif
767 rc = xprt_unregister_transport(&xprt_rdma);
768 if (rc)
769 dprintk("RPC: %s: xprt_unregister returned %i\n",
770 __func__, rc);
771}
772
773static int __init xprt_rdma_init(void)
774{
775 int rc;
776
777 rc = xprt_register_transport(&xprt_rdma);
778
779 if (rc)
780 return rc;
781
782 dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
783
784 dprintk(KERN_INFO "Defaults:\n");
785 dprintk(KERN_INFO "\tSlots %d\n"
786 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
787 xprt_rdma_slot_table_entries,
788 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
789 dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
790 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
791
792#ifdef RPC_DEBUG
793 if (!sunrpc_table_header)
794 sunrpc_table_header = register_sysctl_table(sunrpc_table);
795#endif
796 return 0;
797}
798
799module_init(xprt_rdma_init);
800module_exit(xprt_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 000000000000..0baf53381987
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,37 @@
1/*
2 * Placeholders for subsequent patches
3 */
4
5#include "xprt_rdma.h"
6
/*
 * Placeholder verbs-layer entry points.  Every int-returning stub
 * fails with -EINVAL.  The value is negative, following the kernel's
 * errno convention, so callers that pass it to ERR_PTR() or up the
 * stack produce a recognisable error.  rpcrdma_buffer_get() returns
 * NULL, and the void stubs do nothing.
 */

int rpcrdma_ia_open(struct rpcrdma_xprt *a, struct sockaddr *b, int c)
{ return -EINVAL; }

void rpcrdma_ia_close(struct rpcrdma_ia *a) { }

int rpcrdma_ep_create(struct rpcrdma_ep *a, struct rpcrdma_ia *b,
	struct rpcrdma_create_data_internal *c)
{ return -EINVAL; }

int rpcrdma_ep_destroy(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
{ return -EINVAL; }

int rpcrdma_ep_connect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
{ return -EINVAL; }

int rpcrdma_ep_disconnect(struct rpcrdma_ep *a, struct rpcrdma_ia *b)
{ return -EINVAL; }

int rpcrdma_ep_post(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
	struct rpcrdma_req *c)
{ return -EINVAL; }

int rpcrdma_ep_post_recv(struct rpcrdma_ia *a, struct rpcrdma_ep *b,
	struct rpcrdma_rep *c)
{ return -EINVAL; }

int rpcrdma_buffer_create(struct rpcrdma_buffer *a, struct rpcrdma_ep *b,
	struct rpcrdma_ia *c, struct rpcrdma_create_data_internal *d)
{ return -EINVAL; }

void rpcrdma_buffer_destroy(struct rpcrdma_buffer *a) { }

struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *a)
{ return NULL; }

void rpcrdma_buffer_put(struct rpcrdma_req *a) { }

void rpcrdma_recv_buffer_get(struct rpcrdma_req *a) { }

void rpcrdma_recv_buffer_put(struct rpcrdma_rep *a) { }

int rpcrdma_register_internal(struct rpcrdma_ia *a, void *b, int c,
	struct ib_mr **d, struct ib_sge *e)
{ return -EINVAL; }

int rpcrdma_deregister_internal(struct rpcrdma_ia *a, struct ib_mr *b,
	struct ib_sge *c)
{ return -EINVAL; }

int rpcrdma_register_external(struct rpcrdma_mr_seg *a, int b, int c,
	struct rpcrdma_xprt *d)
{ return -EINVAL; }

int rpcrdma_deregister_external(struct rpcrdma_mr_seg *a,
	struct rpcrdma_xprt *b, void *c)
{ return -EINVAL; }
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 000000000000..2427822f8bd4
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,330 @@
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
41#define _LINUX_SUNRPC_XPRT_RDMA_H
42
43#include <linux/wait.h> /* wait_queue_head_t, etc */
44#include <linux/spinlock.h> /* spinlock_t, etc */
45#include <asm/atomic.h> /* atomic_t, etc */
46
47#include <rdma/rdma_cm.h> /* RDMA connection api */
48#include <rdma/ib_verbs.h> /* RDMA verbs api */
49
50#include <linux/sunrpc/clnt.h> /* rpc_xprt */
51#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
52#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
53
54/*
55 * Interface Adapter -- one per transport instance (embedded as rx_ia in struct rpcrdma_xprt)
56 */
57struct rpcrdma_ia {
58 struct rdma_cm_id *ri_id; /* RDMA connection-manager id (rdma/rdma_cm.h) */
59 struct ib_pd *ri_pd; /* verbs protection domain (rdma/ib_verbs.h) */
60 struct ib_mr *ri_bind_mem; /* verbs memory region; NOTE(review): presumably the region memory windows are bound against -- confirm */
61 struct completion ri_done; /* completion; NOTE(review): presumably paired with ri_async_rc -- confirm */
62 int ri_async_rc; /* NOTE(review): looks like the result code of an asynchronous operation -- confirm */
63 enum rpcrdma_memreg ri_memreg_strategy; /* memory registration strategy (enum rpcrdma_memreg) */
64};
65
66/*
67 * RDMA Endpoint -- one per transport instance (embedded as rx_ep in struct rpcrdma_xprt)
68 */
69
70struct rpcrdma_ep {
71 atomic_t rep_cqcount; /* counter reloaded by INIT_CQCOUNT, decremented by DECR_CQCOUNT */
72 int rep_cqinit; /* value INIT_CQCOUNT stores into rep_cqcount */
73 int rep_connected; /* NOTE(review): connection state; value encoding not visible in this header */
74 struct rpcrdma_ia *rep_ia; /* associated interface adapter */
75 struct ib_cq *rep_cq; /* verbs completion queue */
76 struct ib_qp_init_attr rep_attr; /* queue pair creation attributes */
77 wait_queue_head_t rep_connect_wait; /* wait queue; NOTE(review): presumably for connect state changes -- confirm */
78 struct ib_sge rep_pad; /* scatter/gather entry for the zeroed pad */
79 struct ib_mr *rep_pad_mr; /* memory region for the zeroed pad */
80 void (*rep_func)(struct rpcrdma_ep *); /* endpoint callback; rpcrdma_conn_func() has this signature */
81 struct rpc_xprt *rep_xprt; /* for rep_func */
82 struct rdma_conn_param rep_remote_cma; /* RDMA CM connection parameters */
83 struct sockaddr_storage rep_remote_addr; /* NOTE(review): presumably the remote (server) address -- confirm */
84};
85
86#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) /* reload the counter from rep_cqinit */
87#define DECR_CQCOUNT(ep) atomic_dec_return(&(ep)->rep_cqcount) /* decrement by one; evaluates to the new count */
88
89/*
90 * struct rpcrdma_rep -- this structure encapsulates state required to recv
91 * and complete a reply, asynchronously. It needs several pieces of
92 * state:
93 * o recv buffer (posted to provider)
94 * o ib_sge (also donated to provider)
95 * o status of reply (length, success or not)
96 * o bookkeeping state to get run by tasklet (list, etc)
97 *
98 * These are allocated during initialization, per-transport instance;
99 * however, the tasklet execution list itself is global, as it should
100 * always be pretty short.
101 *
102 * N of these are associated with a transport instance, and stored in
103 * struct rpcrdma_buffer. N is the max number of outstanding requests.
104 */
105
106/* temporary static scatter/gather max; sizes rl_segments[] and, through MAX_RPCRDMAHDR, rr_base[] and rl_base[] */
107#define RPCRDMA_MAX_DATA_SEGS (8) /* max scatter/gather data segments */
108#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* data segments plus head+tail = 2 */
109#define MAX_RPCRDMAHDR (\
110 /* max supported RPC/RDMA header: fixed message, three u32 words, one read chunk per segment */ \
111 sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
112 (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
113
114struct rpcrdma_buffer; /* forward declaration: rpcrdma_rep and rpcrdma_req hold pointers to it */
115
116struct rpcrdma_rep {
117 unsigned int rr_len; /* actual received reply length */
118 struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
119 struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
120 void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint; rpcrdma_reply_handler() has this signature */
121 struct list_head rr_list; /* tasklet list */
122 wait_queue_head_t rr_unbind; /* optional unbind wait */
123 struct ib_sge rr_iov; /* for posting */
124 struct ib_mr *rr_handle; /* handle for mem in rr_iov */
125 char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer, MAX_RPCRDMAHDR bytes */
126};
127
128/*
129 * struct rpcrdma_req -- structure central to the request/reply sequence.
130 *
131 * N of these are associated with a transport instance, and stored in
132 * struct rpcrdma_buffer. N is the max number of outstanding requests.
133 *
134 * It includes pre-registered buffer memory for send AND recv.
135 * The recv buffer, however, is not owned by this structure, and
136 * is "donated" to the hardware when a recv is posted. When a
137 * reply is handled, the recv buffer used is given back to the
138 * struct rpcrdma_req associated with the request.
139 *
140 * In addition to the basic memory, this structure includes an array
141 * of iovs for send operations. The reason is that the iovs passed to
142 * ib_post_{send,recv} must not be modified until the work request
143 * completes.
144 *
145 * NOTES:
146 * o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
147 * marshal. The number needed varies depending on the iov lists that
148 * are passed to us, the memory registration mode we are in, and if
149 * physical addressing is used, the layout.
150 */
151
152struct rpcrdma_mr_seg { /* chunk descriptors; rpcrdma_req carries RPCRDMA_MAX_SEGS of them */
153 union { /* chunk memory handles */
154 struct ib_mr *rl_mr; /* if registered directly */
155 struct rpcrdma_mw { /* if registered from region */
156 union { /* one handle type per rpcrdma_mw */
157 struct ib_mw *mw; /* verbs memory window */
158 struct ib_fmr *fmr; /* verbs fast memory region */
159 } r;
160 struct list_head mw_list; /* list linkage; NOTE(review): presumably onto rpcrdma_buffer.rb_mws -- confirm */
161 } *rl_mw;
162 } mr_chunk;
163 u64 mr_base; /* registration result */
164 u32 mr_rkey; /* registration result */
165 u32 mr_len; /* length of chunk or segment */
166 int mr_nsegs; /* number of segments in chunk or 0 */
167 enum dma_data_direction mr_dir; /* segment mapping direction */
168 dma_addr_t mr_dma; /* segment mapping address */
169 size_t mr_dmalen; /* segment mapping length */
170 struct page *mr_page; /* owning page, if any */
171 char *mr_offset; /* kva if no page, else offset */
172};
173
174struct rpcrdma_req {
175 size_t rl_size; /* actual length of buffer */
176 unsigned int rl_niovs; /* 0, 2 or 4 (rl_send_iov[] has 4 slots) */
177 unsigned int rl_nchunks; /* non-zero if chunks */
178 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
179 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
180 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
181 struct ib_sge rl_send_iov[4]; /* for active requests */
182 struct ib_sge rl_iov; /* for posting */
183 struct ib_mr *rl_handle; /* handle for mem in rl_iov */
184 char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
185 __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer; zero-length trailing array, must remain the last member */
186};
187#define rpcr_to_rdmar(r) /* map (r)->rq_buffer back to the rpcrdma_req whose rl_xdr_buf it is */ \
188 container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
189
190/*
191 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
192 * inline requests/replies, and client/server credits.
193 *
194 * One of these is associated with a transport instance
195 */
196struct rpcrdma_buffer {
197 spinlock_t rb_lock; /* protects indexes */
198 atomic_t rb_credits; /* most recent server credits */
199 unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
200 int rb_max_requests;/* client max requests */
201 struct list_head rb_mws; /* optional memory windows/fmrs */
202 int rb_send_index; /* NOTE(review): presumably the current position in rb_send_bufs[] -- confirm */
203 struct rpcrdma_req **rb_send_bufs; /* array of request pointers */
204 int rb_recv_index; /* NOTE(review): presumably the current position in rb_recv_bufs[] -- confirm */
205 struct rpcrdma_rep **rb_recv_bufs; /* array of reply pointers */
206 char *rb_pool; /* NOTE(review): presumably the backing allocation for the arrays/buffers -- confirm */
207};
208#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) /* only valid when (b) is the rx_buf embedded in a struct rpcrdma_xprt; yields its &rx_ia */
209
210/*
211 * Internal structure for transport instance creation. This
212 * exists primarily for modularity.
213 *
214 * This data should be set with mount options; it is embedded as rx_data in struct rpcrdma_xprt and read by the RPCRDMA_INLINE_* macros
215 */
216struct rpcrdma_create_data_internal {
217 struct sockaddr_storage addr; /* RDMA server address */
218 unsigned int max_requests; /* max requests (slots) in flight */
219 unsigned int rsize; /* mount rsize - max read hdr+data */
220 unsigned int wsize; /* mount wsize - max write hdr+data */
221 unsigned int inline_rsize; /* max non-rdma read data payload; see RPCRDMA_INLINE_READ_THRESHOLD */
222 unsigned int inline_wsize; /* max non-rdma write data payload; see RPCRDMA_INLINE_WRITE_THRESHOLD */
223 unsigned int padding; /* non-rdma write header padding; see RPCRDMA_INLINE_PAD_VALUE */
224};
225
226#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
227 (rpcx_to_rdmad((rq)->rq_task->tk_xprt).inline_rsize) /* inline_rsize of the transport owning request (rq) */
228
229#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
230 (rpcx_to_rdmad((rq)->rq_task->tk_xprt).inline_wsize) /* inline_wsize of the transport owning request (rq) */
231
232#define RPCRDMA_INLINE_PAD_VALUE(rq)\
233 (rpcx_to_rdmad((rq)->rq_task->tk_xprt).padding) /* padding of the transport owning request (rq); parenthesized like its siblings */
234
235/*
236 * Statistics for RPCRDMA (embedded as rx_stats in struct rpcrdma_xprt)
237 */
238struct rpcrdma_stats {
239 unsigned long read_chunk_count; /* NOTE(review): the three *_chunk_count fields look like per-chunk-type tallies -- confirm where they are bumped */
240 unsigned long write_chunk_count;
241 unsigned long reply_chunk_count;
242
243 unsigned long long total_rdma_request; /* NOTE(review): unit (bytes vs. operations) not visible in this header -- confirm */
244 unsigned long long total_rdma_reply; /* NOTE(review): same unit question as total_rdma_request */
245
246 unsigned long long pullup_copy_count; /* NOTE(review): unit of the two *_copy_count fields not visible here -- confirm */
247 unsigned long long fixup_copy_count;
248 unsigned long hardway_register_count;
249 unsigned long failed_marshal_count; /* NOTE(review): presumably failures of rpcrdma_marshal_req() -- confirm */
250 unsigned long bad_reply_count;
251};
252
253/*
254 * RPCRDMA transport -- encapsulates the structures above for
255 * integration with RPC.
256 *
257 * The contained structures are embedded, not pointers,
258 * for convenience. This structure need not be visible externally.
259 *
260 * It is allocated and initialized during mount, and released
261 * during unmount.
262 */
263struct rpcrdma_xprt {
264 struct rpc_xprt xprt; /* generic RPC transport; rpcx_to_rdmax() maps a pointer to it back to this structure */
265 struct rpcrdma_ia rx_ia; /* interface adapter; rdmab_to_ia() reaches it from &rx_buf */
266 struct rpcrdma_ep rx_ep; /* RDMA endpoint */
267 struct rpcrdma_buffer rx_buf; /* request/reply buffer bookkeeping */
268 struct rpcrdma_create_data_internal rx_data; /* creation parameters; accessed through rpcx_to_rdmad() */
269 struct delayed_work rdma_connect; /* delayed work item; NOTE(review): presumably drives connection setup -- confirm */
270 struct rpcrdma_stats rx_stats; /* statistics counters */
271};
272
273#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) /* struct rpc_xprt * -> enclosing struct rpcrdma_xprt * */
274#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) /* struct rpc_xprt * -> its rx_data member (the struct itself, not a pointer) */
275
276/*
277 * Interface Adapter calls - xprtrdma/verbs.c (placeholder bodies only in this patch)
278 */
279int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
280void rpcrdma_ia_close(struct rpcrdma_ia *);
281
282/*
283 * Endpoint calls - xprtrdma/verbs.c
284 */
285int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
286 struct rpcrdma_create_data_internal *);
287int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
288int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
289int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
290
291int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
292 struct rpcrdma_req *); /* note: ia comes first in the two post calls, ep first in the calls above */
293int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
294 struct rpcrdma_rep *);
295
296/*
297 * Buffer calls - xprtrdma/verbs.c
298 */
299int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
300 struct rpcrdma_ia *,
301 struct rpcrdma_create_data_internal *);
302void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
303
304struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
305void rpcrdma_buffer_put(struct rpcrdma_req *);
306void rpcrdma_recv_buffer_get(struct rpcrdma_req *); /* takes the request, whereas rpcrdma_recv_buffer_put() takes a reply */
307void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
308
309int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
310 struct ib_mr **, struct ib_sge *);
311int rpcrdma_deregister_internal(struct rpcrdma_ia *,
312 struct ib_mr *, struct ib_sge *);
313
314int rpcrdma_register_external(struct rpcrdma_mr_seg *,
315 int, int, struct rpcrdma_xprt *);
316int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
317 struct rpcrdma_xprt *, void *);
318
319/*
320 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
321 */
322void rpcrdma_conn_func(struct rpcrdma_ep *); /* same signature as rep_func in struct rpcrdma_ep */
323void rpcrdma_reply_handler(struct rpcrdma_rep *); /* same signature as rr_func in struct rpcrdma_rep */
324
325/*
326 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
327 */
328int rpcrdma_marshal_req(struct rpc_rqst *);
329
330#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */