diff options
author | Tom Tucker <tom@opengridcomputing.com> | 2008-10-22 19:47:39 -0400 |
---|---|---|
committer | Eric Van Hensbergen <ericvh@opteron.(none)> | 2008-10-22 19:47:39 -0400 |
commit | fc79d4b104f0eb8c2a7242150eaf8756ced4c344 (patch) | |
tree | b8e9974570ca194acc3ebd24ba7a864bba80c477 /net/9p | |
parent | ea2e7996fc892e9becfed9145fdcefd59f697718 (diff) |
9p: rdma: RDMA Transport Support for 9P
This patch implements the RDMA transport provider for 9P. It allows
mounts to be performed over iWARP and IB capable network interfaces.
Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: Latchesar Ionkov <lionkov@lanl.gov>
Diffstat (limited to 'net/9p')
-rw-r--r-- | net/9p/Kconfig | 6 | ||||
-rw-r--r-- | net/9p/Makefile | 4 | ||||
-rw-r--r-- | net/9p/trans_rdma.c | 712 |
3 files changed, 722 insertions, 0 deletions
diff --git a/net/9p/Kconfig b/net/9p/Kconfig index ff34c5acc130..c42c0c400bf9 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig | |||
@@ -20,6 +20,12 @@ config NET_9P_VIRTIO | |||
20 | This builds support for a transports between | 20 | This builds support for a transports between |
21 | guest partitions and a host partition. | 21 | guest partitions and a host partition. |
22 | 22 | ||
23 | config NET_9P_RDMA | ||
24 | depends on NET_9P && INFINIBAND && EXPERIMENTAL | ||
25 | tristate "9P RDMA Transport (Experimental)" | ||
26 | help | ||
27 | This builds support for an RDMA transport. | ||
28 | |||
23 | config NET_9P_DEBUG | 29 | config NET_9P_DEBUG |
24 | bool "Debug information" | 30 | bool "Debug information" |
25 | depends on NET_9P | 31 | depends on NET_9P |
diff --git a/net/9p/Makefile b/net/9p/Makefile index 1041b7bd12e2..198a640d53a6 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile | |||
@@ -1,5 +1,6 @@ | |||
1 | obj-$(CONFIG_NET_9P) := 9pnet.o | 1 | obj-$(CONFIG_NET_9P) := 9pnet.o |
2 | obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o | 2 | obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o |
3 | obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o | ||
3 | 4 | ||
4 | 9pnet-objs := \ | 5 | 9pnet-objs := \ |
5 | mod.o \ | 6 | mod.o \ |
@@ -11,3 +12,6 @@ obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o | |||
11 | 12 | ||
12 | 9pnet_virtio-objs := \ | 13 | 9pnet_virtio-objs := \ |
13 | trans_virtio.o \ | 14 | trans_virtio.o \ |
15 | |||
16 | 9pnet_rdma-objs := \ | ||
17 | trans_rdma.o \ | ||
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c new file mode 100644 index 000000000000..8d6cc4777aae --- /dev/null +++ b/net/9p/trans_rdma.c | |||
@@ -0,0 +1,712 @@ | |||
1 | /* | ||
2 | * net/9p/trans_rdma.c | ||
3 | * | ||
4 | * RDMA transport layer based on the trans_fd.c implementation. | ||
5 | * | ||
6 | * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> | ||
7 | * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> | ||
8 | * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> | ||
9 | * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> | ||
10 | * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 | ||
14 | * as published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to: | ||
23 | * Free Software Foundation | ||
24 | * 51 Franklin Street, Fifth Floor | ||
25 | * Boston, MA 02111-1301 USA | ||
26 | * | ||
27 | */ | ||
28 | |||
29 | #include <linux/in.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/net.h> | ||
32 | #include <linux/ipv6.h> | ||
33 | #include <linux/kthread.h> | ||
34 | #include <linux/errno.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/un.h> | ||
37 | #include <linux/uaccess.h> | ||
38 | #include <linux/inet.h> | ||
39 | #include <linux/idr.h> | ||
40 | #include <linux/file.h> | ||
41 | #include <linux/parser.h> | ||
42 | #include <linux/semaphore.h> | ||
43 | #include <net/9p/9p.h> | ||
44 | #include <net/9p/client.h> | ||
45 | #include <net/9p/transport.h> | ||
46 | #include <rdma/ib_verbs.h> | ||
47 | #include <rdma/rdma_cm.h> | ||
48 | #include <rdma/ib_verbs.h> | ||
49 | |||
50 | #define P9_PORT 5640 | ||
51 | #define P9_RDMA_SQ_DEPTH 32 | ||
52 | #define P9_RDMA_RQ_DEPTH 32 | ||
53 | #define P9_RDMA_SEND_SGE 4 | ||
54 | #define P9_RDMA_RECV_SGE 4 | ||
55 | #define P9_RDMA_IRD 0 | ||
56 | #define P9_RDMA_ORD 0 | ||
57 | #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ | ||
58 | #define P9_RDMA_MAXSIZE (4*4096) /* Min SGE is 4, so we can | ||
59 | * safely advertise a maxsize | ||
60 | * of 16k */ | ||
61 | |||
62 | #define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT) | ||
/**
 * struct p9_trans_rdma - RDMA transport instance
 *
 * @state: tracks the transport state machine for connection setup and tear down
 * @cm_id: The RDMA CM ID
 * @pd: Protection Domain pointer
 * @qp: Queue Pair pointer
 * @cq: Completion Queue pointer (shared by the send and receive queues)
 * @dma_mr: DMA Memory Region pointer; only obtained when the device does
 *          not advertise IB_DEVICE_LOCAL_DMA_LKEY, otherwise left NULL
 * @lkey: The local access only memory region key
 * @timeout: Number of msecs to wait for connection management events
 * @sq_depth: The depth of the Send Queue
 * @sq_sem: Semaphore for the SQ; initialised to @sq_depth, taken before a
 *          send is posted and released when its completion is reaped
 * @rq_depth: The depth of the Receive Queue.
 * @rq_count: Number of receive buffers accounted as posted to the RQ
 * @addr: The remote peer's address
 * @req_lock: Taken in rdma_request() around the transition of @state to
 *            P9_RDMA_CLOSING
 * @cm_done: Completion event for connection management tracking
 */
struct p9_trans_rdma {
	enum {
		P9_RDMA_INIT,
		P9_RDMA_ADDR_RESOLVED,
		P9_RDMA_ROUTE_RESOLVED,
		P9_RDMA_CONNECTED,
		P9_RDMA_FLUSHING,
		P9_RDMA_CLOSING,
		P9_RDMA_CLOSED,
	} state;
	struct rdma_cm_id *cm_id;
	struct ib_pd *pd;
	struct ib_qp *qp;
	struct ib_cq *cq;
	struct ib_mr *dma_mr;
	u32 lkey;
	long timeout;
	int sq_depth;
	struct semaphore sq_sem;
	int rq_depth;
	atomic_t rq_count;
	struct sockaddr_in addr;
	spinlock_t req_lock;

	struct completion cm_done;
};
107 | |||
/**
 * struct p9_rdma_context - Keeps track of in-process WR
 *
 * @wc_op: The original WR op for when the CQE completes in error.
 * @busa: Bus address to unmap when the WR completes
 * @req: Keeps track of requests (send)
 * @rc: Keeps track of replies (receive)
 *
 * A pointer to this structure is stored in the work request's wr_id and
 * recovered from the work completion in cq_comp_handler(), which frees it.
 * @req and @rc share storage: a context describes either a send or a
 * receive, selected by @wc_op.
 */
struct p9_rdma_req;
struct p9_rdma_context {
	enum ib_wc_opcode wc_op;
	dma_addr_t busa;
	union {
		struct p9_req_t *req;
		struct p9_fcall *rc;
	};
};
125 | |||
/**
 * struct p9_rdma_opts - Collection of mount options
 * @port: port of connection
 * @sq_depth: The requested depth of the SQ. This really doesn't need
 * to be any deeper than the number of threads used in the client
 * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
 * @timeout: Time to wait in msecs for CM events
 *
 * Filled in by parse_opts(), which first applies the P9_PORT and
 * P9_RDMA_* defaults and then overrides them from the mount option string.
 */
struct p9_rdma_opts {
	short port;
	int sq_depth;
	int rq_depth;
	long timeout;
};
140 | |||
/*
 * Option Parsing (code inspired by NFS code)
 *
 * Recognised mount options are "port=", "sq=", "rq=" and "timeout=";
 * each takes an unsigned integer argument.
 */
enum {
	/* Options that take integer arguments */
	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err,
};

/* Pattern table handed to match_token(); Opt_err terminates the table. */
static match_table_t tokens = {
	{Opt_port, "port=%u"},
	{Opt_sq_depth, "sq=%u"},
	{Opt_rq_depth, "rq=%u"},
	{Opt_timeout, "timeout=%u"},
	{Opt_err, NULL},
};
156 | |||
157 | /** | ||
158 | * parse_options - parse mount options into session structure | ||
159 | * @options: options string passed from mount | ||
160 | * @opts: transport-specific structure to parse options into | ||
161 | * | ||
162 | * Returns 0 upon success, -ERRNO upon failure | ||
163 | */ | ||
164 | static int parse_opts(char *params, struct p9_rdma_opts *opts) | ||
165 | { | ||
166 | char *p; | ||
167 | substring_t args[MAX_OPT_ARGS]; | ||
168 | int option; | ||
169 | char *options; | ||
170 | int ret; | ||
171 | |||
172 | opts->port = P9_PORT; | ||
173 | opts->sq_depth = P9_RDMA_SQ_DEPTH; | ||
174 | opts->rq_depth = P9_RDMA_RQ_DEPTH; | ||
175 | opts->timeout = P9_RDMA_TIMEOUT; | ||
176 | |||
177 | if (!params) | ||
178 | return 0; | ||
179 | |||
180 | options = kstrdup(params, GFP_KERNEL); | ||
181 | if (!options) { | ||
182 | P9_DPRINTK(P9_DEBUG_ERROR, | ||
183 | "failed to allocate copy of option string\n"); | ||
184 | return -ENOMEM; | ||
185 | } | ||
186 | |||
187 | while ((p = strsep(&options, ",")) != NULL) { | ||
188 | int token; | ||
189 | int r; | ||
190 | if (!*p) | ||
191 | continue; | ||
192 | token = match_token(p, tokens, args); | ||
193 | r = match_int(&args[0], &option); | ||
194 | if (r < 0) { | ||
195 | P9_DPRINTK(P9_DEBUG_ERROR, | ||
196 | "integer field, but no integer?\n"); | ||
197 | ret = r; | ||
198 | continue; | ||
199 | } | ||
200 | switch (token) { | ||
201 | case Opt_port: | ||
202 | opts->port = option; | ||
203 | break; | ||
204 | case Opt_sq_depth: | ||
205 | opts->sq_depth = option; | ||
206 | break; | ||
207 | case Opt_rq_depth: | ||
208 | opts->rq_depth = option; | ||
209 | break; | ||
210 | case Opt_timeout: | ||
211 | opts->timeout = option; | ||
212 | break; | ||
213 | default: | ||
214 | continue; | ||
215 | } | ||
216 | } | ||
217 | /* RQ must be at least as large as the SQ */ | ||
218 | opts->rq_depth = max(opts->rq_depth, opts->sq_depth); | ||
219 | kfree(options); | ||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static int | ||
224 | p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) | ||
225 | { | ||
226 | struct p9_client *c = id->context; | ||
227 | struct p9_trans_rdma *rdma = c->trans; | ||
228 | switch (event->event) { | ||
229 | case RDMA_CM_EVENT_ADDR_RESOLVED: | ||
230 | BUG_ON(rdma->state != P9_RDMA_INIT); | ||
231 | rdma->state = P9_RDMA_ADDR_RESOLVED; | ||
232 | break; | ||
233 | |||
234 | case RDMA_CM_EVENT_ROUTE_RESOLVED: | ||
235 | BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); | ||
236 | rdma->state = P9_RDMA_ROUTE_RESOLVED; | ||
237 | break; | ||
238 | |||
239 | case RDMA_CM_EVENT_ESTABLISHED: | ||
240 | BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); | ||
241 | rdma->state = P9_RDMA_CONNECTED; | ||
242 | break; | ||
243 | |||
244 | case RDMA_CM_EVENT_DISCONNECTED: | ||
245 | if (rdma) | ||
246 | rdma->state = P9_RDMA_CLOSED; | ||
247 | if (c) | ||
248 | c->status = Disconnected; | ||
249 | break; | ||
250 | |||
251 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: | ||
252 | break; | ||
253 | |||
254 | case RDMA_CM_EVENT_ADDR_CHANGE: | ||
255 | case RDMA_CM_EVENT_ROUTE_ERROR: | ||
256 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | ||
257 | case RDMA_CM_EVENT_MULTICAST_JOIN: | ||
258 | case RDMA_CM_EVENT_MULTICAST_ERROR: | ||
259 | case RDMA_CM_EVENT_REJECTED: | ||
260 | case RDMA_CM_EVENT_CONNECT_REQUEST: | ||
261 | case RDMA_CM_EVENT_CONNECT_RESPONSE: | ||
262 | case RDMA_CM_EVENT_CONNECT_ERROR: | ||
263 | case RDMA_CM_EVENT_ADDR_ERROR: | ||
264 | case RDMA_CM_EVENT_UNREACHABLE: | ||
265 | c->status = Disconnected; | ||
266 | rdma_disconnect(rdma->cm_id); | ||
267 | break; | ||
268 | default: | ||
269 | BUG(); | ||
270 | } | ||
271 | complete(&rdma->cm_done); | ||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | static void | ||
276 | handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, | ||
277 | struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) | ||
278 | { | ||
279 | struct p9_req_t *req; | ||
280 | int err = 0; | ||
281 | int16_t tag; | ||
282 | |||
283 | req = NULL; | ||
284 | ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, | ||
285 | DMA_FROM_DEVICE); | ||
286 | |||
287 | if (status != IB_WC_SUCCESS) | ||
288 | goto err_out; | ||
289 | |||
290 | err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); | ||
291 | if (err) | ||
292 | goto err_out; | ||
293 | |||
294 | req = p9_tag_lookup(client, tag); | ||
295 | if (!req) | ||
296 | goto err_out; | ||
297 | |||
298 | req->rc = c->rc; | ||
299 | p9_client_cb(client, req); | ||
300 | |||
301 | return; | ||
302 | |||
303 | err_out: | ||
304 | P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n", | ||
305 | req, err, status); | ||
306 | rdma->state = P9_RDMA_FLUSHING; | ||
307 | client->status = Disconnected; | ||
308 | return; | ||
309 | } | ||
310 | |||
311 | static void | ||
312 | handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, | ||
313 | struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) | ||
314 | { | ||
315 | ib_dma_unmap_single(rdma->cm_id->device, | ||
316 | c->busa, c->req->tc->size, | ||
317 | DMA_TO_DEVICE); | ||
318 | } | ||
319 | |||
320 | static void qp_event_handler(struct ib_event *event, void *context) | ||
321 | { | ||
322 | P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, | ||
323 | context); | ||
324 | } | ||
325 | |||
326 | static void cq_comp_handler(struct ib_cq *cq, void *cq_context) | ||
327 | { | ||
328 | struct p9_client *client = cq_context; | ||
329 | struct p9_trans_rdma *rdma = client->trans; | ||
330 | int ret; | ||
331 | struct ib_wc wc; | ||
332 | |||
333 | ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); | ||
334 | while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { | ||
335 | struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; | ||
336 | |||
337 | switch (c->wc_op) { | ||
338 | case IB_WC_RECV: | ||
339 | atomic_dec(&rdma->rq_count); | ||
340 | handle_recv(client, rdma, c, wc.status, wc.byte_len); | ||
341 | break; | ||
342 | |||
343 | case IB_WC_SEND: | ||
344 | handle_send(client, rdma, c, wc.status, wc.byte_len); | ||
345 | up(&rdma->sq_sem); | ||
346 | break; | ||
347 | |||
348 | default: | ||
349 | printk(KERN_ERR "9prdma: unexpected completion type, " | ||
350 | "c->wc_op=%d, wc.opcode=%d, status=%d\n", | ||
351 | c->wc_op, wc.opcode, wc.status); | ||
352 | break; | ||
353 | } | ||
354 | kfree(c); | ||
355 | } | ||
356 | } | ||
357 | |||
358 | static void cq_event_handler(struct ib_event *e, void *v) | ||
359 | { | ||
360 | P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); | ||
361 | } | ||
362 | |||
363 | static void rdma_destroy_trans(struct p9_trans_rdma *rdma) | ||
364 | { | ||
365 | if (!rdma) | ||
366 | return; | ||
367 | |||
368 | if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) | ||
369 | ib_dereg_mr(rdma->dma_mr); | ||
370 | |||
371 | if (rdma->qp && !IS_ERR(rdma->qp)) | ||
372 | ib_destroy_qp(rdma->qp); | ||
373 | |||
374 | if (rdma->pd && !IS_ERR(rdma->pd)) | ||
375 | ib_dealloc_pd(rdma->pd); | ||
376 | |||
377 | if (rdma->cq && !IS_ERR(rdma->cq)) | ||
378 | ib_destroy_cq(rdma->cq); | ||
379 | |||
380 | if (rdma->cm_id && !IS_ERR(rdma->cm_id)) | ||
381 | rdma_destroy_id(rdma->cm_id); | ||
382 | |||
383 | kfree(rdma); | ||
384 | } | ||
385 | |||
386 | static int | ||
387 | post_recv(struct p9_client *client, struct p9_rdma_context *c) | ||
388 | { | ||
389 | struct p9_trans_rdma *rdma = client->trans; | ||
390 | struct ib_recv_wr wr, *bad_wr; | ||
391 | struct ib_sge sge; | ||
392 | |||
393 | c->busa = ib_dma_map_single(rdma->cm_id->device, | ||
394 | c->rc->sdata, client->msize, | ||
395 | DMA_FROM_DEVICE); | ||
396 | if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) | ||
397 | goto error; | ||
398 | |||
399 | sge.addr = c->busa; | ||
400 | sge.length = client->msize; | ||
401 | sge.lkey = rdma->lkey; | ||
402 | |||
403 | wr.next = NULL; | ||
404 | c->wc_op = IB_WC_RECV; | ||
405 | wr.wr_id = (unsigned long) c; | ||
406 | wr.sg_list = &sge; | ||
407 | wr.num_sge = 1; | ||
408 | return ib_post_recv(rdma->qp, &wr, &bad_wr); | ||
409 | |||
410 | error: | ||
411 | P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); | ||
412 | return -EIO; | ||
413 | } | ||
414 | |||
415 | static int rdma_request(struct p9_client *client, struct p9_req_t *req) | ||
416 | { | ||
417 | struct p9_trans_rdma *rdma = client->trans; | ||
418 | struct ib_send_wr wr, *bad_wr; | ||
419 | struct ib_sge sge; | ||
420 | int err = 0; | ||
421 | unsigned long flags; | ||
422 | struct p9_rdma_context *c = NULL; | ||
423 | struct p9_rdma_context *rpl_context = NULL; | ||
424 | |||
425 | /* Allocate an fcall for the reply */ | ||
426 | rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); | ||
427 | if (!rpl_context) | ||
428 | goto err_close; | ||
429 | |||
430 | /* | ||
431 | * If the request has a buffer, steal it, otherwise | ||
432 | * allocate a new one. Typically, requests should already | ||
433 | * have receive buffers allocated and just swap them around | ||
434 | */ | ||
435 | if (!req->rc) { | ||
436 | req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, | ||
437 | GFP_KERNEL); | ||
438 | if (req->rc) { | ||
439 | req->rc->sdata = (char *) req->rc + | ||
440 | sizeof(struct p9_fcall); | ||
441 | req->rc->capacity = client->msize; | ||
442 | } | ||
443 | } | ||
444 | rpl_context->rc = req->rc; | ||
445 | if (!rpl_context->rc) { | ||
446 | kfree(rpl_context); | ||
447 | goto err_close; | ||
448 | } | ||
449 | |||
450 | /* | ||
451 | * Post a receive buffer for this request. We need to ensure | ||
452 | * there is a reply buffer available for every outstanding | ||
453 | * request. A flushed request can result in no reply for an | ||
454 | * outstanding request, so we must keep a count to avoid | ||
455 | * overflowing the RQ. | ||
456 | */ | ||
457 | if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { | ||
458 | err = post_recv(client, rpl_context); | ||
459 | if (err) { | ||
460 | kfree(rpl_context->rc); | ||
461 | kfree(rpl_context); | ||
462 | goto err_close; | ||
463 | } | ||
464 | } else | ||
465 | atomic_dec(&rdma->rq_count); | ||
466 | |||
467 | /* remove posted receive buffer from request structure */ | ||
468 | req->rc = NULL; | ||
469 | |||
470 | /* Post the request */ | ||
471 | c = kmalloc(sizeof *c, GFP_KERNEL); | ||
472 | if (!c) | ||
473 | goto err_close; | ||
474 | c->req = req; | ||
475 | |||
476 | c->busa = ib_dma_map_single(rdma->cm_id->device, | ||
477 | c->req->tc->sdata, c->req->tc->size, | ||
478 | DMA_TO_DEVICE); | ||
479 | if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) | ||
480 | goto error; | ||
481 | |||
482 | sge.addr = c->busa; | ||
483 | sge.length = c->req->tc->size; | ||
484 | sge.lkey = rdma->lkey; | ||
485 | |||
486 | wr.next = NULL; | ||
487 | c->wc_op = IB_WC_SEND; | ||
488 | wr.wr_id = (unsigned long) c; | ||
489 | wr.opcode = IB_WR_SEND; | ||
490 | wr.send_flags = IB_SEND_SIGNALED; | ||
491 | wr.sg_list = &sge; | ||
492 | wr.num_sge = 1; | ||
493 | |||
494 | if (down_interruptible(&rdma->sq_sem)) | ||
495 | goto error; | ||
496 | |||
497 | return ib_post_send(rdma->qp, &wr, &bad_wr); | ||
498 | |||
499 | error: | ||
500 | P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); | ||
501 | return -EIO; | ||
502 | |||
503 | err_close: | ||
504 | spin_lock_irqsave(&rdma->req_lock, flags); | ||
505 | if (rdma->state < P9_RDMA_CLOSING) { | ||
506 | rdma->state = P9_RDMA_CLOSING; | ||
507 | spin_unlock_irqrestore(&rdma->req_lock, flags); | ||
508 | rdma_disconnect(rdma->cm_id); | ||
509 | } else | ||
510 | spin_unlock_irqrestore(&rdma->req_lock, flags); | ||
511 | return err; | ||
512 | } | ||
513 | |||
514 | static void rdma_close(struct p9_client *client) | ||
515 | { | ||
516 | struct p9_trans_rdma *rdma; | ||
517 | |||
518 | if (!client) | ||
519 | return; | ||
520 | |||
521 | rdma = client->trans; | ||
522 | if (!rdma) | ||
523 | return; | ||
524 | |||
525 | client->status = Disconnected; | ||
526 | rdma_disconnect(rdma->cm_id); | ||
527 | rdma_destroy_trans(rdma); | ||
528 | } | ||
529 | |||
530 | /** | ||
531 | * alloc_rdma - Allocate and initialize the rdma transport structure | ||
532 | * @msize: MTU | ||
533 | * @dotu: Extension attribute | ||
534 | * @opts: Mount options structure | ||
535 | */ | ||
536 | static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) | ||
537 | { | ||
538 | struct p9_trans_rdma *rdma; | ||
539 | |||
540 | rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); | ||
541 | if (!rdma) | ||
542 | return NULL; | ||
543 | |||
544 | rdma->sq_depth = opts->sq_depth; | ||
545 | rdma->rq_depth = opts->rq_depth; | ||
546 | rdma->timeout = opts->timeout; | ||
547 | spin_lock_init(&rdma->req_lock); | ||
548 | init_completion(&rdma->cm_done); | ||
549 | sema_init(&rdma->sq_sem, rdma->sq_depth); | ||
550 | atomic_set(&rdma->rq_count, 0); | ||
551 | |||
552 | return rdma; | ||
553 | } | ||
554 | |||
/*
 * Transport cancel method.  It's not clear that anything can be done once
 * the send has been posted, so no cancellation is attempted and 1 is
 * always returned.
 */
static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
{
	return 1;
}
560 | |||
561 | /** | ||
562 | * trans_create_rdma - Transport method for creating atransport instance | ||
563 | * @client: client instance | ||
564 | * @addr: IP address string | ||
565 | * @args: Mount options string | ||
566 | */ | ||
567 | static int | ||
568 | rdma_create_trans(struct p9_client *client, const char *addr, char *args) | ||
569 | { | ||
570 | int err; | ||
571 | struct p9_rdma_opts opts; | ||
572 | struct p9_trans_rdma *rdma; | ||
573 | struct rdma_conn_param conn_param; | ||
574 | struct ib_qp_init_attr qp_attr; | ||
575 | struct ib_device_attr devattr; | ||
576 | |||
577 | /* Parse the transport specific mount options */ | ||
578 | err = parse_opts(args, &opts); | ||
579 | if (err < 0) | ||
580 | return err; | ||
581 | |||
582 | /* Create and initialize the RDMA transport structure */ | ||
583 | rdma = alloc_rdma(&opts); | ||
584 | if (!rdma) | ||
585 | return -ENOMEM; | ||
586 | |||
587 | /* Create the RDMA CM ID */ | ||
588 | rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP); | ||
589 | if (IS_ERR(rdma->cm_id)) | ||
590 | goto error; | ||
591 | |||
592 | /* Resolve the server's address */ | ||
593 | rdma->addr.sin_family = AF_INET; | ||
594 | rdma->addr.sin_addr.s_addr = in_aton(addr); | ||
595 | rdma->addr.sin_port = htons(opts.port); | ||
596 | err = rdma_resolve_addr(rdma->cm_id, NULL, | ||
597 | (struct sockaddr *)&rdma->addr, | ||
598 | rdma->timeout); | ||
599 | if (err) | ||
600 | goto error; | ||
601 | err = wait_for_completion_interruptible(&rdma->cm_done); | ||
602 | if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) | ||
603 | goto error; | ||
604 | |||
605 | /* Resolve the route to the server */ | ||
606 | err = rdma_resolve_route(rdma->cm_id, rdma->timeout); | ||
607 | if (err) | ||
608 | goto error; | ||
609 | err = wait_for_completion_interruptible(&rdma->cm_done); | ||
610 | if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) | ||
611 | goto error; | ||
612 | |||
613 | /* Query the device attributes */ | ||
614 | err = ib_query_device(rdma->cm_id->device, &devattr); | ||
615 | if (err) | ||
616 | goto error; | ||
617 | |||
618 | /* Create the Completion Queue */ | ||
619 | rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, | ||
620 | cq_event_handler, client, | ||
621 | opts.sq_depth + opts.rq_depth + 1, 0); | ||
622 | if (IS_ERR(rdma->cq)) | ||
623 | goto error; | ||
624 | ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); | ||
625 | |||
626 | /* Create the Protection Domain */ | ||
627 | rdma->pd = ib_alloc_pd(rdma->cm_id->device); | ||
628 | if (IS_ERR(rdma->pd)) | ||
629 | goto error; | ||
630 | |||
631 | /* Cache the DMA lkey in the transport */ | ||
632 | rdma->dma_mr = NULL; | ||
633 | if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) | ||
634 | rdma->lkey = rdma->cm_id->device->local_dma_lkey; | ||
635 | else { | ||
636 | rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); | ||
637 | if (IS_ERR(rdma->dma_mr)) | ||
638 | goto error; | ||
639 | rdma->lkey = rdma->dma_mr->lkey; | ||
640 | } | ||
641 | |||
642 | /* Create the Queue Pair */ | ||
643 | memset(&qp_attr, 0, sizeof qp_attr); | ||
644 | qp_attr.event_handler = qp_event_handler; | ||
645 | qp_attr.qp_context = client; | ||
646 | qp_attr.cap.max_send_wr = opts.sq_depth; | ||
647 | qp_attr.cap.max_recv_wr = opts.rq_depth; | ||
648 | qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; | ||
649 | qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; | ||
650 | qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; | ||
651 | qp_attr.qp_type = IB_QPT_RC; | ||
652 | qp_attr.send_cq = rdma->cq; | ||
653 | qp_attr.recv_cq = rdma->cq; | ||
654 | err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); | ||
655 | if (err) | ||
656 | goto error; | ||
657 | rdma->qp = rdma->cm_id->qp; | ||
658 | |||
659 | /* Request a connection */ | ||
660 | memset(&conn_param, 0, sizeof(conn_param)); | ||
661 | conn_param.private_data = NULL; | ||
662 | conn_param.private_data_len = 0; | ||
663 | conn_param.responder_resources = P9_RDMA_IRD; | ||
664 | conn_param.initiator_depth = P9_RDMA_ORD; | ||
665 | err = rdma_connect(rdma->cm_id, &conn_param); | ||
666 | if (err) | ||
667 | goto error; | ||
668 | err = wait_for_completion_interruptible(&rdma->cm_done); | ||
669 | if (err || (rdma->state != P9_RDMA_CONNECTED)) | ||
670 | goto error; | ||
671 | |||
672 | client->trans = rdma; | ||
673 | client->status = Connected; | ||
674 | |||
675 | return 0; | ||
676 | |||
677 | error: | ||
678 | rdma_destroy_trans(rdma); | ||
679 | return -ENOTCONN; | ||
680 | } | ||
681 | |||
/*
 * Transport module descriptor for the "rdma" transport.  It is registered
 * with the 9P core in p9_trans_rdma_init() and removed again in
 * p9_trans_rdma_exit().
 */
static struct p9_trans_module p9_rdma_trans = {
	.name = "rdma",
	.maxsize = P9_RDMA_MAXSIZE,
	.def = 0,
	.owner = THIS_MODULE,
	.create = rdma_create_trans,
	.close = rdma_close,
	.request = rdma_request,
	.cancel = rdma_cancel,
};
692 | |||
693 | /** | ||
694 | * p9_trans_rdma_init - Register the 9P RDMA transport driver | ||
695 | */ | ||
696 | static int __init p9_trans_rdma_init(void) | ||
697 | { | ||
698 | v9fs_register_trans(&p9_rdma_trans); | ||
699 | return 0; | ||
700 | } | ||
701 | |||
702 | static void __exit p9_trans_rdma_exit(void) | ||
703 | { | ||
704 | v9fs_unregister_trans(&p9_rdma_trans); | ||
705 | } | ||
706 | |||
/* Module entry and exit points */
module_init(p9_trans_rdma_init);
module_exit(p9_trans_rdma_exit);

/* Module metadata */
MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
MODULE_DESCRIPTION("RDMA Transport for 9P");
MODULE_LICENSE("Dual BSD/GPL");