aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author J. Bruce Fields <bfields@citi.umich.edu> 2008-05-20 19:57:38 -0400
committer J. Bruce Fields <bfields@citi.umich.edu> 2008-05-20 19:57:38 -0400
commit 68432a03f8ff93a9eecee95cc8f02e7fe2025cc9 (patch)
tree 9a9e98a0233a7f3b4d66df19d8ded26d0b41c0ed
parent d71a4dd72e67210ae0767ccae69c79f1c933ff64 (diff)
parent a6f911c04e20b98feb4b33d3aba2976851977d6a (diff)
Merge branch 'from-tomtucker' into for-2.6.26
-rw-r--r-- include/linux/sunrpc/svc_rdma.h 12
-rw-r--r-- net/sunrpc/svc_xprt.c 23
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 102
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_sendto.c 11
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_transport.c 290
5 files changed, 223 insertions, 215 deletions
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index c11bbcc081f9..05eb4664d0dd 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -71,7 +71,8 @@ extern atomic_t rdma_stat_sq_prod;
71 * completes. 71 * completes.
72 */ 72 */
73struct svc_rdma_op_ctxt { 73struct svc_rdma_op_ctxt {
74 struct svc_rdma_op_ctxt *next; 74 struct svc_rdma_op_ctxt *read_hdr;
75 struct list_head free_list;
75 struct xdr_buf arg; 76 struct xdr_buf arg;
76 struct list_head dto_q; 77 struct list_head dto_q;
77 enum ib_wr_opcode wr_op; 78 enum ib_wr_opcode wr_op;
@@ -85,7 +86,6 @@ struct svc_rdma_op_ctxt {
85 struct page *pages[RPCSVC_MAXPAGES]; 86 struct page *pages[RPCSVC_MAXPAGES];
86}; 87};
87 88
88#define RDMACTXT_F_READ_DONE 1
89#define RDMACTXT_F_LAST_CTXT 2 89#define RDMACTXT_F_LAST_CTXT 2
90 90
91struct svcxprt_rdma { 91struct svcxprt_rdma {
@@ -104,7 +104,8 @@ struct svcxprt_rdma {
104 104
105 struct ib_pd *sc_pd; 105 struct ib_pd *sc_pd;
106 106
107 struct svc_rdma_op_ctxt *sc_ctxt_head; 107 atomic_t sc_ctxt_used;
108 struct list_head sc_ctxt_free;
108 int sc_ctxt_cnt; 109 int sc_ctxt_cnt;
109 int sc_ctxt_bump; 110 int sc_ctxt_bump;
110 int sc_ctxt_max; 111 int sc_ctxt_max;
@@ -123,6 +124,7 @@ struct svcxprt_rdma {
123 struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */ 124 struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */
124 struct list_head sc_read_complete_q; 125 struct list_head sc_read_complete_q;
125 spinlock_t sc_read_complete_lock; 126 spinlock_t sc_read_complete_lock;
127 struct work_struct sc_work;
126}; 128};
127/* sc_flags */ 129/* sc_flags */
128#define RDMAXPRT_RQ_PENDING 1 130#define RDMAXPRT_RQ_PENDING 1
@@ -164,8 +166,8 @@ extern int svc_rdma_sendto(struct svc_rqst *);
164 166
165/* svc_rdma_transport.c */ 167/* svc_rdma_transport.c */
166extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); 168extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
167extern int svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, 169extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
168 enum rpcrdma_errcode); 170 enum rpcrdma_errcode);
169struct page *svc_rdma_get_page(void); 171struct page *svc_rdma_get_page(void);
170extern int svc_rdma_post_recv(struct svcxprt_rdma *); 172extern int svc_rdma_post_recv(struct svcxprt_rdma *);
171extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); 173extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index d8e8d79a8451..e46c825f4954 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -6,30 +6,9 @@
6 6
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/fcntl.h>
10#include <linux/net.h>
11#include <linux/in.h>
12#include <linux/inet.h>
13#include <linux/udp.h>
14#include <linux/tcp.h>
15#include <linux/unistd.h>
16#include <linux/slab.h>
17#include <linux/netdevice.h>
18#include <linux/skbuff.h>
19#include <linux/file.h>
20#include <linux/freezer.h> 9#include <linux/freezer.h>
21#include <linux/kthread.h> 10#include <linux/kthread.h>
22#include <net/sock.h> 11#include <net/sock.h>
23#include <net/checksum.h>
24#include <net/ip.h>
25#include <net/ipv6.h>
26#include <net/tcp_states.h>
27#include <linux/uaccess.h>
28#include <asm/ioctls.h>
29
30#include <linux/sunrpc/types.h>
31#include <linux/sunrpc/clnt.h>
32#include <linux/sunrpc/xdr.h>
33#include <linux/sunrpc/stats.h> 12#include <linux/sunrpc/stats.h>
34#include <linux/sunrpc/svc_xprt.h> 13#include <linux/sunrpc/svc_xprt.h>
35 14
@@ -296,8 +275,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
296 if (!(xprt->xpt_flags & 275 if (!(xprt->xpt_flags &
297 ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) 276 ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
298 return; 277 return;
299 if (test_bit(XPT_DEAD, &xprt->xpt_flags))
300 return;
301 278
302 cpu = get_cpu(); 279 cpu = get_cpu();
303 pool = svc_pool_for_cpu(xprt->xpt_server, cpu); 280 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c22d6b6f2db4..06ab4841537b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -260,11 +260,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
260 * On our side, we need to read into a pagelist. The first page immediately 260 * On our side, we need to read into a pagelist. The first page immediately
261 * follows the RPC header. 261 * follows the RPC header.
262 * 262 *
263 * This function returns 1 to indicate success. The data is not yet in 263 * This function returns:
264 * 0 - No error and no read-list found.
265 *
266 * 1 - Successful read-list processing. The data is not yet in
264 * the pagelist and therefore the RPC request must be deferred. The 267 * the pagelist and therefore the RPC request must be deferred. The
265 * I/O completion will enqueue the transport again and 268 * I/O completion will enqueue the transport again and
266 * svc_rdma_recvfrom will complete the request. 269 * svc_rdma_recvfrom will complete the request.
267 * 270 *
271 * <0 - Error processing/posting read-list.
272 *
268 * NOTE: The ctxt must not be touched after the last WR has been posted 273 * NOTE: The ctxt must not be touched after the last WR has been posted
269 * because the I/O completion processing may occur on another 274 * because the I/O completion processing may occur on another
270 * processor and free / modify the context. Ne touche pas! 275 * processor and free / modify the context. Ne touche pas!
@@ -284,7 +289,6 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
284 u64 sgl_offset; 289 u64 sgl_offset;
285 struct rpcrdma_read_chunk *ch; 290 struct rpcrdma_read_chunk *ch;
286 struct svc_rdma_op_ctxt *ctxt = NULL; 291 struct svc_rdma_op_ctxt *ctxt = NULL;
287 struct svc_rdma_op_ctxt *head;
288 struct svc_rdma_op_ctxt *tmp_sge_ctxt; 292 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
289 struct svc_rdma_op_ctxt *tmp_ch_ctxt; 293 struct svc_rdma_op_ctxt *tmp_ch_ctxt;
290 struct chunk_sge *ch_sge_ary; 294 struct chunk_sge *ch_sge_ary;
@@ -302,25 +306,19 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
302 ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; 306 ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
303 307
304 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); 308 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
309 if (ch_count > RPCSVC_MAXPAGES)
310 return -EINVAL;
305 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, 311 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
306 sge, ch_sge_ary, 312 sge, ch_sge_ary,
307 ch_count, byte_count); 313 ch_count, byte_count);
308 head = svc_rdma_get_context(xprt);
309 sgl_offset = 0; 314 sgl_offset = 0;
310 ch_no = 0; 315 ch_no = 0;
311 316
312 for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 317 for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
313 ch->rc_discrim != 0; ch++, ch_no++) { 318 ch->rc_discrim != 0; ch++, ch_no++) {
314next_sge: 319next_sge:
315 if (!ctxt) 320 ctxt = svc_rdma_get_context(xprt);
316 ctxt = head;
317 else {
318 ctxt->next = svc_rdma_get_context(xprt);
319 ctxt = ctxt->next;
320 }
321 ctxt->next = NULL;
322 ctxt->direction = DMA_FROM_DEVICE; 321 ctxt->direction = DMA_FROM_DEVICE;
323 clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
324 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 322 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
325 323
326 /* Prepare READ WR */ 324 /* Prepare READ WR */
@@ -347,20 +345,15 @@ next_sge:
347 * the client and the RPC needs to be enqueued. 345 * the client and the RPC needs to be enqueued.
348 */ 346 */
349 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 347 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
350 ctxt->next = hdr_ctxt; 348 ctxt->read_hdr = hdr_ctxt;
351 hdr_ctxt->next = head;
352 } 349 }
353 /* Post the read */ 350 /* Post the read */
354 err = svc_rdma_send(xprt, &read_wr); 351 err = svc_rdma_send(xprt, &read_wr);
355 if (err) { 352 if (err) {
356 printk(KERN_ERR "svcrdma: Error posting send = %d\n", 353 printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
357 err); 354 err);
358 /* 355 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
359 * Break the circular list so free knows when 356 svc_rdma_put_context(ctxt, 0);
360 * to stop if the error happened to occur on
361 * the last read
362 */
363 ctxt->next = NULL;
364 goto out; 357 goto out;
365 } 358 }
366 atomic_inc(&rdma_stat_read); 359 atomic_inc(&rdma_stat_read);
@@ -371,7 +364,7 @@ next_sge:
371 goto next_sge; 364 goto next_sge;
372 } 365 }
373 sgl_offset = 0; 366 sgl_offset = 0;
374 err = 0; 367 err = 1;
375 } 368 }
376 369
377 out: 370 out:
@@ -389,25 +382,12 @@ next_sge:
389 while (rqstp->rq_resused) 382 while (rqstp->rq_resused)
390 rqstp->rq_respages[--rqstp->rq_resused] = NULL; 383 rqstp->rq_respages[--rqstp->rq_resused] = NULL;
391 384
392 if (err) { 385 return err;
393 printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
394 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
395 /* Free the linked list of read contexts */
396 while (head != NULL) {
397 ctxt = head->next;
398 svc_rdma_put_context(head, 1);
399 head = ctxt;
400 }
401 return 0;
402 }
403
404 return 1;
405} 386}
406 387
407static int rdma_read_complete(struct svc_rqst *rqstp, 388static int rdma_read_complete(struct svc_rqst *rqstp,
408 struct svc_rdma_op_ctxt *data) 389 struct svc_rdma_op_ctxt *head)
409{ 390{
410 struct svc_rdma_op_ctxt *head = data->next;
411 int page_no; 391 int page_no;
412 int ret; 392 int ret;
413 393
@@ -433,21 +413,12 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
433 rqstp->rq_arg.len = head->arg.len; 413 rqstp->rq_arg.len = head->arg.len;
434 rqstp->rq_arg.buflen = head->arg.buflen; 414 rqstp->rq_arg.buflen = head->arg.buflen;
435 415
416 /* Free the context */
417 svc_rdma_put_context(head, 0);
418
436 /* XXX: What should this be? */ 419 /* XXX: What should this be? */
437 rqstp->rq_prot = IPPROTO_MAX; 420 rqstp->rq_prot = IPPROTO_MAX;
438 421 svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
439 /*
440 * Free the contexts we used to build the RDMA_READ. We have
441 * to be careful here because the context list uses the same
442 * next pointer used to chain the contexts associated with the
443 * RDMA_READ
444 */
445 data->next = NULL; /* terminate circular list */
446 do {
447 data = head->next;
448 svc_rdma_put_context(head, 0);
449 head = data;
450 } while (head != NULL);
451 422
452 ret = rqstp->rq_arg.head[0].iov_len 423 ret = rqstp->rq_arg.head[0].iov_len
453 + rqstp->rq_arg.page_len 424 + rqstp->rq_arg.page_len
@@ -457,8 +428,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
457 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, 428 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
458 rqstp->rq_arg.head[0].iov_len); 429 rqstp->rq_arg.head[0].iov_len);
459 430
460 /* Indicate that we've consumed an RQ credit */
461 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
462 svc_xprt_received(rqstp->rq_xprt); 431 svc_xprt_received(rqstp->rq_xprt);
463 return ret; 432 return ret;
464} 433}
@@ -480,13 +449,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
480 449
481 dprintk("svcrdma: rqstp=%p\n", rqstp); 450 dprintk("svcrdma: rqstp=%p\n", rqstp);
482 451
483 /*
484 * The rq_xprt_ctxt indicates if we've consumed an RQ credit
485 * or not. It is used in the rdma xpo_release_rqst function to
486 * determine whether or not to return an RQ WQE to the RQ.
487 */
488 rqstp->rq_xprt_ctxt = NULL;
489
490 spin_lock_bh(&rdma_xprt->sc_read_complete_lock); 452 spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
491 if (!list_empty(&rdma_xprt->sc_read_complete_q)) { 453 if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
492 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, 454 ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
@@ -537,21 +499,22 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
537 /* If the request is invalid, reply with an error */ 499 /* If the request is invalid, reply with an error */
538 if (len < 0) { 500 if (len < 0) {
539 if (len == -ENOSYS) 501 if (len == -ENOSYS)
540 (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); 502 svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
541 goto close_out; 503 goto close_out;
542 } 504 }
543 505
544 /* Read read-list data. If we would need to wait, defer 506 /* Read read-list data. */
545 * it. Not that in this case, we don't return the RQ credit 507 ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
546 * until after the read completes. 508 if (ret > 0) {
547 */ 509 /* read-list posted, defer until data received from client. */
548 if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
549 svc_xprt_received(xprt); 510 svc_xprt_received(xprt);
550 return 0; 511 return 0;
551 } 512 }
552 513 if (ret < 0) {
553 /* Indicate we've consumed an RQ credit */ 514 /* Post of read-list failed, free context. */
554 rqstp->rq_xprt_ctxt = rqstp->rq_xprt; 515 svc_rdma_put_context(ctxt, 1);
516 return 0;
517 }
555 518
556 ret = rqstp->rq_arg.head[0].iov_len 519 ret = rqstp->rq_arg.head[0].iov_len
557 + rqstp->rq_arg.page_len 520 + rqstp->rq_arg.page_len
@@ -569,11 +532,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
569 return ret; 532 return ret;
570 533
571 close_out: 534 close_out:
572 if (ctxt) { 535 if (ctxt)
573 svc_rdma_put_context(ctxt, 1); 536 svc_rdma_put_context(ctxt, 1);
574 /* Indicate we've consumed an RQ credit */
575 rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
576 }
577 dprintk("svcrdma: transport %p is closing\n", xprt); 537 dprintk("svcrdma: transport %p is closing\n", xprt);
578 /* 538 /*
579 * Set the close bit and enqueue it. svc_recv will see the 539 * Set the close bit and enqueue it. svc_recv will see the
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 981f190c1b39..fb82b1b683f8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -389,6 +389,17 @@ static int send_reply(struct svcxprt_rdma *rdma,
389 int page_no; 389 int page_no;
390 int ret; 390 int ret;
391 391
392 /* Post a recv buffer to handle another request. */
393 ret = svc_rdma_post_recv(rdma);
394 if (ret) {
395 printk(KERN_INFO
396 "svcrdma: could not post a receive buffer, err=%d."
397 "Closing transport %p.\n", ret, rdma);
398 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
399 svc_rdma_put_context(ctxt, 0);
400 return -ENOTCONN;
401 }
402
392 /* Prepare the context */ 403 /* Prepare the context */
393 ctxt->pages[0] = page; 404 ctxt->pages[0] = page;
394 ctxt->count = 1; 405 ctxt->count = 1;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index af408fc12634..e132509d1db0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -103,8 +103,8 @@ static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
103 spin_lock_bh(&xprt->sc_ctxt_lock); 103 spin_lock_bh(&xprt->sc_ctxt_lock);
104 if (ctxt) { 104 if (ctxt) {
105 at_least_one = 1; 105 at_least_one = 1;
106 ctxt->next = xprt->sc_ctxt_head; 106 INIT_LIST_HEAD(&ctxt->free_list);
107 xprt->sc_ctxt_head = ctxt; 107 list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
108 } else { 108 } else {
109 /* kmalloc failed...give up for now */ 109 /* kmalloc failed...give up for now */
110 xprt->sc_ctxt_cnt--; 110 xprt->sc_ctxt_cnt--;
@@ -123,7 +123,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
123 123
124 while (1) { 124 while (1) {
125 spin_lock_bh(&xprt->sc_ctxt_lock); 125 spin_lock_bh(&xprt->sc_ctxt_lock);
126 if (unlikely(xprt->sc_ctxt_head == NULL)) { 126 if (unlikely(list_empty(&xprt->sc_ctxt_free))) {
127 /* Try to bump my cache. */ 127 /* Try to bump my cache. */
128 spin_unlock_bh(&xprt->sc_ctxt_lock); 128 spin_unlock_bh(&xprt->sc_ctxt_lock);
129 129
@@ -136,12 +136,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
136 schedule_timeout_uninterruptible(msecs_to_jiffies(500)); 136 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
137 continue; 137 continue;
138 } 138 }
139 ctxt = xprt->sc_ctxt_head; 139 ctxt = list_entry(xprt->sc_ctxt_free.next,
140 xprt->sc_ctxt_head = ctxt->next; 140 struct svc_rdma_op_ctxt,
141 free_list);
142 list_del_init(&ctxt->free_list);
141 spin_unlock_bh(&xprt->sc_ctxt_lock); 143 spin_unlock_bh(&xprt->sc_ctxt_lock);
142 ctxt->xprt = xprt; 144 ctxt->xprt = xprt;
143 INIT_LIST_HEAD(&ctxt->dto_q); 145 INIT_LIST_HEAD(&ctxt->dto_q);
144 ctxt->count = 0; 146 ctxt->count = 0;
147 atomic_inc(&xprt->sc_ctxt_used);
145 break; 148 break;
146 } 149 }
147 return ctxt; 150 return ctxt;
@@ -159,14 +162,15 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
159 put_page(ctxt->pages[i]); 162 put_page(ctxt->pages[i]);
160 163
161 for (i = 0; i < ctxt->count; i++) 164 for (i = 0; i < ctxt->count; i++)
162 dma_unmap_single(xprt->sc_cm_id->device->dma_device, 165 ib_dma_unmap_single(xprt->sc_cm_id->device,
163 ctxt->sge[i].addr, 166 ctxt->sge[i].addr,
164 ctxt->sge[i].length, 167 ctxt->sge[i].length,
165 ctxt->direction); 168 ctxt->direction);
169
166 spin_lock_bh(&xprt->sc_ctxt_lock); 170 spin_lock_bh(&xprt->sc_ctxt_lock);
167 ctxt->next = xprt->sc_ctxt_head; 171 list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
168 xprt->sc_ctxt_head = ctxt;
169 spin_unlock_bh(&xprt->sc_ctxt_lock); 172 spin_unlock_bh(&xprt->sc_ctxt_lock);
173 atomic_dec(&xprt->sc_ctxt_used);
170} 174}
171 175
172/* ib_cq event handler */ 176/* ib_cq event handler */
@@ -228,23 +232,8 @@ static void dto_tasklet_func(unsigned long data)
228 list_del_init(&xprt->sc_dto_q); 232 list_del_init(&xprt->sc_dto_q);
229 spin_unlock_irqrestore(&dto_lock, flags); 233 spin_unlock_irqrestore(&dto_lock, flags);
230 234
231 if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { 235 rq_cq_reap(xprt);
232 ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); 236 sq_cq_reap(xprt);
233 rq_cq_reap(xprt);
234 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
235 /*
236 * If data arrived before established event,
237 * don't enqueue. This defers RPC I/O until the
238 * RDMA connection is complete.
239 */
240 if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
241 svc_xprt_enqueue(&xprt->sc_xprt);
242 }
243
244 if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
245 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
246 sq_cq_reap(xprt);
247 }
248 237
249 svc_xprt_put(&xprt->sc_xprt); 238 svc_xprt_put(&xprt->sc_xprt);
250 spin_lock_irqsave(&dto_lock, flags); 239 spin_lock_irqsave(&dto_lock, flags);
@@ -263,11 +252,15 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
263 struct svcxprt_rdma *xprt = cq_context; 252 struct svcxprt_rdma *xprt = cq_context;
264 unsigned long flags; 253 unsigned long flags;
265 254
255 /* Guard against unconditional flush call for destroyed QP */
256 if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
257 return;
258
266 /* 259 /*
267 * Set the bit regardless of whether or not it's on the list 260 * Set the bit regardless of whether or not it's on the list
268 * because it may be on the list already due to an SQ 261 * because it may be on the list already due to an SQ
269 * completion. 262 * completion.
270 */ 263 */
271 set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); 264 set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
272 265
273 /* 266 /*
@@ -290,6 +283,8 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
290 * 283 *
291 * Take all completing WC off the CQE and enqueue the associated DTO 284 * Take all completing WC off the CQE and enqueue the associated DTO
292 * context on the dto_q for the transport. 285 * context on the dto_q for the transport.
286 *
287 * Note that caller must hold a transport reference.
293 */ 288 */
294static void rq_cq_reap(struct svcxprt_rdma *xprt) 289static void rq_cq_reap(struct svcxprt_rdma *xprt)
295{ 290{
@@ -297,29 +292,47 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
297 struct ib_wc wc; 292 struct ib_wc wc;
298 struct svc_rdma_op_ctxt *ctxt = NULL; 293 struct svc_rdma_op_ctxt *ctxt = NULL;
299 294
295 if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
296 return;
297
298 ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
300 atomic_inc(&rdma_stat_rq_poll); 299 atomic_inc(&rdma_stat_rq_poll);
301 300
302 spin_lock_bh(&xprt->sc_rq_dto_lock);
303 while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { 301 while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
304 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 302 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
305 ctxt->wc_status = wc.status; 303 ctxt->wc_status = wc.status;
306 ctxt->byte_len = wc.byte_len; 304 ctxt->byte_len = wc.byte_len;
307 if (wc.status != IB_WC_SUCCESS) { 305 if (wc.status != IB_WC_SUCCESS) {
308 /* Close the transport */ 306 /* Close the transport */
307 dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
309 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 308 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
310 svc_rdma_put_context(ctxt, 1); 309 svc_rdma_put_context(ctxt, 1);
310 svc_xprt_put(&xprt->sc_xprt);
311 continue; 311 continue;
312 } 312 }
313 spin_lock_bh(&xprt->sc_rq_dto_lock);
313 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); 314 list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
315 spin_unlock_bh(&xprt->sc_rq_dto_lock);
316 svc_xprt_put(&xprt->sc_xprt);
314 } 317 }
315 spin_unlock_bh(&xprt->sc_rq_dto_lock);
316 318
317 if (ctxt) 319 if (ctxt)
318 atomic_inc(&rdma_stat_rq_prod); 320 atomic_inc(&rdma_stat_rq_prod);
321
322 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
323 /*
324 * If data arrived before established event,
325 * don't enqueue. This defers RPC I/O until the
326 * RDMA connection is complete.
327 */
328 if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
329 svc_xprt_enqueue(&xprt->sc_xprt);
319} 330}
320 331
321/* 332/*
322 * Send Queue Completion Handler - potentially called on interrupt context. 333 * Send Queue Completion Handler - potentially called on interrupt context.
334 *
335 * Note that caller must hold a transport reference.
323 */ 336 */
324static void sq_cq_reap(struct svcxprt_rdma *xprt) 337static void sq_cq_reap(struct svcxprt_rdma *xprt)
325{ 338{
@@ -328,6 +341,11 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
328 struct ib_cq *cq = xprt->sc_sq_cq; 341 struct ib_cq *cq = xprt->sc_sq_cq;
329 int ret; 342 int ret;
330 343
344
345 if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
346 return;
347
348 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
331 atomic_inc(&rdma_stat_sq_poll); 349 atomic_inc(&rdma_stat_sq_poll);
332 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { 350 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
333 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 351 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
@@ -349,14 +367,16 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
349 367
350 case IB_WR_RDMA_READ: 368 case IB_WR_RDMA_READ:
351 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 369 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
370 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
371 BUG_ON(!read_hdr);
352 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 372 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
353 set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
354 spin_lock_bh(&xprt->sc_read_complete_lock); 373 spin_lock_bh(&xprt->sc_read_complete_lock);
355 list_add_tail(&ctxt->dto_q, 374 list_add_tail(&read_hdr->dto_q,
356 &xprt->sc_read_complete_q); 375 &xprt->sc_read_complete_q);
357 spin_unlock_bh(&xprt->sc_read_complete_lock); 376 spin_unlock_bh(&xprt->sc_read_complete_lock);
358 svc_xprt_enqueue(&xprt->sc_xprt); 377 svc_xprt_enqueue(&xprt->sc_xprt);
359 } 378 }
379 svc_rdma_put_context(ctxt, 0);
360 break; 380 break;
361 381
362 default: 382 default:
@@ -365,6 +385,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
365 wc.opcode, wc.status); 385 wc.opcode, wc.status);
366 break; 386 break;
367 } 387 }
388 svc_xprt_put(&xprt->sc_xprt);
368 } 389 }
369 390
370 if (ctxt) 391 if (ctxt)
@@ -376,11 +397,15 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
376 struct svcxprt_rdma *xprt = cq_context; 397 struct svcxprt_rdma *xprt = cq_context;
377 unsigned long flags; 398 unsigned long flags;
378 399
400 /* Guard against unconditional flush call for destroyed QP */
401 if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
402 return;
403
379 /* 404 /*
380 * Set the bit regardless of whether or not it's on the list 405 * Set the bit regardless of whether or not it's on the list
381 * because it may be on the list already due to an RQ 406 * because it may be on the list already due to an RQ
382 * completion. 407 * completion.
383 */ 408 */
384 set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); 409 set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
385 410
386 /* 411 /*
@@ -407,28 +432,29 @@ static void create_context_cache(struct svcxprt_rdma *xprt,
407 xprt->sc_ctxt_max = ctxt_max; 432 xprt->sc_ctxt_max = ctxt_max;
408 xprt->sc_ctxt_bump = ctxt_bump; 433 xprt->sc_ctxt_bump = ctxt_bump;
409 xprt->sc_ctxt_cnt = 0; 434 xprt->sc_ctxt_cnt = 0;
410 xprt->sc_ctxt_head = NULL; 435 atomic_set(&xprt->sc_ctxt_used, 0);
436
437 INIT_LIST_HEAD(&xprt->sc_ctxt_free);
411 for (i = 0; i < ctxt_count; i++) { 438 for (i = 0; i < ctxt_count; i++) {
412 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); 439 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
413 if (ctxt) { 440 if (ctxt) {
414 ctxt->next = xprt->sc_ctxt_head; 441 INIT_LIST_HEAD(&ctxt->free_list);
415 xprt->sc_ctxt_head = ctxt; 442 list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
416 xprt->sc_ctxt_cnt++; 443 xprt->sc_ctxt_cnt++;
417 } 444 }
418 } 445 }
419} 446}
420 447
421static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) 448static void destroy_context_cache(struct svcxprt_rdma *xprt)
422{ 449{
423 struct svc_rdma_op_ctxt *next; 450 while (!list_empty(&xprt->sc_ctxt_free)) {
424 if (!ctxt) 451 struct svc_rdma_op_ctxt *ctxt;
425 return; 452 ctxt = list_entry(xprt->sc_ctxt_free.next,
426 453 struct svc_rdma_op_ctxt,
427 do { 454 free_list);
428 next = ctxt->next; 455 list_del_init(&ctxt->free_list);
429 kfree(ctxt); 456 kfree(ctxt);
430 ctxt = next; 457 }
431 } while (next);
432} 458}
433 459
434static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, 460static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
@@ -465,7 +491,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
465 reqs + 491 reqs +
466 cma_xprt->sc_sq_depth + 492 cma_xprt->sc_sq_depth +
467 RPCRDMA_MAX_THREADS + 1); /* max */ 493 RPCRDMA_MAX_THREADS + 1); /* max */
468 if (!cma_xprt->sc_ctxt_head) { 494 if (list_empty(&cma_xprt->sc_ctxt_free)) {
469 kfree(cma_xprt); 495 kfree(cma_xprt);
470 return NULL; 496 return NULL;
471 } 497 }
@@ -520,7 +546,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
520 recv_wr.num_sge = ctxt->count; 546 recv_wr.num_sge = ctxt->count;
521 recv_wr.wr_id = (u64)(unsigned long)ctxt; 547 recv_wr.wr_id = (u64)(unsigned long)ctxt;
522 548
549 svc_xprt_get(&xprt->sc_xprt);
523 ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); 550 ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
551 if (ret) {
552 svc_xprt_put(&xprt->sc_xprt);
553 svc_rdma_put_context(ctxt, 1);
554 }
524 return ret; 555 return ret;
525} 556}
526 557
@@ -539,6 +570,7 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
539{ 570{
540 struct svcxprt_rdma *listen_xprt = new_cma_id->context; 571 struct svcxprt_rdma *listen_xprt = new_cma_id->context;
541 struct svcxprt_rdma *newxprt; 572 struct svcxprt_rdma *newxprt;
573 struct sockaddr *sa;
542 574
543 /* Create a new transport */ 575 /* Create a new transport */
544 newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); 576 newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
@@ -551,6 +583,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
551 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", 583 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
552 newxprt, newxprt->sc_cm_id, listen_xprt); 584 newxprt, newxprt->sc_cm_id, listen_xprt);
553 585
586 /* Set the local and remote addresses in the transport */
587 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
588 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
589 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
590 svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
591
554 /* 592 /*
555 * Enqueue the new transport on the accept queue of the listening 593 * Enqueue the new transport on the accept queue of the listening
556 * transport 594 * transport
@@ -627,6 +665,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id,
627 if (xprt) { 665 if (xprt) {
628 set_bit(XPT_CLOSE, &xprt->xpt_flags); 666 set_bit(XPT_CLOSE, &xprt->xpt_flags);
629 svc_xprt_enqueue(xprt); 667 svc_xprt_enqueue(xprt);
668 svc_xprt_put(xprt);
630 } 669 }
631 break; 670 break;
632 case RDMA_CM_EVENT_DEVICE_REMOVAL: 671 case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -661,31 +700,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
661 700
662 cma_xprt = rdma_create_xprt(serv, 1); 701 cma_xprt = rdma_create_xprt(serv, 1);
663 if (!cma_xprt) 702 if (!cma_xprt)
664 return ERR_PTR(ENOMEM); 703 return ERR_PTR(-ENOMEM);
665 xprt = &cma_xprt->sc_xprt; 704 xprt = &cma_xprt->sc_xprt;
666 705
667 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); 706 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
668 if (IS_ERR(listen_id)) { 707 if (IS_ERR(listen_id)) {
669 svc_xprt_put(&cma_xprt->sc_xprt); 708 ret = PTR_ERR(listen_id);
670 dprintk("svcrdma: rdma_create_id failed = %ld\n", 709 dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
671 PTR_ERR(listen_id)); 710 goto err0;
672 return (void *)listen_id;
673 } 711 }
712
674 ret = rdma_bind_addr(listen_id, sa); 713 ret = rdma_bind_addr(listen_id, sa);
675 if (ret) { 714 if (ret) {
676 rdma_destroy_id(listen_id);
677 svc_xprt_put(&cma_xprt->sc_xprt);
678 dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); 715 dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
679 return ERR_PTR(ret); 716 goto err1;
680 } 717 }
681 cma_xprt->sc_cm_id = listen_id; 718 cma_xprt->sc_cm_id = listen_id;
682 719
683 ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); 720 ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
684 if (ret) { 721 if (ret) {
685 rdma_destroy_id(listen_id);
686 svc_xprt_put(&cma_xprt->sc_xprt);
687 dprintk("svcrdma: rdma_listen failed = %d\n", ret); 722 dprintk("svcrdma: rdma_listen failed = %d\n", ret);
688 return ERR_PTR(ret); 723 goto err1;
689 } 724 }
690 725
691 /* 726 /*
@@ -696,6 +731,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
696 svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); 731 svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
697 732
698 return &cma_xprt->sc_xprt; 733 return &cma_xprt->sc_xprt;
734
735 err1:
736 rdma_destroy_id(listen_id);
737 err0:
738 kfree(cma_xprt);
739 return ERR_PTR(ret);
699} 740}
700 741
701/* 742/*
@@ -716,7 +757,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
716 struct rdma_conn_param conn_param; 757 struct rdma_conn_param conn_param;
717 struct ib_qp_init_attr qp_attr; 758 struct ib_qp_init_attr qp_attr;
718 struct ib_device_attr devattr; 759 struct ib_device_attr devattr;
719 struct sockaddr *sa;
720 int ret; 760 int ret;
721 int i; 761 int i;
722 762
@@ -826,7 +866,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
826 newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; 866 newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
827 newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; 867 newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
828 } 868 }
829 svc_xprt_get(&newxprt->sc_xprt);
830 newxprt->sc_qp = newxprt->sc_cm_id->qp; 869 newxprt->sc_qp = newxprt->sc_cm_id->qp;
831 870
832 /* Register all of physical memory */ 871 /* Register all of physical memory */
@@ -850,6 +889,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
850 /* Swap out the handler */ 889 /* Swap out the handler */
851 newxprt->sc_cm_id->event_handler = rdma_cma_handler; 890 newxprt->sc_cm_id->event_handler = rdma_cma_handler;
852 891
892 /*
893 * Arm the CQs for the SQ and RQ before accepting so we can't
894 * miss the first message
895 */
896 ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
897 ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
898
853 /* Accept Connection */ 899 /* Accept Connection */
854 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); 900 set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
855 memset(&conn_param, 0, sizeof conn_param); 901 memset(&conn_param, 0, sizeof conn_param);
@@ -886,58 +932,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
886 newxprt->sc_max_requests, 932 newxprt->sc_max_requests,
887 newxprt->sc_ord); 933 newxprt->sc_ord);
888 934
889 /* Set the local and remote addresses in the transport */
890 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
891 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
892 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
893 svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
894
895 ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
896 ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
897 return &newxprt->sc_xprt; 935 return &newxprt->sc_xprt;
898 936
899 errout: 937 errout:
900 dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); 938 dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
901 /* Take a reference in case the DTO handler runs */ 939 /* Take a reference in case the DTO handler runs */
902 svc_xprt_get(&newxprt->sc_xprt); 940 svc_xprt_get(&newxprt->sc_xprt);
903 if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) { 941 if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
904 ib_destroy_qp(newxprt->sc_qp); 942 ib_destroy_qp(newxprt->sc_qp);
905 svc_xprt_put(&newxprt->sc_xprt);
906 }
907 rdma_destroy_id(newxprt->sc_cm_id); 943 rdma_destroy_id(newxprt->sc_cm_id);
908 /* This call to put will destroy the transport */ 944 /* This call to put will destroy the transport */
909 svc_xprt_put(&newxprt->sc_xprt); 945 svc_xprt_put(&newxprt->sc_xprt);
910 return NULL; 946 return NULL;
911} 947}
912 948
913/*
914 * Post an RQ WQE to the RQ when the rqst is being released. This
915 * effectively returns an RQ credit to the client. The rq_xprt_ctxt
916 * will be null if the request is deferred due to an RDMA_READ or the
917 * transport had no data ready (EAGAIN). Note that an RPC deferred in
918 * svc_process will still return the credit, this is because the data
919 * is copied and no longer consume a WQE/WC.
920 */
921static void svc_rdma_release_rqst(struct svc_rqst *rqstp) 949static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
922{ 950{
923 int err;
924 struct svcxprt_rdma *rdma =
925 container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
926 if (rqstp->rq_xprt_ctxt) {
927 BUG_ON(rqstp->rq_xprt_ctxt != rdma);
928 err = svc_rdma_post_recv(rdma);
929 if (err)
930 dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
931 err);
932 }
933 rqstp->rq_xprt_ctxt = NULL;
934} 951}
935 952
936/* 953/*
937 * When connected, an svc_xprt has at least three references: 954 * When connected, an svc_xprt has at least two references:
938 *
939 * - A reference held by the QP. We still hold that here because this
940 * code deletes the QP and puts the reference.
941 * 955 *
942 * - A reference held by the cm_id between the ESTABLISHED and 956 * - A reference held by the cm_id between the ESTABLISHED and
943 * DISCONNECTED events. If the remote peer disconnected first, this 957 * DISCONNECTED events. If the remote peer disconnected first, this
@@ -946,7 +960,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
946 * - A reference held by the svc_recv code that called this function 960 * - A reference held by the svc_recv code that called this function
947 * as part of close processing. 961 * as part of close processing.
948 * 962 *
949 * At a minimum two references should still be held. 963 * At a minimum one references should still be held.
950 */ 964 */
951static void svc_rdma_detach(struct svc_xprt *xprt) 965static void svc_rdma_detach(struct svc_xprt *xprt)
952{ 966{
@@ -956,23 +970,53 @@ static void svc_rdma_detach(struct svc_xprt *xprt)
956 970
957 /* Disconnect and flush posted WQE */ 971 /* Disconnect and flush posted WQE */
958 rdma_disconnect(rdma->sc_cm_id); 972 rdma_disconnect(rdma->sc_cm_id);
959
960 /* Destroy the QP if present (not a listener) */
961 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) {
962 ib_destroy_qp(rdma->sc_qp);
963 svc_xprt_put(xprt);
964 }
965
966 /* Destroy the CM ID */
967 rdma_destroy_id(rdma->sc_cm_id);
968} 973}
969 974
970static void svc_rdma_free(struct svc_xprt *xprt) 975static void __svc_rdma_free(struct work_struct *work)
971{ 976{
972 struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; 977 struct svcxprt_rdma *rdma =
978 container_of(work, struct svcxprt_rdma, sc_work);
973 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); 979 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
980
974 /* We should only be called from kref_put */ 981 /* We should only be called from kref_put */
975 BUG_ON(atomic_read(&xprt->xpt_ref.refcount) != 0); 982 BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
983
984 /*
985 * Destroy queued, but not processed read completions. Note
986 * that this cleanup has to be done before destroying the
987 * cm_id because the device ptr is needed to unmap the dma in
988 * svc_rdma_put_context.
989 */
990 spin_lock_bh(&rdma->sc_read_complete_lock);
991 while (!list_empty(&rdma->sc_read_complete_q)) {
992 struct svc_rdma_op_ctxt *ctxt;
993 ctxt = list_entry(rdma->sc_read_complete_q.next,
994 struct svc_rdma_op_ctxt,
995 dto_q);
996 list_del_init(&ctxt->dto_q);
997 svc_rdma_put_context(ctxt, 1);
998 }
999 spin_unlock_bh(&rdma->sc_read_complete_lock);
1000
1001 /* Destroy queued, but not processed recv completions */
1002 spin_lock_bh(&rdma->sc_rq_dto_lock);
1003 while (!list_empty(&rdma->sc_rq_dto_q)) {
1004 struct svc_rdma_op_ctxt *ctxt;
1005 ctxt = list_entry(rdma->sc_rq_dto_q.next,
1006 struct svc_rdma_op_ctxt,
1007 dto_q);
1008 list_del_init(&ctxt->dto_q);
1009 svc_rdma_put_context(ctxt, 1);
1010 }
1011 spin_unlock_bh(&rdma->sc_rq_dto_lock);
1012
1013 /* Warn if we leaked a resource or under-referenced */
1014 WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
1015
1016 /* Destroy the QP if present (not a listener) */
1017 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
1018 ib_destroy_qp(rdma->sc_qp);
1019
976 if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) 1020 if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
977 ib_destroy_cq(rdma->sc_sq_cq); 1021 ib_destroy_cq(rdma->sc_sq_cq);
978 1022
@@ -985,10 +1029,21 @@ static void svc_rdma_free(struct svc_xprt *xprt)
985 if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) 1029 if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
986 ib_dealloc_pd(rdma->sc_pd); 1030 ib_dealloc_pd(rdma->sc_pd);
987 1031
988 destroy_context_cache(rdma->sc_ctxt_head); 1032 /* Destroy the CM ID */
1033 rdma_destroy_id(rdma->sc_cm_id);
1034
1035 destroy_context_cache(rdma);
989 kfree(rdma); 1036 kfree(rdma);
990} 1037}
991 1038
1039static void svc_rdma_free(struct svc_xprt *xprt)
1040{
1041 struct svcxprt_rdma *rdma =
1042 container_of(xprt, struct svcxprt_rdma, sc_xprt);
1043 INIT_WORK(&rdma->sc_work, __svc_rdma_free);
1044 schedule_work(&rdma->sc_work);
1045}
1046
992static int svc_rdma_has_wspace(struct svc_xprt *xprt) 1047static int svc_rdma_has_wspace(struct svc_xprt *xprt)
993{ 1048{
994 struct svcxprt_rdma *rdma = 1049 struct svcxprt_rdma *rdma =
@@ -1018,7 +1073,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1018 int ret; 1073 int ret;
1019 1074
1020 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1075 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
1021 return 0; 1076 return -ENOTCONN;
1022 1077
1023 BUG_ON(wr->send_flags != IB_SEND_SIGNALED); 1078 BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
1024 BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != 1079 BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
@@ -1029,7 +1084,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1029 if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { 1084 if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
1030 spin_unlock_bh(&xprt->sc_lock); 1085 spin_unlock_bh(&xprt->sc_lock);
1031 atomic_inc(&rdma_stat_sq_starve); 1086 atomic_inc(&rdma_stat_sq_starve);
1032 /* See if we can reap some SQ WR */ 1087
1088 /* See if we can opportunistically reap SQ WR to make room */
1033 sq_cq_reap(xprt); 1089 sq_cq_reap(xprt);
1034 1090
1035 /* Wait until SQ WR available if SQ still full */ 1091 /* Wait until SQ WR available if SQ still full */
@@ -1041,22 +1097,25 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
1041 continue; 1097 continue;
1042 } 1098 }
1043 /* Bumped used SQ WR count and post */ 1099 /* Bumped used SQ WR count and post */
1100 svc_xprt_get(&xprt->sc_xprt);
1044 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); 1101 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
1045 if (!ret) 1102 if (!ret)
1046 atomic_inc(&xprt->sc_sq_count); 1103 atomic_inc(&xprt->sc_sq_count);
1047 else 1104 else {
1105 svc_xprt_put(&xprt->sc_xprt);
1048 dprintk("svcrdma: failed to post SQ WR rc=%d, " 1106 dprintk("svcrdma: failed to post SQ WR rc=%d, "
1049 "sc_sq_count=%d, sc_sq_depth=%d\n", 1107 "sc_sq_count=%d, sc_sq_depth=%d\n",
1050 ret, atomic_read(&xprt->sc_sq_count), 1108 ret, atomic_read(&xprt->sc_sq_count),
1051 xprt->sc_sq_depth); 1109 xprt->sc_sq_depth);
1110 }
1052 spin_unlock_bh(&xprt->sc_lock); 1111 spin_unlock_bh(&xprt->sc_lock);
1053 break; 1112 break;
1054 } 1113 }
1055 return ret; 1114 return ret;
1056} 1115}
1057 1116
1058int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, 1117void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1059 enum rpcrdma_errcode err) 1118 enum rpcrdma_errcode err)
1060{ 1119{
1061 struct ib_send_wr err_wr; 1120 struct ib_send_wr err_wr;
1062 struct ib_sge sge; 1121 struct ib_sge sge;
@@ -1094,9 +1153,8 @@ int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1094 /* Post It */ 1153 /* Post It */
1095 ret = svc_rdma_send(xprt, &err_wr); 1154 ret = svc_rdma_send(xprt, &err_wr);
1096 if (ret) { 1155 if (ret) {
1097 dprintk("svcrdma: Error posting send = %d\n", ret); 1156 dprintk("svcrdma: Error %d posting send for protocol error\n",
1157 ret);
1098 svc_rdma_put_context(ctxt, 1); 1158 svc_rdma_put_context(ctxt, 1);
1099 } 1159 }
1100
1101 return ret;
1102} 1160}