path: root/net/sunrpc/xprtrdma
author	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-10 18:02:42 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-10 18:02:42 -0400
commit	d1e1cda862c16252087374ac75949b0e89a5717e (patch)
tree	544ce467bed23638949a1991b4f7b00e7472baa4 /net/sunrpc/xprtrdma
parent	07888238f55056605cd23aa4ea3ca97d5e15938f (diff)
parent	a914722f333b3359d2f4f12919380a334176bb89 (diff)
Merge tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

   - massive cleanup of the NFS read/write code by Anna and Dros

   - support multiple NFS read/write requests per page in order to deal
     with non-page aligned pNFS striping.  Also cleans up the r/wsize <
     page size code nicely.

   - stable fix for ensuring inode is declared uptodate only after all
     the attributes have been checked.

   - stable fix for a kernel Oops when remounting

   - NFS over RDMA client fixes

   - move the pNFS files layout driver into its own subdirectory"

* tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (79 commits)
  NFS: populate ->net in mount data when remounting
  pnfs: fix lockup caused by pnfs_generic_pg_test
  NFSv4.1: Fix typo in dprintk
  NFSv4.1: Comment is now wrong and redundant to code
  NFS: Use raw_write_seqcount_begin/end int nfs4_reclaim_open_state
  xprtrdma: Disconnect on registration failure
  xprtrdma: Remove BUG_ON() call sites
  xprtrdma: Avoid deadlock when credit window is reset
  SUNRPC: Move congestion window constants to header file
  xprtrdma: Reset connection timeout after successful reconnect
  xprtrdma: Use macros for reconnection timeout constants
  xprtrdma: Allocate missing pagelist
  xprtrdma: Remove Tavor MTU setting
  xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
  xprtrdma: Reduce the number of hardway buffer allocations
  xprtrdma: Limit work done by completion handler
  xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
  xprtrmda: Reduce lock contention in completion handlers
  xprtrdma: Split the completion queue
  xprtrdma: Make rpcrdma_ep_destroy() return void
  ...
Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r--	net/sunrpc/xprtrdma/rpc_rdma.c	119
-rw-r--r--	net/sunrpc/xprtrdma/transport.c	90
-rw-r--r--	net/sunrpc/xprtrdma/verbs.c	753
-rw-r--r--	net/sunrpc/xprtrdma/xprt_rdma.h	17
4 files changed, 396 insertions, 583 deletions
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 96ead526b125..693966d3f33b 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -78,8 +78,7 @@ static const char transfertypes[][12] = {
78 * elements. Segments are then coalesced when registered, if possible 78 * elements. Segments are then coalesced when registered, if possible
79 * within the selected memreg mode. 79 * within the selected memreg mode.
80 * 80 *
81 * Note, this routine is never called if the connection's memory 81 * Returns positive number of segments converted, or a negative errno.
82 * registration strategy is 0 (bounce buffers).
83 */ 82 */
84 83
85static int 84static int
@@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
102 page_base = xdrbuf->page_base & ~PAGE_MASK; 101 page_base = xdrbuf->page_base & ~PAGE_MASK;
103 p = 0; 102 p = 0;
104 while (len && n < nsegs) { 103 while (len && n < nsegs) {
104 if (!ppages[p]) {
105 /* alloc the pagelist for receiving buffer */
106 ppages[p] = alloc_page(GFP_ATOMIC);
107 if (!ppages[p])
108 return -ENOMEM;
109 }
105 seg[n].mr_page = ppages[p]; 110 seg[n].mr_page = ppages[p];
106 seg[n].mr_offset = (void *)(unsigned long) page_base; 111 seg[n].mr_offset = (void *)(unsigned long) page_base;
107 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); 112 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
108 BUG_ON(seg[n].mr_len > PAGE_SIZE); 113 if (seg[n].mr_len > PAGE_SIZE)
114 return -EIO;
109 len -= seg[n].mr_len; 115 len -= seg[n].mr_len;
110 ++n; 116 ++n;
111 ++p; 117 ++p;
@@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
114 120
115 /* Message overflows the seg array */ 121 /* Message overflows the seg array */
116 if (len && n == nsegs) 122 if (len && n == nsegs)
117 return 0; 123 return -EIO;
118 124
119 if (xdrbuf->tail[0].iov_len) { 125 if (xdrbuf->tail[0].iov_len) {
120 /* the rpcrdma protocol allows us to omit any trailing 126 /* the rpcrdma protocol allows us to omit any trailing
@@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
123 return n; 129 return n;
124 if (n == nsegs) 130 if (n == nsegs)
125 /* Tail remains, but we're out of segments */ 131 /* Tail remains, but we're out of segments */
126 return 0; 132 return -EIO;
127 seg[n].mr_page = NULL; 133 seg[n].mr_page = NULL;
128 seg[n].mr_offset = xdrbuf->tail[0].iov_base; 134 seg[n].mr_offset = xdrbuf->tail[0].iov_base;
129 seg[n].mr_len = xdrbuf->tail[0].iov_len; 135 seg[n].mr_len = xdrbuf->tail[0].iov_len;
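
The hunks above change rpcrdma_convert_iovs() to allocate any page that is still missing from the receive buffer's page list (the "Allocate missing pagelist" fix) and to report failure as a negative errno instead of returning 0. A minimal user-space sketch of that pattern follows; the types and the convert_pages() helper are hypothetical stand-ins, not the kernel code.

#include <errno.h>
#include <stdlib.h>

/* Illustrative stand-ins for the kernel's page list and mr_seg array. */
struct seg {
	void	*page;
	size_t	 len;
};

/*
 * Walk 'len' bytes of a page array, allocating any page that is still
 * missing, and fill at most 'nsegs' segments.  Returns the number of
 * segments converted or a negative errno -- the convention this patch
 * adopts for rpcrdma_convert_iovs().
 */
static int convert_pages(void **pages, size_t len, size_t page_size,
			 struct seg *seg, int nsegs)
{
	int n = 0, p = 0;

	while (len && n < nsegs) {
		if (!pages[p]) {
			pages[p] = malloc(page_size);	/* alloc_page() stand-in */
			if (!pages[p])
				return -ENOMEM;
		}
		seg[n].page = pages[p];
		seg[n].len = len < page_size ? len : page_size;
		len -= seg[n].len;
		++n;
		++p;
	}

	if (len)	/* message overflows the seg array */
		return -EIO;
	return n;
}

Callers such as rpcrdma_create_chunks() can then simply propagate the negative value, which is exactly what the next hunks do.
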
@@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
164 * Reply chunk (a counted array): 170 * Reply chunk (a counted array):
165 * N elements: 171 * N elements:
166 * 1 - N - HLOO - HLOO - ... - HLOO 172 * 1 - N - HLOO - HLOO - ... - HLOO
173 *
174 * Returns positive RPC/RDMA header size, or negative errno.
167 */ 175 */
168 176
169static unsigned int 177static ssize_t
170rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, 178rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
171 struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) 179 struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
172{ 180{
173 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 181 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
174 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 182 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
175 int nsegs, nchunks = 0; 183 int n, nsegs, nchunks = 0;
176 unsigned int pos; 184 unsigned int pos;
177 struct rpcrdma_mr_seg *seg = req->rl_segments; 185 struct rpcrdma_mr_seg *seg = req->rl_segments;
178 struct rpcrdma_read_chunk *cur_rchunk = NULL; 186 struct rpcrdma_read_chunk *cur_rchunk = NULL;
@@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
198 pos = target->head[0].iov_len; 206 pos = target->head[0].iov_len;
199 207
200 nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); 208 nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
201 if (nsegs == 0) 209 if (nsegs < 0)
202 return 0; 210 return nsegs;
203 211
204 do { 212 do {
205 /* bind/register the memory, then build chunk from result. */ 213 n = rpcrdma_register_external(seg, nsegs,
206 int n = rpcrdma_register_external(seg, nsegs,
207 cur_wchunk != NULL, r_xprt); 214 cur_wchunk != NULL, r_xprt);
208 if (n <= 0) 215 if (n <= 0)
209 goto out; 216 goto out;
@@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
248 /* success. all failures return above */ 255 /* success. all failures return above */
249 req->rl_nchunks = nchunks; 256 req->rl_nchunks = nchunks;
250 257
251 BUG_ON(nchunks == 0);
252 BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
253 && (nchunks > 3));
254
255 /* 258 /*
256 * finish off header. If write, marshal discrim and nchunks. 259 * finish off header. If write, marshal discrim and nchunks.
257 */ 260 */
@@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
278out: 281out:
279 for (pos = 0; nchunks--;) 282 for (pos = 0; nchunks--;)
280 pos += rpcrdma_deregister_external( 283 pos += rpcrdma_deregister_external(
281 &req->rl_segments[pos], r_xprt, NULL); 284 &req->rl_segments[pos], r_xprt);
282 return 0; 285 return n;
283} 286}
284 287
285/* 288/*
@@ -361,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
361 * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. 364 * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
362 * [2] -- optional padding. 365 * [2] -- optional padding.
363 * [3] -- if padded, header only in [1] and data here. 366 * [3] -- if padded, header only in [1] and data here.
367 *
368 * Returns zero on success, otherwise a negative errno.
364 */ 369 */
365 370
366int 371int
@@ -370,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
370 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 375 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
371 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 376 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
372 char *base; 377 char *base;
373 size_t hdrlen, rpclen, padlen; 378 size_t rpclen, padlen;
379 ssize_t hdrlen;
374 enum rpcrdma_chunktype rtype, wtype; 380 enum rpcrdma_chunktype rtype, wtype;
375 struct rpcrdma_msg *headerp; 381 struct rpcrdma_msg *headerp;
376 382
@@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
441 /* The following simplification is not true forever */ 447 /* The following simplification is not true forever */
442 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) 448 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
443 wtype = rpcrdma_noch; 449 wtype = rpcrdma_noch;
444 BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); 450 if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
445 451 dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
446 if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && 452 __func__);
447 (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { 453 return -EIO;
448 /* forced to "pure inline"? */
449 dprintk("RPC: %s: too much data (%d/%d) for inline\n",
450 __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
451 return -1;
452 } 454 }
453 455
454 hdrlen = 28; /*sizeof *headerp;*/ 456 hdrlen = 28; /*sizeof *headerp;*/
@@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
474 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 476 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
475 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 477 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
476 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ 478 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
477 BUG_ON(wtype != rpcrdma_noch); 479 if (wtype != rpcrdma_noch) {
478 480 dprintk("RPC: %s: invalid chunk list\n",
481 __func__);
482 return -EIO;
483 }
479 } else { 484 } else {
480 headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; 485 headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
481 headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; 486 headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
@@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
492 * on receive. Therefore, we request a reply chunk 497 * on receive. Therefore, we request a reply chunk
493 * for non-writes wherever feasible and efficient. 498 * for non-writes wherever feasible and efficient.
494 */ 499 */
495 if (wtype == rpcrdma_noch && 500 if (wtype == rpcrdma_noch)
496 r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
497 wtype = rpcrdma_replych; 501 wtype = rpcrdma_replych;
498 } 502 }
499 } 503 }
@@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
511 hdrlen = rpcrdma_create_chunks(rqst, 515 hdrlen = rpcrdma_create_chunks(rqst,
512 &rqst->rq_rcv_buf, headerp, wtype); 516 &rqst->rq_rcv_buf, headerp, wtype);
513 } 517 }
514 518 if (hdrlen < 0)
515 if (hdrlen == 0) 519 return hdrlen;
516 return -1;
517 520
518 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 521 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
519 " headerp 0x%p base 0x%p lkey 0x%x\n", 522 " headerp 0x%p base 0x%p lkey 0x%x\n",
@@ -680,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
680 rqst->rq_private_buf = rqst->rq_rcv_buf; 683 rqst->rq_private_buf = rqst->rq_rcv_buf;
681} 684}
682 685
683/*
684 * This function is called when an async event is posted to
685 * the connection which changes the connection state. All it
686 * does at this point is mark the connection up/down, the rpc
687 * timers do the rest.
688 */
689void 686void
690rpcrdma_conn_func(struct rpcrdma_ep *ep) 687rpcrdma_connect_worker(struct work_struct *work)
691{ 688{
689 struct rpcrdma_ep *ep =
690 container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
692 struct rpc_xprt *xprt = ep->rep_xprt; 691 struct rpc_xprt *xprt = ep->rep_xprt;
693 692
694 spin_lock_bh(&xprt->transport_lock); 693 spin_lock_bh(&xprt->transport_lock);
@@ -705,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
705} 704}
706 705
707/* 706/*
708 * This function is called when memory window unbind which we are waiting 707 * This function is called when an async event is posted to
709 * for completes. Just use rr_func (zeroed by upcall) to signal completion. 708 * the connection which changes the connection state. All it
709 * does at this point is mark the connection up/down, the rpc
710 * timers do the rest.
710 */ 711 */
711static void 712void
712rpcrdma_unbind_func(struct rpcrdma_rep *rep) 713rpcrdma_conn_func(struct rpcrdma_ep *ep)
713{ 714{
714 wake_up(&rep->rr_unbind); 715 schedule_delayed_work(&ep->rep_connect_worker, 0);
715} 716}
716 717
717/* 718/*
@@ -728,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
728 struct rpc_xprt *xprt = rep->rr_xprt; 729 struct rpc_xprt *xprt = rep->rr_xprt;
729 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 730 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
730 __be32 *iptr; 731 __be32 *iptr;
731 int i, rdmalen, status; 732 int rdmalen, status;
733 unsigned long cwnd;
732 734
733 /* Check status. If bad, signal disconnect and return rep to pool */ 735 /* Check status. If bad, signal disconnect and return rep to pool */
734 if (rep->rr_len == ~0U) { 736 if (rep->rr_len == ~0U) {
@@ -783,6 +785,7 @@ repost:
783 785
784 /* from here on, the reply is no longer an orphan */ 786 /* from here on, the reply is no longer an orphan */
785 req->rl_reply = rep; 787 req->rl_reply = rep;
788 xprt->reestablish_timeout = 0;
786 789
787 /* check for expected message types */ 790 /* check for expected message types */
788 /* The order of some of these tests is important. */ 791 /* The order of some of these tests is important. */
@@ -857,26 +860,10 @@ badheader:
857 break; 860 break;
858 } 861 }
859 862
860 /* If using mw bind, start the deregister process now. */ 863 cwnd = xprt->cwnd;
861 /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ 864 xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
862 if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { 865 if (xprt->cwnd > cwnd)
863 case RPCRDMA_MEMWINDOWS: 866 xprt_release_rqst_cong(rqst->rq_task);
864 for (i = 0; req->rl_nchunks-- > 1;)
865 i += rpcrdma_deregister_external(
866 &req->rl_segments[i], r_xprt, NULL);
867 /* Optionally wait (not here) for unbinds to complete */
868 rep->rr_func = rpcrdma_unbind_func;
869 (void) rpcrdma_deregister_external(&req->rl_segments[i],
870 r_xprt, rep);
871 break;
872 case RPCRDMA_MEMWINDOWS_ASYNC:
873 for (i = 0; req->rl_nchunks--;)
874 i += rpcrdma_deregister_external(&req->rl_segments[i],
875 r_xprt, NULL);
876 break;
877 default:
878 break;
879 }
880 867
881 dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", 868 dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
882 __func__, xprt, rqst, status); 869 __func__, xprt, rqst, status);
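
The reply handler now folds the server's advertised credit count straight into the RPC congestion window, and releases congestion-blocked requests when the window grows, replacing the per-transport cwndscale bookkeeping that the removed xprt_rdma_reserve_xprt() used. A rough stand-alone illustration of the scaling; credits_to_cwnd() is hypothetical, and RPC_CWNDSHIFT (8) is the generic sunrpc constant.

#include <stdio.h>

#define RPC_CWNDSHIFT 8		/* generic sunrpc congestion-window shift */

/* One server credit corresponds to one full congestion-window unit. */
static unsigned long credits_to_cwnd(unsigned int credits)
{
	if (credits == 0)
		credits = 1;	/* never let the window collapse entirely */
	return (unsigned long)credits << RPC_CWNDSHIFT;
}

int main(void)
{
	unsigned int c;

	for (c = 0; c <= 32; c += 8)
		printf("credits %2u -> cwnd %lu\n", c, credits_to_cwnd(c));
	return 0;
}

Because the window now tracks the credits the server last advertised, a server that shrinks its credit window no longer leaves the client stuck with a stale, oversized cwnd (the "Avoid deadlock when credit window is reset" commit).
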
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 1eb9c468d0c9..66f91f0d071a 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = {
149 149
150#endif 150#endif
151 151
152#define RPCRDMA_BIND_TO (60U * HZ)
153#define RPCRDMA_INIT_REEST_TO (5U * HZ)
154#define RPCRDMA_MAX_REEST_TO (30U * HZ)
155#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
156
152static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ 157static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
153 158
154static void 159static void
@@ -229,7 +234,6 @@ static void
229xprt_rdma_destroy(struct rpc_xprt *xprt) 234xprt_rdma_destroy(struct rpc_xprt *xprt)
230{ 235{
231 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 236 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
232 int rc;
233 237
234 dprintk("RPC: %s: called\n", __func__); 238 dprintk("RPC: %s: called\n", __func__);
235 239
@@ -238,10 +242,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
238 xprt_clear_connected(xprt); 242 xprt_clear_connected(xprt);
239 243
240 rpcrdma_buffer_destroy(&r_xprt->rx_buf); 244 rpcrdma_buffer_destroy(&r_xprt->rx_buf);
241 rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); 245 rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
242 if (rc)
243 dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
244 __func__, rc);
245 rpcrdma_ia_close(&r_xprt->rx_ia); 246 rpcrdma_ia_close(&r_xprt->rx_ia);
246 247
247 xprt_rdma_free_addresses(xprt); 248 xprt_rdma_free_addresses(xprt);
@@ -289,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args)
289 290
290 /* 60 second timeout, no retries */ 291 /* 60 second timeout, no retries */
291 xprt->timeout = &xprt_rdma_default_timeout; 292 xprt->timeout = &xprt_rdma_default_timeout;
292 xprt->bind_timeout = (60U * HZ); 293 xprt->bind_timeout = RPCRDMA_BIND_TO;
293 xprt->reestablish_timeout = (5U * HZ); 294 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
294 xprt->idle_timeout = (5U * 60 * HZ); 295 xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
295 296
296 xprt->resvport = 0; /* privileged port not needed */ 297 xprt->resvport = 0; /* privileged port not needed */
297 xprt->tsh_size = 0; /* RPC-RDMA handles framing */ 298 xprt->tsh_size = 0; /* RPC-RDMA handles framing */
@@ -391,7 +392,7 @@ out4:
391 xprt_rdma_free_addresses(xprt); 392 xprt_rdma_free_addresses(xprt);
392 rc = -EINVAL; 393 rc = -EINVAL;
393out3: 394out3:
394 (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); 395 rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
395out2: 396out2:
396 rpcrdma_ia_close(&new_xprt->rx_ia); 397 rpcrdma_ia_close(&new_xprt->rx_ia);
397out1: 398out1:
@@ -436,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
436 schedule_delayed_work(&r_xprt->rdma_connect, 437 schedule_delayed_work(&r_xprt->rdma_connect,
437 xprt->reestablish_timeout); 438 xprt->reestablish_timeout);
438 xprt->reestablish_timeout <<= 1; 439 xprt->reestablish_timeout <<= 1;
439 if (xprt->reestablish_timeout > (30 * HZ)) 440 if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
440 xprt->reestablish_timeout = (30 * HZ); 441 xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
441 else if (xprt->reestablish_timeout < (5 * HZ)) 442 else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
442 xprt->reestablish_timeout = (5 * HZ); 443 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
443 } else { 444 } else {
444 schedule_delayed_work(&r_xprt->rdma_connect, 0); 445 schedule_delayed_work(&r_xprt->rdma_connect, 0);
445 if (!RPC_IS_ASYNC(task)) 446 if (!RPC_IS_ASYNC(task))
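
With the named constants in place, the reconnect delay still doubles after each attempt and is clamped to the [RPCRDMA_INIT_REEST_TO, RPCRDMA_MAX_REEST_TO] range. A tiny sketch of the resulting schedule; HZ is assumed to be 1000 here purely so the printout reads as milliseconds.

#include <stdio.h>

#define HZ			1000		/* assumed for illustration only */
#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
#define RPCRDMA_MAX_REEST_TO	(30U * HZ)

int main(void)
{
	unsigned long to = RPCRDMA_INIT_REEST_TO;
	int attempt;

	for (attempt = 1; attempt <= 6; attempt++) {
		printf("attempt %d: wait %lu ms\n", attempt, to);
		to <<= 1;
		if (to > RPCRDMA_MAX_REEST_TO)
			to = RPCRDMA_MAX_REEST_TO;
		else if (to < RPCRDMA_INIT_REEST_TO)
			to = RPCRDMA_INIT_REEST_TO;
	}
	return 0;
}
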
@@ -447,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
447 } 448 }
448} 449}
449 450
450static int
451xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
452{
453 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
454 int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
455
456 /* == RPC_CWNDSCALE @ init, but *after* setup */
457 if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
458 r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
459 dprintk("RPC: %s: cwndscale %lu\n", __func__,
460 r_xprt->rx_buf.rb_cwndscale);
461 BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
462 }
463 xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
464 return xprt_reserve_xprt_cong(xprt, task);
465}
466
467/* 451/*
468 * The RDMA allocate/free functions need the task structure as a place 452 * The RDMA allocate/free functions need the task structure as a place
469 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv 453 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
@@ -479,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
479 struct rpcrdma_req *req, *nreq; 463 struct rpcrdma_req *req, *nreq;
480 464
481 req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); 465 req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
482 BUG_ON(NULL == req); 466 if (req == NULL)
467 return NULL;
483 468
484 if (size > req->rl_size) { 469 if (size > req->rl_size) {
485 dprintk("RPC: %s: size %zd too large for buffer[%zd]: " 470 dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
@@ -503,18 +488,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
503 * If the allocation or registration fails, the RPC framework 488 * If the allocation or registration fails, the RPC framework
504 * will (doggedly) retry. 489 * will (doggedly) retry.
505 */ 490 */
506 if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
507 RPCRDMA_BOUNCEBUFFERS) {
508 /* forced to "pure inline" */
509 dprintk("RPC: %s: too much data (%zd) for inline "
510 "(r/w max %d/%d)\n", __func__, size,
511 rpcx_to_rdmad(xprt).inline_rsize,
512 rpcx_to_rdmad(xprt).inline_wsize);
513 size = req->rl_size;
514 rpc_exit(task, -EIO); /* fail the operation */
515 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
516 goto out;
517 }
518 if (task->tk_flags & RPC_TASK_SWAPPER) 491 if (task->tk_flags & RPC_TASK_SWAPPER)
519 nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); 492 nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
520 else 493 else
@@ -543,7 +516,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
543 req = nreq; 516 req = nreq;
544 } 517 }
545 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); 518 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
546out:
547 req->rl_connect_cookie = 0; /* our reserved value */ 519 req->rl_connect_cookie = 0; /* our reserved value */
548 return req->rl_xdr_buf; 520 return req->rl_xdr_buf;
549 521
@@ -579,9 +551,7 @@ xprt_rdma_free(void *buffer)
579 __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); 551 __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
580 552
581 /* 553 /*
582 * Finish the deregistration. When using mw bind, this was 554 * Finish the deregistration. The process is considered
583 * begun in rpcrdma_reply_handler(). In all other modes, we
584 * do it here, in thread context. The process is considered
585 * complete when the rr_func vector becomes NULL - this 555 * complete when the rr_func vector becomes NULL - this
586 * was put in place during rpcrdma_reply_handler() - the wait 556 * was put in place during rpcrdma_reply_handler() - the wait
587 * call below will not block if the dereg is "done". If 557 * call below will not block if the dereg is "done". If
@@ -590,12 +560,7 @@ xprt_rdma_free(void *buffer)
590 for (i = 0; req->rl_nchunks;) { 560 for (i = 0; req->rl_nchunks;) {
591 --req->rl_nchunks; 561 --req->rl_nchunks;
592 i += rpcrdma_deregister_external( 562 i += rpcrdma_deregister_external(
593 &req->rl_segments[i], r_xprt, NULL); 563 &req->rl_segments[i], r_xprt);
594 }
595
596 if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
597 rep->rr_func = NULL; /* abandon the callback */
598 req->rl_reply = NULL;
599 } 564 }
600 565
601 if (req->rl_iov.length == 0) { /* see allocate above */ 566 if (req->rl_iov.length == 0) { /* see allocate above */
@@ -630,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task)
630 struct rpc_xprt *xprt = rqst->rq_xprt; 595 struct rpc_xprt *xprt = rqst->rq_xprt;
631 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 596 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
632 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 597 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
598 int rc;
633 599
634 /* marshal the send itself */ 600 if (req->rl_niovs == 0) {
635 if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { 601 rc = rpcrdma_marshal_req(rqst);
636 r_xprt->rx_stats.failed_marshal_count++; 602 if (rc < 0)
637 dprintk("RPC: %s: rpcrdma_marshal_req failed\n", 603 goto failed_marshal;
638 __func__);
639 return -EIO;
640 } 604 }
641 605
642 if (req->rl_reply == NULL) /* e.g. reconnection */ 606 if (req->rl_reply == NULL) /* e.g. reconnection */
@@ -660,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task)
660 rqst->rq_bytes_sent = 0; 624 rqst->rq_bytes_sent = 0;
661 return 0; 625 return 0;
662 626
627failed_marshal:
628 r_xprt->rx_stats.failed_marshal_count++;
629 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
630 __func__, rc);
631 if (rc == -EIO)
632 return -EIO;
663drop_connection: 633drop_connection:
664 xprt_disconnect_done(xprt); 634 xprt_disconnect_done(xprt);
665 return -ENOTCONN; /* implies disconnect */ 635 return -ENOTCONN; /* implies disconnect */
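
The new failed_marshal path distinguishes permanent marshaling errors from transport trouble: -EIO is handed back to the RPC layer so the request fails, while any other error falls through to drop_connection and forces a reconnect. A hypothetical helper expressing that decision, not the kernel function itself:

#include <errno.h>

/*
 * Map a marshaling result onto the transport's send_request contract:
 * 0 on success, -EIO for a permanent request error, -ENOTCONN to force
 * a reconnect for anything else (e.g. -ENOMEM while building chunks).
 */
static int classify_marshal_error(int rc)
{
	if (rc == 0)
		return 0;
	if (rc == -EIO)
		return -EIO;		/* fail this RPC, keep the connection */
	return -ENOTCONN;		/* drop and re-establish the connection */
}
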
@@ -705,7 +675,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
705 */ 675 */
706 676
707static struct rpc_xprt_ops xprt_rdma_procs = { 677static struct rpc_xprt_ops xprt_rdma_procs = {
708 .reserve_xprt = xprt_rdma_reserve_xprt, 678 .reserve_xprt = xprt_reserve_xprt_cong,
709 .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ 679 .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
710 .alloc_slot = xprt_alloc_slot, 680 .alloc_slot = xprt_alloc_slot,
711 .release_request = xprt_release_rqst_cong, /* ditto */ 681 .release_request = xprt_release_rqst_cong, /* ditto */
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 93726560eaa8..13dbd1c389ff 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -48,8 +48,8 @@
48 */ 48 */
49 49
50#include <linux/interrupt.h> 50#include <linux/interrupt.h>
51#include <linux/pci.h> /* for Tavor hack below */
52#include <linux/slab.h> 51#include <linux/slab.h>
52#include <asm/bitops.h>
53 53
54#include "xprt_rdma.h" 54#include "xprt_rdma.h"
55 55
@@ -142,98 +142,139 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
142 } 142 }
143} 143}
144 144
145static inline 145static void
146void rpcrdma_event_process(struct ib_wc *wc) 146rpcrdma_sendcq_process_wc(struct ib_wc *wc)
147{ 147{
148 struct rpcrdma_mw *frmr; 148 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
149 struct rpcrdma_rep *rep =
150 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
151 149
152 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", 150 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
153 __func__, rep, wc->status, wc->opcode, wc->byte_len); 151 __func__, frmr, wc->status, wc->opcode);
154 152
155 if (!rep) /* send or bind completion that we don't care about */ 153 if (wc->wr_id == 0ULL)
156 return; 154 return;
157 155 if (wc->status != IB_WC_SUCCESS)
158 if (IB_WC_SUCCESS != wc->status) {
159 dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
160 __func__, wc->opcode, wc->status);
161 rep->rr_len = ~0U;
162 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
163 rpcrdma_schedule_tasklet(rep);
164 return; 156 return;
165 }
166 157
167 switch (wc->opcode) { 158 if (wc->opcode == IB_WC_FAST_REG_MR)
168 case IB_WC_FAST_REG_MR:
169 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
170 frmr->r.frmr.state = FRMR_IS_VALID; 159 frmr->r.frmr.state = FRMR_IS_VALID;
171 break; 160 else if (wc->opcode == IB_WC_LOCAL_INV)
172 case IB_WC_LOCAL_INV:
173 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
174 frmr->r.frmr.state = FRMR_IS_INVALID; 161 frmr->r.frmr.state = FRMR_IS_INVALID;
175 break;
176 case IB_WC_RECV:
177 rep->rr_len = wc->byte_len;
178 ib_dma_sync_single_for_cpu(
179 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
180 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
181 /* Keep (only) the most recent credits, after check validity */
182 if (rep->rr_len >= 16) {
183 struct rpcrdma_msg *p =
184 (struct rpcrdma_msg *) rep->rr_base;
185 unsigned int credits = ntohl(p->rm_credit);
186 if (credits == 0) {
187 dprintk("RPC: %s: server"
188 " dropped credits to 0!\n", __func__);
189 /* don't deadlock */
190 credits = 1;
191 } else if (credits > rep->rr_buffer->rb_max_requests) {
192 dprintk("RPC: %s: server"
193 " over-crediting: %d (%d)\n",
194 __func__, credits,
195 rep->rr_buffer->rb_max_requests);
196 credits = rep->rr_buffer->rb_max_requests;
197 }
198 atomic_set(&rep->rr_buffer->rb_credits, credits);
199 }
200 /* fall through */
201 case IB_WC_BIND_MW:
202 rpcrdma_schedule_tasklet(rep);
203 break;
204 default:
205 dprintk("RPC: %s: unexpected WC event %X\n",
206 __func__, wc->opcode);
207 break;
208 }
209} 162}
210 163
211static inline int 164static int
212rpcrdma_cq_poll(struct ib_cq *cq) 165rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
213{ 166{
214 struct ib_wc wc; 167 struct ib_wc *wcs;
215 int rc; 168 int budget, count, rc;
216 169
217 for (;;) { 170 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
218 rc = ib_poll_cq(cq, 1, &wc); 171 do {
219 if (rc < 0) { 172 wcs = ep->rep_send_wcs;
220 dprintk("RPC: %s: ib_poll_cq failed %i\n", 173
221 __func__, rc); 174 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
175 if (rc <= 0)
222 return rc; 176 return rc;
223 }
224 if (rc == 0)
225 break;
226 177
227 rpcrdma_event_process(&wc); 178 count = rc;
179 while (count-- > 0)
180 rpcrdma_sendcq_process_wc(wcs++);
181 } while (rc == RPCRDMA_POLLSIZE && --budget);
182 return 0;
183}
184
185/*
186 * Handle send, fast_reg_mr, and local_inv completions.
187 *
188 * Send events are typically suppressed and thus do not result
189 * in an upcall. Occasionally one is signaled, however. This
190 * prevents the provider's completion queue from wrapping and
191 * losing a completion.
192 */
193static void
194rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
195{
196 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
197 int rc;
198
199 rc = rpcrdma_sendcq_poll(cq, ep);
200 if (rc) {
201 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
202 __func__, rc);
203 return;
228 } 204 }
229 205
206 rc = ib_req_notify_cq(cq,
207 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
208 if (rc == 0)
209 return;
210 if (rc < 0) {
211 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
212 __func__, rc);
213 return;
214 }
215
216 rpcrdma_sendcq_poll(cq, ep);
217}
218
219static void
220rpcrdma_recvcq_process_wc(struct ib_wc *wc)
221{
222 struct rpcrdma_rep *rep =
223 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
224
225 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
226 __func__, rep, wc->status, wc->opcode, wc->byte_len);
227
228 if (wc->status != IB_WC_SUCCESS) {
229 rep->rr_len = ~0U;
230 goto out_schedule;
231 }
232 if (wc->opcode != IB_WC_RECV)
233 return;
234
235 rep->rr_len = wc->byte_len;
236 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
237 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
238
239 if (rep->rr_len >= 16) {
240 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
241 unsigned int credits = ntohl(p->rm_credit);
242
243 if (credits == 0)
244 credits = 1; /* don't deadlock */
245 else if (credits > rep->rr_buffer->rb_max_requests)
246 credits = rep->rr_buffer->rb_max_requests;
247 atomic_set(&rep->rr_buffer->rb_credits, credits);
248 }
249
250out_schedule:
251 rpcrdma_schedule_tasklet(rep);
252}
253
254static int
255rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
256{
257 struct ib_wc *wcs;
258 int budget, count, rc;
259
260 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
261 do {
262 wcs = ep->rep_recv_wcs;
263
264 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
265 if (rc <= 0)
266 return rc;
267
268 count = rc;
269 while (count-- > 0)
270 rpcrdma_recvcq_process_wc(wcs++);
271 } while (rc == RPCRDMA_POLLSIZE && --budget);
230 return 0; 272 return 0;
231} 273}
232 274
233/* 275/*
234 * rpcrdma_cq_event_upcall 276 * Handle receive completions.
235 * 277 *
236 * This upcall handles recv, send, bind and unbind events.
237 * It is reentrant but processes single events in order to maintain 278 * It is reentrant but processes single events in order to maintain
238 * ordering of receives to keep server credits. 279 * ordering of receives to keep server credits.
239 * 280 *
@@ -242,26 +283,31 @@ rpcrdma_cq_poll(struct ib_cq *cq)
242 * connection shutdown. That is, the structures required for 283 * connection shutdown. That is, the structures required for
243 * the completion of the reply handler must remain intact until 284 * the completion of the reply handler must remain intact until
244 * all memory has been reclaimed. 285 * all memory has been reclaimed.
245 *
246 * Note that send events are suppressed and do not result in an upcall.
247 */ 286 */
248static void 287static void
249rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) 288rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
250{ 289{
290 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
251 int rc; 291 int rc;
252 292
253 rc = rpcrdma_cq_poll(cq); 293 rc = rpcrdma_recvcq_poll(cq, ep);
254 if (rc) 294 if (rc) {
295 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
296 __func__, rc);
255 return; 297 return;
298 }
256 299
257 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); 300 rc = ib_req_notify_cq(cq,
258 if (rc) { 301 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
259 dprintk("RPC: %s: ib_req_notify_cq failed %i\n", 302 if (rc == 0)
303 return;
304 if (rc < 0) {
305 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
260 __func__, rc); 306 __func__, rc);
261 return; 307 return;
262 } 308 }
263 309
264 rpcrdma_cq_poll(cq); 310 rpcrdma_recvcq_poll(cq, ep);
265} 311}
266 312
267#ifdef RPC_DEBUG 313#ifdef RPC_DEBUG
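
Both new upcalls drain their completion queue in batches of RPCRDMA_POLLSIZE work completions and stop after an overall budget of RPCRDMA_WC_BUDGET, re-arming the CQ with IB_CQ_REPORT_MISSED_EVENTS so anything left over triggers another upcall. The stand-alone sketch below models only the batching/budget loop; the constant values and the fake_poll_cq()/drain_cq() helpers are illustrative, not the kernel or verbs API.

#include <stdio.h>

#define POLLSIZE	16	/* stand-in for RPCRDMA_POLLSIZE */
#define WC_BUDGET	128	/* stand-in for RPCRDMA_WC_BUDGET */

struct wc { int id; };

/* Stub: pretend the CQ currently holds '*remaining' completions. */
static int fake_poll_cq(int *remaining, struct wc *wcs, int num)
{
	int n = *remaining < num ? *remaining : num;

	(void)wcs;
	*remaining -= n;
	return n;
}

/* Drain in POLLSIZE batches, but never more than WC_BUDGET per upcall. */
static int drain_cq(int *remaining, void (*process)(struct wc *))
{
	struct wc wcs[POLLSIZE], *w;
	int budget, count, rc;

	budget = WC_BUDGET / POLLSIZE;
	do {
		rc = fake_poll_cq(remaining, wcs, POLLSIZE);
		if (rc <= 0)
			return rc;
		for (w = wcs, count = rc; count-- > 0; w++)
			process(w);
	} while (rc == POLLSIZE && --budget);
	return 0;
}

static void process_one(struct wc *wc) { (void)wc; }

int main(void)
{
	int backlog = 1000;

	drain_cq(&backlog, process_one);
	printf("completions left for the next upcall: %d\n", backlog);
	return 0;
}

Capping the work done per upcall keeps a flooded CQ from monopolizing the completion context, which is the point of the "Limit work done by completion handler" commit.
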
@@ -493,54 +539,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
493 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; 539 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
494 } 540 }
495 541
496 switch (memreg) { 542 if (memreg == RPCRDMA_FRMR) {
497 case RPCRDMA_MEMWINDOWS:
498 case RPCRDMA_MEMWINDOWS_ASYNC:
499 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
500 dprintk("RPC: %s: MEMWINDOWS registration "
501 "specified but not supported by adapter, "
502 "using slower RPCRDMA_REGISTER\n",
503 __func__);
504 memreg = RPCRDMA_REGISTER;
505 }
506 break;
507 case RPCRDMA_MTHCAFMR:
508 if (!ia->ri_id->device->alloc_fmr) {
509#if RPCRDMA_PERSISTENT_REGISTRATION
510 dprintk("RPC: %s: MTHCAFMR registration "
511 "specified but not supported by adapter, "
512 "using riskier RPCRDMA_ALLPHYSICAL\n",
513 __func__);
514 memreg = RPCRDMA_ALLPHYSICAL;
515#else
516 dprintk("RPC: %s: MTHCAFMR registration "
517 "specified but not supported by adapter, "
518 "using slower RPCRDMA_REGISTER\n",
519 __func__);
520 memreg = RPCRDMA_REGISTER;
521#endif
522 }
523 break;
524 case RPCRDMA_FRMR:
525 /* Requires both frmr reg and local dma lkey */ 543 /* Requires both frmr reg and local dma lkey */
526 if ((devattr.device_cap_flags & 544 if ((devattr.device_cap_flags &
527 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != 545 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
528 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { 546 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
529#if RPCRDMA_PERSISTENT_REGISTRATION
530 dprintk("RPC: %s: FRMR registration " 547 dprintk("RPC: %s: FRMR registration "
531 "specified but not supported by adapter, " 548 "not supported by HCA\n", __func__);
532 "using riskier RPCRDMA_ALLPHYSICAL\n", 549 memreg = RPCRDMA_MTHCAFMR;
533 __func__); 550 } else {
551 /* Mind the ia limit on FRMR page list depth */
552 ia->ri_max_frmr_depth = min_t(unsigned int,
553 RPCRDMA_MAX_DATA_SEGS,
554 devattr.max_fast_reg_page_list_len);
555 }
556 }
557 if (memreg == RPCRDMA_MTHCAFMR) {
558 if (!ia->ri_id->device->alloc_fmr) {
559 dprintk("RPC: %s: MTHCAFMR registration "
560 "not supported by HCA\n", __func__);
561#if RPCRDMA_PERSISTENT_REGISTRATION
534 memreg = RPCRDMA_ALLPHYSICAL; 562 memreg = RPCRDMA_ALLPHYSICAL;
535#else 563#else
536 dprintk("RPC: %s: FRMR registration " 564 rc = -ENOMEM;
537 "specified but not supported by adapter, " 565 goto out2;
538 "using slower RPCRDMA_REGISTER\n",
539 __func__);
540 memreg = RPCRDMA_REGISTER;
541#endif 566#endif
542 } 567 }
543 break;
544 } 568 }
545 569
546 /* 570 /*
@@ -552,8 +576,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
552 * adapter. 576 * adapter.
553 */ 577 */
554 switch (memreg) { 578 switch (memreg) {
555 case RPCRDMA_BOUNCEBUFFERS:
556 case RPCRDMA_REGISTER:
557 case RPCRDMA_FRMR: 579 case RPCRDMA_FRMR:
558 break; 580 break;
559#if RPCRDMA_PERSISTENT_REGISTRATION 581#if RPCRDMA_PERSISTENT_REGISTRATION
@@ -563,30 +585,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
563 IB_ACCESS_REMOTE_READ; 585 IB_ACCESS_REMOTE_READ;
564 goto register_setup; 586 goto register_setup;
565#endif 587#endif
566 case RPCRDMA_MEMWINDOWS_ASYNC:
567 case RPCRDMA_MEMWINDOWS:
568 mem_priv = IB_ACCESS_LOCAL_WRITE |
569 IB_ACCESS_MW_BIND;
570 goto register_setup;
571 case RPCRDMA_MTHCAFMR: 588 case RPCRDMA_MTHCAFMR:
572 if (ia->ri_have_dma_lkey) 589 if (ia->ri_have_dma_lkey)
573 break; 590 break;
574 mem_priv = IB_ACCESS_LOCAL_WRITE; 591 mem_priv = IB_ACCESS_LOCAL_WRITE;
592#if RPCRDMA_PERSISTENT_REGISTRATION
575 register_setup: 593 register_setup:
594#endif
576 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); 595 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
577 if (IS_ERR(ia->ri_bind_mem)) { 596 if (IS_ERR(ia->ri_bind_mem)) {
578 printk(KERN_ALERT "%s: ib_get_dma_mr for " 597 printk(KERN_ALERT "%s: ib_get_dma_mr for "
579 "phys register failed with %lX\n\t" 598 "phys register failed with %lX\n",
580 "Will continue with degraded performance\n",
581 __func__, PTR_ERR(ia->ri_bind_mem)); 599 __func__, PTR_ERR(ia->ri_bind_mem));
582 memreg = RPCRDMA_REGISTER; 600 rc = -ENOMEM;
583 ia->ri_bind_mem = NULL; 601 goto out2;
584 } 602 }
585 break; 603 break;
586 default: 604 default:
587 printk(KERN_ERR "%s: invalid memory registration mode %d\n", 605 printk(KERN_ERR "RPC: Unsupported memory "
588 __func__, memreg); 606 "registration mode: %d\n", memreg);
589 rc = -EINVAL; 607 rc = -ENOMEM;
590 goto out2; 608 goto out2;
591 } 609 }
592 dprintk("RPC: %s: memory registration strategy is %d\n", 610 dprintk("RPC: %s: memory registration strategy is %d\n",
@@ -640,6 +658,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
640 struct rpcrdma_create_data_internal *cdata) 658 struct rpcrdma_create_data_internal *cdata)
641{ 659{
642 struct ib_device_attr devattr; 660 struct ib_device_attr devattr;
661 struct ib_cq *sendcq, *recvcq;
643 int rc, err; 662 int rc, err;
644 663
645 rc = ib_query_device(ia->ri_id->device, &devattr); 664 rc = ib_query_device(ia->ri_id->device, &devattr);
@@ -659,32 +678,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
659 ep->rep_attr.srq = NULL; 678 ep->rep_attr.srq = NULL;
660 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 679 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
661 switch (ia->ri_memreg_strategy) { 680 switch (ia->ri_memreg_strategy) {
662 case RPCRDMA_FRMR: 681 case RPCRDMA_FRMR: {
682 int depth = 7;
683
663 /* Add room for frmr register and invalidate WRs. 684 /* Add room for frmr register and invalidate WRs.
664 * 1. FRMR reg WR for head 685 * 1. FRMR reg WR for head
665 * 2. FRMR invalidate WR for head 686 * 2. FRMR invalidate WR for head
666 * 3. FRMR reg WR for pagelist 687 * 3. N FRMR reg WRs for pagelist
667 * 4. FRMR invalidate WR for pagelist 688 * 4. N FRMR invalidate WRs for pagelist
668 * 5. FRMR reg WR for tail 689 * 5. FRMR reg WR for tail
669 * 6. FRMR invalidate WR for tail 690 * 6. FRMR invalidate WR for tail
670 * 7. The RDMA_SEND WR 691 * 7. The RDMA_SEND WR
671 */ 692 */
672 ep->rep_attr.cap.max_send_wr *= 7; 693
694 /* Calculate N if the device max FRMR depth is smaller than
695 * RPCRDMA_MAX_DATA_SEGS.
696 */
697 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
698 int delta = RPCRDMA_MAX_DATA_SEGS -
699 ia->ri_max_frmr_depth;
700
701 do {
702 depth += 2; /* FRMR reg + invalidate */
703 delta -= ia->ri_max_frmr_depth;
704 } while (delta > 0);
705
706 }
707 ep->rep_attr.cap.max_send_wr *= depth;
673 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { 708 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
674 cdata->max_requests = devattr.max_qp_wr / 7; 709 cdata->max_requests = devattr.max_qp_wr / depth;
675 if (!cdata->max_requests) 710 if (!cdata->max_requests)
676 return -EINVAL; 711 return -EINVAL;
677 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; 712 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
713 depth;
678 } 714 }
679 break; 715 break;
680 case RPCRDMA_MEMWINDOWS_ASYNC: 716 }
681 case RPCRDMA_MEMWINDOWS:
682 /* Add room for mw_binds+unbinds - overkill! */
683 ep->rep_attr.cap.max_send_wr++;
684 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
685 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
686 return -EINVAL;
687 break;
688 default: 717 default:
689 break; 718 break;
690 } 719 }
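
A single FRMR can map at most ia->ri_max_frmr_depth pages, so when that depth is smaller than RPCRDMA_MAX_DATA_SEGS the page list needs extra register/invalidate WR pairs and the per-request send-queue multiplier is no longer a flat 7. A runnable sketch of the same arithmetic; RPCRDMA_MAX_DATA_SEGS is assumed to be 64 here and frmr_send_depth() is a hypothetical name.

#include <stdio.h>

#define RPCRDMA_MAX_DATA_SEGS 64	/* assumed value, see xprt_rdma.h */

/*
 * Send-queue multiplier for one RPC when using FRMR: the base of 7
 * covers reg/inv pairs for head, page list, and tail plus the
 * RDMA_SEND, and 2 more WRs are added for every additional FRMR the
 * page list needs.
 */
static int frmr_send_depth(int max_frmr_depth)
{
	int depth = 7;

	if (max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
		int delta = RPCRDMA_MAX_DATA_SEGS - max_frmr_depth;

		do {
			depth += 2;	/* one more FRMR reg + invalidate */
			delta -= max_frmr_depth;
		} while (delta > 0);
	}
	return depth;
}

int main(void)
{
	int d;

	for (d = 16; d <= 128; d *= 2)
		printf("max_frmr_depth %3d -> %2d send WRs per request\n",
		       d, frmr_send_depth(d));
	return 0;
}

With a device depth of 32, for example, the 64-segment page list needs two FRMRs, so each request reserves 9 send WRs instead of 7, and cdata->max_requests is scaled down accordingly.
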
@@ -705,46 +734,51 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
705 ep->rep_attr.cap.max_recv_sge); 734 ep->rep_attr.cap.max_recv_sge);
706 735
707 /* set trigger for requesting send completion */ 736 /* set trigger for requesting send completion */
708 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; 737 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
709 switch (ia->ri_memreg_strategy) {
710 case RPCRDMA_MEMWINDOWS_ASYNC:
711 case RPCRDMA_MEMWINDOWS:
712 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
713 break;
714 default:
715 break;
716 }
717 if (ep->rep_cqinit <= 2) 738 if (ep->rep_cqinit <= 2)
718 ep->rep_cqinit = 0; 739 ep->rep_cqinit = 0;
719 INIT_CQCOUNT(ep); 740 INIT_CQCOUNT(ep);
720 ep->rep_ia = ia; 741 ep->rep_ia = ia;
721 init_waitqueue_head(&ep->rep_connect_wait); 742 init_waitqueue_head(&ep->rep_connect_wait);
743 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
722 744
723 /* 745 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
724 * Create a single cq for receive dto and mw_bind (only ever 746 rpcrdma_cq_async_error_upcall, ep,
725 * care about unbind, really). Send completions are suppressed.
726 * Use single threaded tasklet upcalls to maintain ordering.
727 */
728 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
729 rpcrdma_cq_async_error_upcall, NULL,
730 ep->rep_attr.cap.max_recv_wr +
731 ep->rep_attr.cap.max_send_wr + 1, 0); 747 ep->rep_attr.cap.max_send_wr + 1, 0);
732 if (IS_ERR(ep->rep_cq)) { 748 if (IS_ERR(sendcq)) {
733 rc = PTR_ERR(ep->rep_cq); 749 rc = PTR_ERR(sendcq);
734 dprintk("RPC: %s: ib_create_cq failed: %i\n", 750 dprintk("RPC: %s: failed to create send CQ: %i\n",
735 __func__, rc); 751 __func__, rc);
736 goto out1; 752 goto out1;
737 } 753 }
738 754
739 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); 755 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
756 if (rc) {
757 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
758 __func__, rc);
759 goto out2;
760 }
761
762 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
763 rpcrdma_cq_async_error_upcall, ep,
764 ep->rep_attr.cap.max_recv_wr + 1, 0);
765 if (IS_ERR(recvcq)) {
766 rc = PTR_ERR(recvcq);
767 dprintk("RPC: %s: failed to create recv CQ: %i\n",
768 __func__, rc);
769 goto out2;
770 }
771
772 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
740 if (rc) { 773 if (rc) {
741 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", 774 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
742 __func__, rc); 775 __func__, rc);
776 ib_destroy_cq(recvcq);
743 goto out2; 777 goto out2;
744 } 778 }
745 779
746 ep->rep_attr.send_cq = ep->rep_cq; 780 ep->rep_attr.send_cq = sendcq;
747 ep->rep_attr.recv_cq = ep->rep_cq; 781 ep->rep_attr.recv_cq = recvcq;
748 782
749 /* Initialize cma parameters */ 783 /* Initialize cma parameters */
750 784
@@ -754,9 +788,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
754 788
755 /* Client offers RDMA Read but does not initiate */ 789 /* Client offers RDMA Read but does not initiate */
756 ep->rep_remote_cma.initiator_depth = 0; 790 ep->rep_remote_cma.initiator_depth = 0;
757 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) 791 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
758 ep->rep_remote_cma.responder_resources = 0;
759 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
760 ep->rep_remote_cma.responder_resources = 32; 792 ep->rep_remote_cma.responder_resources = 32;
761 else 793 else
762 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; 794 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
@@ -768,7 +800,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
768 return 0; 800 return 0;
769 801
770out2: 802out2:
771 err = ib_destroy_cq(ep->rep_cq); 803 err = ib_destroy_cq(sendcq);
772 if (err) 804 if (err)
773 dprintk("RPC: %s: ib_destroy_cq returned %i\n", 805 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
774 __func__, err); 806 __func__, err);
@@ -782,11 +814,8 @@ out1:
782 * Disconnect and destroy endpoint. After this, the only 814 * Disconnect and destroy endpoint. After this, the only
783 * valid operations on the ep are to free it (if dynamically 815 * valid operations on the ep are to free it (if dynamically
784 * allocated) or re-create it. 816 * allocated) or re-create it.
785 *
786 * The caller's error handling must be sure to not leak the endpoint
787 * if this function fails.
788 */ 817 */
789int 818void
790rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 819rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
791{ 820{
792 int rc; 821 int rc;
@@ -794,6 +823,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
794 dprintk("RPC: %s: entering, connected is %d\n", 823 dprintk("RPC: %s: entering, connected is %d\n",
795 __func__, ep->rep_connected); 824 __func__, ep->rep_connected);
796 825
826 cancel_delayed_work_sync(&ep->rep_connect_worker);
827
797 if (ia->ri_id->qp) { 828 if (ia->ri_id->qp) {
798 rc = rpcrdma_ep_disconnect(ep, ia); 829 rc = rpcrdma_ep_disconnect(ep, ia);
799 if (rc) 830 if (rc)
@@ -809,13 +840,17 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
809 ep->rep_pad_mr = NULL; 840 ep->rep_pad_mr = NULL;
810 } 841 }
811 842
812 rpcrdma_clean_cq(ep->rep_cq); 843 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
813 rc = ib_destroy_cq(ep->rep_cq); 844 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
814 if (rc) 845 if (rc)
815 dprintk("RPC: %s: ib_destroy_cq returned %i\n", 846 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
816 __func__, rc); 847 __func__, rc);
817 848
818 return rc; 849 rpcrdma_clean_cq(ep->rep_attr.send_cq);
850 rc = ib_destroy_cq(ep->rep_attr.send_cq);
851 if (rc)
852 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
853 __func__, rc);
819} 854}
820 855
821/* 856/*
@@ -831,17 +866,20 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
831 if (ep->rep_connected != 0) { 866 if (ep->rep_connected != 0) {
832 struct rpcrdma_xprt *xprt; 867 struct rpcrdma_xprt *xprt;
833retry: 868retry:
869 dprintk("RPC: %s: reconnecting...\n", __func__);
834 rc = rpcrdma_ep_disconnect(ep, ia); 870 rc = rpcrdma_ep_disconnect(ep, ia);
835 if (rc && rc != -ENOTCONN) 871 if (rc && rc != -ENOTCONN)
836 dprintk("RPC: %s: rpcrdma_ep_disconnect" 872 dprintk("RPC: %s: rpcrdma_ep_disconnect"
837 " status %i\n", __func__, rc); 873 " status %i\n", __func__, rc);
838 rpcrdma_clean_cq(ep->rep_cq); 874
875 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
876 rpcrdma_clean_cq(ep->rep_attr.send_cq);
839 877
840 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 878 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
841 id = rpcrdma_create_id(xprt, ia, 879 id = rpcrdma_create_id(xprt, ia,
842 (struct sockaddr *)&xprt->rx_data.addr); 880 (struct sockaddr *)&xprt->rx_data.addr);
843 if (IS_ERR(id)) { 881 if (IS_ERR(id)) {
844 rc = PTR_ERR(id); 882 rc = -EHOSTUNREACH;
845 goto out; 883 goto out;
846 } 884 }
847 /* TEMP TEMP TEMP - fail if new device: 885 /* TEMP TEMP TEMP - fail if new device:
@@ -855,35 +893,32 @@ retry:
855 printk("RPC: %s: can't reconnect on " 893 printk("RPC: %s: can't reconnect on "
856 "different device!\n", __func__); 894 "different device!\n", __func__);
857 rdma_destroy_id(id); 895 rdma_destroy_id(id);
858 rc = -ENETDOWN; 896 rc = -ENETUNREACH;
859 goto out; 897 goto out;
860 } 898 }
861 /* END TEMP */ 899 /* END TEMP */
900 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
901 if (rc) {
902 dprintk("RPC: %s: rdma_create_qp failed %i\n",
903 __func__, rc);
904 rdma_destroy_id(id);
905 rc = -ENETUNREACH;
906 goto out;
907 }
862 rdma_destroy_qp(ia->ri_id); 908 rdma_destroy_qp(ia->ri_id);
863 rdma_destroy_id(ia->ri_id); 909 rdma_destroy_id(ia->ri_id);
864 ia->ri_id = id; 910 ia->ri_id = id;
911 } else {
912 dprintk("RPC: %s: connecting...\n", __func__);
913 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
914 if (rc) {
915 dprintk("RPC: %s: rdma_create_qp failed %i\n",
916 __func__, rc);
917 /* do not update ep->rep_connected */
918 return -ENETUNREACH;
919 }
865 } 920 }
866 921
867 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
868 if (rc) {
869 dprintk("RPC: %s: rdma_create_qp failed %i\n",
870 __func__, rc);
871 goto out;
872 }
873
874/* XXX Tavor device performs badly with 2K MTU! */
875if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
876 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
877 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
878 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
879 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
880 struct ib_qp_attr attr = {
881 .path_mtu = IB_MTU_1024
882 };
883 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
884 }
885}
886
887 ep->rep_connected = 0; 922 ep->rep_connected = 0;
888 923
889 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); 924 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -944,7 +979,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
944{ 979{
945 int rc; 980 int rc;
946 981
947 rpcrdma_clean_cq(ep->rep_cq); 982 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
983 rpcrdma_clean_cq(ep->rep_attr.send_cq);
948 rc = rdma_disconnect(ia->ri_id); 984 rc = rdma_disconnect(ia->ri_id);
949 if (!rc) { 985 if (!rc) {
950 /* returns without wait if not connected */ 986 /* returns without wait if not connected */
@@ -967,7 +1003,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
967 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) 1003 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
968{ 1004{
969 char *p; 1005 char *p;
970 size_t len; 1006 size_t len, rlen, wlen;
971 int i, rc; 1007 int i, rc;
972 struct rpcrdma_mw *r; 1008 struct rpcrdma_mw *r;
973 1009
@@ -997,11 +1033,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
997 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * 1033 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
998 sizeof(struct rpcrdma_mw); 1034 sizeof(struct rpcrdma_mw);
999 break; 1035 break;
1000 case RPCRDMA_MEMWINDOWS_ASYNC:
1001 case RPCRDMA_MEMWINDOWS:
1002 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1003 sizeof(struct rpcrdma_mw);
1004 break;
1005 default: 1036 default:
1006 break; 1037 break;
1007 } 1038 }
@@ -1032,32 +1063,29 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1032 } 1063 }
1033 p += cdata->padding; 1064 p += cdata->padding;
1034 1065
1035 /*
1036 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1037 * We "cycle" the mw's in order to minimize rkey reuse,
1038 * and also reduce unbind-to-bind collision.
1039 */
1040 INIT_LIST_HEAD(&buf->rb_mws); 1066 INIT_LIST_HEAD(&buf->rb_mws);
1041 r = (struct rpcrdma_mw *)p; 1067 r = (struct rpcrdma_mw *)p;
1042 switch (ia->ri_memreg_strategy) { 1068 switch (ia->ri_memreg_strategy) {
1043 case RPCRDMA_FRMR: 1069 case RPCRDMA_FRMR:
1044 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { 1070 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1045 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, 1071 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1046 RPCRDMA_MAX_SEGS); 1072 ia->ri_max_frmr_depth);
1047 if (IS_ERR(r->r.frmr.fr_mr)) { 1073 if (IS_ERR(r->r.frmr.fr_mr)) {
1048 rc = PTR_ERR(r->r.frmr.fr_mr); 1074 rc = PTR_ERR(r->r.frmr.fr_mr);
1049 dprintk("RPC: %s: ib_alloc_fast_reg_mr" 1075 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1050 " failed %i\n", __func__, rc); 1076 " failed %i\n", __func__, rc);
1051 goto out; 1077 goto out;
1052 } 1078 }
1053 r->r.frmr.fr_pgl = 1079 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1054 ib_alloc_fast_reg_page_list(ia->ri_id->device, 1080 ia->ri_id->device,
1055 RPCRDMA_MAX_SEGS); 1081 ia->ri_max_frmr_depth);
1056 if (IS_ERR(r->r.frmr.fr_pgl)) { 1082 if (IS_ERR(r->r.frmr.fr_pgl)) {
1057 rc = PTR_ERR(r->r.frmr.fr_pgl); 1083 rc = PTR_ERR(r->r.frmr.fr_pgl);
1058 dprintk("RPC: %s: " 1084 dprintk("RPC: %s: "
1059 "ib_alloc_fast_reg_page_list " 1085 "ib_alloc_fast_reg_page_list "
1060 "failed %i\n", __func__, rc); 1086 "failed %i\n", __func__, rc);
1087
1088 ib_dereg_mr(r->r.frmr.fr_mr);
1061 goto out; 1089 goto out;
1062 } 1090 }
1063 list_add(&r->mw_list, &buf->rb_mws); 1091 list_add(&r->mw_list, &buf->rb_mws);
@@ -1082,21 +1110,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1082 ++r; 1110 ++r;
1083 } 1111 }
1084 break; 1112 break;
1085 case RPCRDMA_MEMWINDOWS_ASYNC:
1086 case RPCRDMA_MEMWINDOWS:
1087 /* Allocate one extra request's worth, for full cycling */
1088 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1089 r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1);
1090 if (IS_ERR(r->r.mw)) {
1091 rc = PTR_ERR(r->r.mw);
1092 dprintk("RPC: %s: ib_alloc_mw"
1093 " failed %i\n", __func__, rc);
1094 goto out;
1095 }
1096 list_add(&r->mw_list, &buf->rb_mws);
1097 ++r;
1098 }
1099 break;
1100 default: 1113 default:
1101 break; 1114 break;
1102 } 1115 }
@@ -1105,16 +1118,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1105 * Allocate/init the request/reply buffers. Doing this 1118 * Allocate/init the request/reply buffers. Doing this
1106 * using kmalloc for now -- one for each buf. 1119 * using kmalloc for now -- one for each buf.
1107 */ 1120 */
1121 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1122 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1123 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1124 __func__, wlen, rlen);
1125
1108 for (i = 0; i < buf->rb_max_requests; i++) { 1126 for (i = 0; i < buf->rb_max_requests; i++) {
1109 struct rpcrdma_req *req; 1127 struct rpcrdma_req *req;
1110 struct rpcrdma_rep *rep; 1128 struct rpcrdma_rep *rep;
1111 1129
1112 len = cdata->inline_wsize + sizeof(struct rpcrdma_req); 1130 req = kmalloc(wlen, GFP_KERNEL);
1113 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1114 /* Typical ~2400b, so rounding up saves work later */
1115 if (len < 4096)
1116 len = 4096;
1117 req = kmalloc(len, GFP_KERNEL);
1118 if (req == NULL) { 1131 if (req == NULL) {
1119 dprintk("RPC: %s: request buffer %d alloc" 1132 dprintk("RPC: %s: request buffer %d alloc"
1120 " failed\n", __func__, i); 1133 " failed\n", __func__, i);
@@ -1126,16 +1139,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1126 buf->rb_send_bufs[i]->rl_buffer = buf; 1139 buf->rb_send_bufs[i]->rl_buffer = buf;
1127 1140
1128 rc = rpcrdma_register_internal(ia, req->rl_base, 1141 rc = rpcrdma_register_internal(ia, req->rl_base,
1129 len - offsetof(struct rpcrdma_req, rl_base), 1142 wlen - offsetof(struct rpcrdma_req, rl_base),
1130 &buf->rb_send_bufs[i]->rl_handle, 1143 &buf->rb_send_bufs[i]->rl_handle,
1131 &buf->rb_send_bufs[i]->rl_iov); 1144 &buf->rb_send_bufs[i]->rl_iov);
1132 if (rc) 1145 if (rc)
1133 goto out; 1146 goto out;
1134 1147
1135 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); 1148 buf->rb_send_bufs[i]->rl_size = wlen -
1149 sizeof(struct rpcrdma_req);
1136 1150
1137 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); 1151 rep = kmalloc(rlen, GFP_KERNEL);
1138 rep = kmalloc(len, GFP_KERNEL);
1139 if (rep == NULL) { 1152 if (rep == NULL) {
1140 dprintk("RPC: %s: reply buffer %d alloc failed\n", 1153 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1141 __func__, i); 1154 __func__, i);
@@ -1145,10 +1158,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1145 memset(rep, 0, sizeof(struct rpcrdma_rep)); 1158 memset(rep, 0, sizeof(struct rpcrdma_rep));
1146 buf->rb_recv_bufs[i] = rep; 1159 buf->rb_recv_bufs[i] = rep;
1147 buf->rb_recv_bufs[i]->rr_buffer = buf; 1160 buf->rb_recv_bufs[i]->rr_buffer = buf;
1148 init_waitqueue_head(&rep->rr_unbind);
1149 1161
1150 rc = rpcrdma_register_internal(ia, rep->rr_base, 1162 rc = rpcrdma_register_internal(ia, rep->rr_base,
1151 len - offsetof(struct rpcrdma_rep, rr_base), 1163 rlen - offsetof(struct rpcrdma_rep, rr_base),
1152 &buf->rb_recv_bufs[i]->rr_handle, 1164 &buf->rb_recv_bufs[i]->rr_handle,
1153 &buf->rb_recv_bufs[i]->rr_iov); 1165 &buf->rb_recv_bufs[i]->rr_iov);
1154 if (rc) 1166 if (rc)
@@ -1179,7 +1191,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1179 1191
1180 /* clean up in reverse order from create 1192 /* clean up in reverse order from create
1181 * 1. recv mr memory (mr free, then kfree) 1193 * 1. recv mr memory (mr free, then kfree)
1182 * 1a. bind mw memory
1183 * 2. send mr memory (mr free, then kfree) 1194 * 2. send mr memory (mr free, then kfree)
1184 * 3. padding (if any) [moved to rpcrdma_ep_destroy] 1195 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1185 * 4. arrays 1196 * 4. arrays
@@ -1194,41 +1205,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1194 kfree(buf->rb_recv_bufs[i]); 1205 kfree(buf->rb_recv_bufs[i]);
1195 } 1206 }
1196 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { 1207 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1197 while (!list_empty(&buf->rb_mws)) {
1198 r = list_entry(buf->rb_mws.next,
1199 struct rpcrdma_mw, mw_list);
1200 list_del(&r->mw_list);
1201 switch (ia->ri_memreg_strategy) {
1202 case RPCRDMA_FRMR:
1203 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1204 if (rc)
1205 dprintk("RPC: %s:"
1206 " ib_dereg_mr"
1207 " failed %i\n",
1208 __func__, rc);
1209 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1210 break;
1211 case RPCRDMA_MTHCAFMR:
1212 rc = ib_dealloc_fmr(r->r.fmr);
1213 if (rc)
1214 dprintk("RPC: %s:"
1215 " ib_dealloc_fmr"
1216 " failed %i\n",
1217 __func__, rc);
1218 break;
1219 case RPCRDMA_MEMWINDOWS_ASYNC:
1220 case RPCRDMA_MEMWINDOWS:
1221 rc = ib_dealloc_mw(r->r.mw);
1222 if (rc)
1223 dprintk("RPC: %s:"
1224 " ib_dealloc_mw"
1225 " failed %i\n",
1226 __func__, rc);
1227 break;
1228 default:
1229 break;
1230 }
1231 }
1232 rpcrdma_deregister_internal(ia, 1208 rpcrdma_deregister_internal(ia,
1233 buf->rb_send_bufs[i]->rl_handle, 1209 buf->rb_send_bufs[i]->rl_handle,
1234 &buf->rb_send_bufs[i]->rl_iov); 1210 &buf->rb_send_bufs[i]->rl_iov);
@@ -1236,6 +1212,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1236 } 1212 }
1237 } 1213 }
1238 1214
1215 while (!list_empty(&buf->rb_mws)) {
1216 r = list_entry(buf->rb_mws.next,
1217 struct rpcrdma_mw, mw_list);
1218 list_del(&r->mw_list);
1219 switch (ia->ri_memreg_strategy) {
1220 case RPCRDMA_FRMR:
1221 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1222 if (rc)
1223 dprintk("RPC: %s:"
1224 " ib_dereg_mr"
1225 " failed %i\n",
1226 __func__, rc);
1227 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1228 break;
1229 case RPCRDMA_MTHCAFMR:
1230 rc = ib_dealloc_fmr(r->r.fmr);
1231 if (rc)
1232 dprintk("RPC: %s:"
1233 " ib_dealloc_fmr"
1234 " failed %i\n",
1235 __func__, rc);
1236 break;
1237 default:
1238 break;
1239 }
1240 }
1241
1239 kfree(buf->rb_pool); 1242 kfree(buf->rb_pool);
1240} 1243}
1241 1244
@@ -1299,21 +1302,17 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
1299 int i; 1302 int i;
1300 unsigned long flags; 1303 unsigned long flags;
1301 1304
1302 BUG_ON(req->rl_nchunks != 0);
1303 spin_lock_irqsave(&buffers->rb_lock, flags); 1305 spin_lock_irqsave(&buffers->rb_lock, flags);
1304 buffers->rb_send_bufs[--buffers->rb_send_index] = req; 1306 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1305 req->rl_niovs = 0; 1307 req->rl_niovs = 0;
1306 if (req->rl_reply) { 1308 if (req->rl_reply) {
1307 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; 1309 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1308 init_waitqueue_head(&req->rl_reply->rr_unbind);
1309 req->rl_reply->rr_func = NULL; 1310 req->rl_reply->rr_func = NULL;
1310 req->rl_reply = NULL; 1311 req->rl_reply = NULL;
1311 } 1312 }
1312 switch (ia->ri_memreg_strategy) { 1313 switch (ia->ri_memreg_strategy) {
1313 case RPCRDMA_FRMR: 1314 case RPCRDMA_FRMR:
1314 case RPCRDMA_MTHCAFMR: 1315 case RPCRDMA_MTHCAFMR:
1315 case RPCRDMA_MEMWINDOWS_ASYNC:
1316 case RPCRDMA_MEMWINDOWS:
1317 /* 1316 /*
1318 * Cycle mw's back in reverse order, and "spin" them. 1317 * Cycle mw's back in reverse order, and "spin" them.
1319 * This delays and scrambles reuse as much as possible. 1318 * This delays and scrambles reuse as much as possible.
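
[editor's note] The retained comment above describes handing MWs back so that a just-released entry is reused as late as possible. A generic sketch of that recycling idea (the FIFO free-list pattern only, not the rpcrdma MW list code itself):

	/* Delay-reuse illustration: frees go to the tail of a FIFO free
	 * list and allocations come from the head, so a just-freed entry
	 * waits behind every other free entry before being handed out.
	 */
	#include <stdio.h>

	#define POOL 4

	static int freelist[POOL] = { 0, 1, 2, 3 };
	static int head, tail;

	static int get(void)
	{
		int id = freelist[head];

		head = (head + 1) % POOL;
		return id;
	}

	static void put(int id)
	{
		freelist[tail] = id;
		tail = (tail + 1) % POOL;
	}

	int main(void)
	{
		int i;

		/* Each entry comes back only after all the others have. */
		for (i = 0; i < 6; i++) {
			int id = get();

			printf("use %d\n", id);	/* prints 0 1 2 3 0 1 */
			put(id);
		}
		return 0;
	}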
@@ -1358,8 +1357,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1358 1357
1359/* 1358/*
1360 * Put reply buffers back into pool when not attached to 1359 * Put reply buffers back into pool when not attached to
1361 * request. This happens in error conditions, and when 1360 * request. This happens in error conditions.
1362 * aborting unbinds. Pre-decrement counter/array index.
1363 */ 1361 */
1364void 1362void
1365rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) 1363rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
@@ -1498,8 +1496,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1498 seg1->mr_offset -= pageoff; /* start of page */ 1496 seg1->mr_offset -= pageoff; /* start of page */
1499 seg1->mr_len += pageoff; 1497 seg1->mr_len += pageoff;
1500 len = -pageoff; 1498 len = -pageoff;
1501 if (*nsegs > RPCRDMA_MAX_DATA_SEGS) 1499 if (*nsegs > ia->ri_max_frmr_depth)
1502 *nsegs = RPCRDMA_MAX_DATA_SEGS; 1500 *nsegs = ia->ri_max_frmr_depth;
1503 for (page_no = i = 0; i < *nsegs;) { 1501 for (page_no = i = 0; i < *nsegs;) {
1504 rpcrdma_map_one(ia, seg, writing); 1502 rpcrdma_map_one(ia, seg, writing);
1505 pa = seg->mr_dma; 1503 pa = seg->mr_dma;
@@ -1536,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1536 } else 1534 } else
1537 post_wr = &frmr_wr; 1535 post_wr = &frmr_wr;
1538 1536
1539 /* Bump the key */
1540 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1541 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1542
1543 /* Prepare FRMR WR */ 1537 /* Prepare FRMR WR */
1544 memset(&frmr_wr, 0, sizeof frmr_wr); 1538 memset(&frmr_wr, 0, sizeof frmr_wr);
1545 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; 1539 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
@@ -1550,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1550 frmr_wr.wr.fast_reg.page_list_len = page_no; 1544 frmr_wr.wr.fast_reg.page_list_len = page_no;
1551 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; 1545 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1552 frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; 1546 frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1553 BUG_ON(frmr_wr.wr.fast_reg.length < len); 1547 if (frmr_wr.wr.fast_reg.length < len) {
1548 while (seg1->mr_nsegs--)
1549 rpcrdma_unmap_one(ia, seg++);
1550 return -EIO;
1551 }
1552
1553 /* Bump the key */
1554 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1555 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1556
1554 frmr_wr.wr.fast_reg.access_flags = (writing ? 1557 frmr_wr.wr.fast_reg.access_flags = (writing ?
1555 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : 1558 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1556 IB_ACCESS_REMOTE_READ); 1559 IB_ACCESS_REMOTE_READ);
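
[editor's note] The hunk above replaces a BUG_ON() with an unwind-and-return path: when the length check fails, the segments mapped so far are unmapped and -EIO is returned so the caller can fail the RPC instead of crashing the machine. A small userspace sketch of the same pattern, with map_one()/unmap_one() as hypothetical stand-ins for the rpcrdma helpers:

	/* BUG_ON() -> error-return conversion: on a failed sanity check,
	 * undo the partial work and hand back -EIO to the caller.
	 */
	#include <errno.h>
	#include <stdio.h>

	#define MAX_SEGS 8

	static size_t mapped[MAX_SEGS];

	static void map_one(int i, size_t len)	{ mapped[i] = len; }
	static void unmap_one(int i)		{ mapped[i] = 0; }

	static int register_segments(const size_t *lens, int nsegs, size_t want)
	{
		size_t total = 0;
		int i;

		for (i = 0; i < nsegs; i++) {
			map_one(i, lens[i]);
			total += lens[i];
		}

		if (total < want) {		/* was: BUG_ON(total < want) */
			while (nsegs--)
				unmap_one(nsegs);
			return -EIO;		/* caller can recover */
		}
		return nsegs;
	}

	int main(void)
	{
		size_t lens[] = { 4096, 4096 };
		int rc = register_segments(lens, 2, 16384);

		if (rc < 0)
			printf("registration failed: %d\n", rc);
		return 0;
	}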
@@ -1661,135 +1664,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1661 return rc; 1664 return rc;
1662} 1665}
1663 1666
1664static int
1665rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1666 int *nsegs, int writing, struct rpcrdma_ia *ia,
1667 struct rpcrdma_xprt *r_xprt)
1668{
1669 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1670 IB_ACCESS_REMOTE_READ);
1671 struct ib_mw_bind param;
1672 int rc;
1673
1674 *nsegs = 1;
1675 rpcrdma_map_one(ia, seg, writing);
1676 param.bind_info.mr = ia->ri_bind_mem;
1677 param.wr_id = 0ULL; /* no send cookie */
1678 param.bind_info.addr = seg->mr_dma;
1679 param.bind_info.length = seg->mr_len;
1680 param.send_flags = 0;
1681 param.bind_info.mw_access_flags = mem_priv;
1682
1683 DECR_CQCOUNT(&r_xprt->rx_ep);
1684 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1685 if (rc) {
1686 dprintk("RPC: %s: failed ib_bind_mw "
1687 "%u@0x%llx status %i\n",
1688 __func__, seg->mr_len,
1689 (unsigned long long)seg->mr_dma, rc);
1690 rpcrdma_unmap_one(ia, seg);
1691 } else {
1692 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1693 seg->mr_base = param.bind_info.addr;
1694 seg->mr_nsegs = 1;
1695 }
1696 return rc;
1697}
1698
1699static int
1700rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1701 struct rpcrdma_ia *ia,
1702 struct rpcrdma_xprt *r_xprt, void **r)
1703{
1704 struct ib_mw_bind param;
1705 LIST_HEAD(l);
1706 int rc;
1707
1708 BUG_ON(seg->mr_nsegs != 1);
1709 param.bind_info.mr = ia->ri_bind_mem;
1710 param.bind_info.addr = 0ULL; /* unbind */
1711 param.bind_info.length = 0;
1712 param.bind_info.mw_access_flags = 0;
1713 if (*r) {
1714 param.wr_id = (u64) (unsigned long) *r;
1715 param.send_flags = IB_SEND_SIGNALED;
1716 INIT_CQCOUNT(&r_xprt->rx_ep);
1717 } else {
1718 param.wr_id = 0ULL;
1719 param.send_flags = 0;
1720 DECR_CQCOUNT(&r_xprt->rx_ep);
1721 }
1722 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1723 rpcrdma_unmap_one(ia, seg);
1724 if (rc)
1725 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1726 " status %i\n", __func__, rc);
1727 else
1728 *r = NULL; /* will upcall on completion */
1729 return rc;
1730}
1731
1732static int
1733rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1734 int *nsegs, int writing, struct rpcrdma_ia *ia)
1735{
1736 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1737 IB_ACCESS_REMOTE_READ);
1738 struct rpcrdma_mr_seg *seg1 = seg;
1739 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1740 int len, i, rc = 0;
1741
1742 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1743 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1744 for (len = 0, i = 0; i < *nsegs;) {
1745 rpcrdma_map_one(ia, seg, writing);
1746 ipb[i].addr = seg->mr_dma;
1747 ipb[i].size = seg->mr_len;
1748 len += seg->mr_len;
1749 ++seg;
1750 ++i;
1751 /* Check for holes */
1752 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1753 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1754 break;
1755 }
1756 seg1->mr_base = seg1->mr_dma;
1757 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1758 ipb, i, mem_priv, &seg1->mr_base);
1759 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1760 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1761 dprintk("RPC: %s: failed ib_reg_phys_mr "
1762 "%u@0x%llx (%d)... status %i\n",
1763 __func__, len,
1764 (unsigned long long)seg1->mr_dma, i, rc);
1765 while (i--)
1766 rpcrdma_unmap_one(ia, --seg);
1767 } else {
1768 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1769 seg1->mr_nsegs = i;
1770 seg1->mr_len = len;
1771 }
1772 *nsegs = i;
1773 return rc;
1774}
1775
1776static int
1777rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1778 struct rpcrdma_ia *ia)
1779{
1780 struct rpcrdma_mr_seg *seg1 = seg;
1781 int rc;
1782
1783 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1784 seg1->mr_chunk.rl_mr = NULL;
1785 while (seg1->mr_nsegs--)
1786 rpcrdma_unmap_one(ia, seg++);
1787 if (rc)
1788 dprintk("RPC: %s: failed ib_dereg_mr,"
1789 " status %i\n", __func__, rc);
1790 return rc;
1791}
1792
1793int 1667int
1794rpcrdma_register_external(struct rpcrdma_mr_seg *seg, 1668rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1795 int nsegs, int writing, struct rpcrdma_xprt *r_xprt) 1669 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
@@ -1819,16 +1693,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1819 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); 1693 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1820 break; 1694 break;
1821 1695
1822 /* Registration using memory windows */
1823 case RPCRDMA_MEMWINDOWS_ASYNC:
1824 case RPCRDMA_MEMWINDOWS:
1825 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1826 break;
1827
1828 /* Default registration each time */
1829 default: 1696 default:
1830 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); 1697 return -1;
1831 break;
1832 } 1698 }
1833 if (rc) 1699 if (rc)
1834 return -1; 1700 return -1;
@@ -1838,7 +1704,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1838 1704
1839int 1705int
1840rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, 1706rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1841 struct rpcrdma_xprt *r_xprt, void *r) 1707 struct rpcrdma_xprt *r_xprt)
1842{ 1708{
1843 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1709 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1844 int nsegs = seg->mr_nsegs, rc; 1710 int nsegs = seg->mr_nsegs, rc;
@@ -1847,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1847 1713
1848#if RPCRDMA_PERSISTENT_REGISTRATION 1714#if RPCRDMA_PERSISTENT_REGISTRATION
1849 case RPCRDMA_ALLPHYSICAL: 1715 case RPCRDMA_ALLPHYSICAL:
1850 BUG_ON(nsegs != 1);
1851 rpcrdma_unmap_one(ia, seg); 1716 rpcrdma_unmap_one(ia, seg);
1852 rc = 0;
1853 break; 1717 break;
1854#endif 1718#endif
1855 1719
@@ -1861,21 +1725,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1861 rc = rpcrdma_deregister_fmr_external(seg, ia); 1725 rc = rpcrdma_deregister_fmr_external(seg, ia);
1862 break; 1726 break;
1863 1727
1864 case RPCRDMA_MEMWINDOWS_ASYNC:
1865 case RPCRDMA_MEMWINDOWS:
1866 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1867 break;
1868
1869 default: 1728 default:
1870 rc = rpcrdma_deregister_default_external(seg, ia);
1871 break; 1729 break;
1872 } 1730 }
1873 if (r) {
1874 struct rpcrdma_rep *rep = r;
1875 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1876 rep->rr_func = NULL;
1877 func(rep); /* dereg done, callback now */
1878 }
1879 return nsegs; 1731 return nsegs;
1880} 1732}
1881 1733
@@ -1950,7 +1802,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1950 ib_dma_sync_single_for_cpu(ia->ri_id->device, 1802 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1951 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); 1803 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1952 1804
1953 DECR_CQCOUNT(ep);
1954 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 1805 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1955 1806
1956 if (rc) 1807 if (rc)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index cc1445dc1d1a..89e7cd479705 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -43,6 +43,7 @@
43#include <linux/wait.h> /* wait_queue_head_t, etc */ 43#include <linux/wait.h> /* wait_queue_head_t, etc */
44#include <linux/spinlock.h> /* spinlock_t, etc */ 44#include <linux/spinlock.h> /* spinlock_t, etc */
45#include <linux/atomic.h> /* atomic_t, etc */ 45#include <linux/atomic.h> /* atomic_t, etc */
46#include <linux/workqueue.h> /* struct work_struct */
46 47
47#include <rdma/rdma_cm.h> /* RDMA connection api */ 48#include <rdma/rdma_cm.h> /* RDMA connection api */
48#include <rdma/ib_verbs.h> /* RDMA verbs api */ 49#include <rdma/ib_verbs.h> /* RDMA verbs api */
@@ -66,18 +67,21 @@ struct rpcrdma_ia {
66 struct completion ri_done; 67 struct completion ri_done;
67 int ri_async_rc; 68 int ri_async_rc;
68 enum rpcrdma_memreg ri_memreg_strategy; 69 enum rpcrdma_memreg ri_memreg_strategy;
70 unsigned int ri_max_frmr_depth;
69}; 71};
70 72
71/* 73/*
72 * RDMA Endpoint -- one per transport instance 74 * RDMA Endpoint -- one per transport instance
73 */ 75 */
74 76
77#define RPCRDMA_WC_BUDGET (128)
78#define RPCRDMA_POLLSIZE (16)
79
75struct rpcrdma_ep { 80struct rpcrdma_ep {
76 atomic_t rep_cqcount; 81 atomic_t rep_cqcount;
77 int rep_cqinit; 82 int rep_cqinit;
78 int rep_connected; 83 int rep_connected;
79 struct rpcrdma_ia *rep_ia; 84 struct rpcrdma_ia *rep_ia;
80 struct ib_cq *rep_cq;
81 struct ib_qp_init_attr rep_attr; 85 struct ib_qp_init_attr rep_attr;
82 wait_queue_head_t rep_connect_wait; 86 wait_queue_head_t rep_connect_wait;
83 struct ib_sge rep_pad; /* holds zeroed pad */ 87 struct ib_sge rep_pad; /* holds zeroed pad */
@@ -86,6 +90,9 @@ struct rpcrdma_ep {
86 struct rpc_xprt *rep_xprt; /* for rep_func */ 90 struct rpc_xprt *rep_xprt; /* for rep_func */
87 struct rdma_conn_param rep_remote_cma; 91 struct rdma_conn_param rep_remote_cma;
88 struct sockaddr_storage rep_remote_addr; 92 struct sockaddr_storage rep_remote_addr;
93 struct delayed_work rep_connect_worker;
94 struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
95 struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
89}; 96};
90 97
91#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 98#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
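
[editor's note] The RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE constants and the per-endpoint ib_wc arrays added in the two hunks above support draining completions in batches with a bound on work per upcall. A hedged userspace sketch of that loop shape, with poll_cq() and struct wc as stand-ins for ib_poll_cq() and struct ib_wc rather than the verbs API:

	/* Budgeted completion polling: reap up to POLLSIZE completions per
	 * poll call and stop after BUDGET total, so one completion upcall
	 * cannot monopolize the CPU.
	 */
	#include <stdio.h>

	#define WC_BUDGET 128
	#define POLLSIZE  16

	struct wc { int status; };

	/* Pretend completion queue with a fixed backlog. */
	static int pending = 200;

	static int poll_cq(struct wc *wcs, int max)
	{
		int n = pending < max ? pending : max;

		pending -= n;
		return n;	/* number of completions reaped */
	}

	int main(void)
	{
		struct wc wcs[POLLSIZE];
		int budget = WC_BUDGET;
		int n, handled = 0;

		while (budget > 0 && (n = poll_cq(wcs, POLLSIZE)) > 0) {
			handled += n;	/* real code would dispatch each wc */
			budget -= n;
		}
		printf("handled %d completions, %d left for later\n",
		       handled, pending);	/* handled 128, 72 left */
		return 0;
	}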
@@ -124,7 +131,6 @@ struct rpcrdma_rep {
124 struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ 131 struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
125 void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ 132 void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
126 struct list_head rr_list; /* tasklet list */ 133 struct list_head rr_list; /* tasklet list */
127 wait_queue_head_t rr_unbind; /* optional unbind wait */
128 struct ib_sge rr_iov; /* for posting */ 134 struct ib_sge rr_iov; /* for posting */
129 struct ib_mr *rr_handle; /* handle for mem in rr_iov */ 135 struct ib_mr *rr_handle; /* handle for mem in rr_iov */
130 char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ 136 char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
@@ -159,7 +165,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
159 struct ib_mr *rl_mr; /* if registered directly */ 165 struct ib_mr *rl_mr; /* if registered directly */
160 struct rpcrdma_mw { /* if registered from region */ 166 struct rpcrdma_mw { /* if registered from region */
161 union { 167 union {
162 struct ib_mw *mw;
163 struct ib_fmr *fmr; 168 struct ib_fmr *fmr;
164 struct { 169 struct {
165 struct ib_fast_reg_page_list *fr_pgl; 170 struct ib_fast_reg_page_list *fr_pgl;
@@ -207,7 +212,6 @@ struct rpcrdma_req {
207struct rpcrdma_buffer { 212struct rpcrdma_buffer {
208 spinlock_t rb_lock; /* protects indexes */ 213 spinlock_t rb_lock; /* protects indexes */
209 atomic_t rb_credits; /* most recent server credits */ 214 atomic_t rb_credits; /* most recent server credits */
210 unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
211 int rb_max_requests;/* client max requests */ 215 int rb_max_requests;/* client max requests */
212 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ 216 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
213 int rb_send_index; 217 int rb_send_index;
@@ -300,7 +304,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *);
300 */ 304 */
301int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, 305int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
302 struct rpcrdma_create_data_internal *); 306 struct rpcrdma_create_data_internal *);
303int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); 307void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
304int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); 308int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
305int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); 309int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
306 310
@@ -330,11 +334,12 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *,
330int rpcrdma_register_external(struct rpcrdma_mr_seg *, 334int rpcrdma_register_external(struct rpcrdma_mr_seg *,
331 int, int, struct rpcrdma_xprt *); 335 int, int, struct rpcrdma_xprt *);
332int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, 336int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
333 struct rpcrdma_xprt *, void *); 337 struct rpcrdma_xprt *);
334 338
335/* 339/*
336 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c 340 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
337 */ 341 */
342void rpcrdma_connect_worker(struct work_struct *);
338void rpcrdma_conn_func(struct rpcrdma_ep *); 343void rpcrdma_conn_func(struct rpcrdma_ep *);
339void rpcrdma_reply_handler(struct rpcrdma_rep *); 344void rpcrdma_reply_handler(struct rpcrdma_rep *);
340 345