diff options
Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 119 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 90 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 753 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 17 |
4 files changed, 396 insertions, 583 deletions
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 96ead526b125..693966d3f33b 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c | |||
@@ -78,8 +78,7 @@ static const char transfertypes[][12] = { | |||
78 | * elements. Segments are then coalesced when registered, if possible | 78 | * elements. Segments are then coalesced when registered, if possible |
79 | * within the selected memreg mode. | 79 | * within the selected memreg mode. |
80 | * | 80 | * |
81 | * Note, this routine is never called if the connection's memory | 81 | * Returns positive number of segments converted, or a negative errno. |
82 | * registration strategy is 0 (bounce buffers). | ||
83 | */ | 82 | */ |
84 | 83 | ||
85 | static int | 84 | static int |
@@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
102 | page_base = xdrbuf->page_base & ~PAGE_MASK; | 101 | page_base = xdrbuf->page_base & ~PAGE_MASK; |
103 | p = 0; | 102 | p = 0; |
104 | while (len && n < nsegs) { | 103 | while (len && n < nsegs) { |
104 | if (!ppages[p]) { | ||
105 | /* alloc the pagelist for receiving buffer */ | ||
106 | ppages[p] = alloc_page(GFP_ATOMIC); | ||
107 | if (!ppages[p]) | ||
108 | return -ENOMEM; | ||
109 | } | ||
105 | seg[n].mr_page = ppages[p]; | 110 | seg[n].mr_page = ppages[p]; |
106 | seg[n].mr_offset = (void *)(unsigned long) page_base; | 111 | seg[n].mr_offset = (void *)(unsigned long) page_base; |
107 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); | 112 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); |
108 | BUG_ON(seg[n].mr_len > PAGE_SIZE); | 113 | if (seg[n].mr_len > PAGE_SIZE) |
114 | return -EIO; | ||
109 | len -= seg[n].mr_len; | 115 | len -= seg[n].mr_len; |
110 | ++n; | 116 | ++n; |
111 | ++p; | 117 | ++p; |
@@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
114 | 120 | ||
115 | /* Message overflows the seg array */ | 121 | /* Message overflows the seg array */ |
116 | if (len && n == nsegs) | 122 | if (len && n == nsegs) |
117 | return 0; | 123 | return -EIO; |
118 | 124 | ||
119 | if (xdrbuf->tail[0].iov_len) { | 125 | if (xdrbuf->tail[0].iov_len) { |
120 | /* the rpcrdma protocol allows us to omit any trailing | 126 | /* the rpcrdma protocol allows us to omit any trailing |
@@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
123 | return n; | 129 | return n; |
124 | if (n == nsegs) | 130 | if (n == nsegs) |
125 | /* Tail remains, but we're out of segments */ | 131 | /* Tail remains, but we're out of segments */ |
126 | return 0; | 132 | return -EIO; |
127 | seg[n].mr_page = NULL; | 133 | seg[n].mr_page = NULL; |
128 | seg[n].mr_offset = xdrbuf->tail[0].iov_base; | 134 | seg[n].mr_offset = xdrbuf->tail[0].iov_base; |
129 | seg[n].mr_len = xdrbuf->tail[0].iov_len; | 135 | seg[n].mr_len = xdrbuf->tail[0].iov_len; |
@@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
164 | * Reply chunk (a counted array): | 170 | * Reply chunk (a counted array): |
165 | * N elements: | 171 | * N elements: |
166 | * 1 - N - HLOO - HLOO - ... - HLOO | 172 | * 1 - N - HLOO - HLOO - ... - HLOO |
173 | * | ||
174 | * Returns positive RPC/RDMA header size, or negative errno. | ||
167 | */ | 175 | */ |
168 | 176 | ||
169 | static unsigned int | 177 | static ssize_t |
170 | rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, | 178 | rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, |
171 | struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) | 179 | struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) |
172 | { | 180 | { |
173 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); | 181 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); |
174 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); | 182 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); |
175 | int nsegs, nchunks = 0; | 183 | int n, nsegs, nchunks = 0; |
176 | unsigned int pos; | 184 | unsigned int pos; |
177 | struct rpcrdma_mr_seg *seg = req->rl_segments; | 185 | struct rpcrdma_mr_seg *seg = req->rl_segments; |
178 | struct rpcrdma_read_chunk *cur_rchunk = NULL; | 186 | struct rpcrdma_read_chunk *cur_rchunk = NULL; |
@@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, | |||
198 | pos = target->head[0].iov_len; | 206 | pos = target->head[0].iov_len; |
199 | 207 | ||
200 | nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); | 208 | nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); |
201 | if (nsegs == 0) | 209 | if (nsegs < 0) |
202 | return 0; | 210 | return nsegs; |
203 | 211 | ||
204 | do { | 212 | do { |
205 | /* bind/register the memory, then build chunk from result. */ | 213 | n = rpcrdma_register_external(seg, nsegs, |
206 | int n = rpcrdma_register_external(seg, nsegs, | ||
207 | cur_wchunk != NULL, r_xprt); | 214 | cur_wchunk != NULL, r_xprt); |
208 | if (n <= 0) | 215 | if (n <= 0) |
209 | goto out; | 216 | goto out; |
@@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, | |||
248 | /* success. all failures return above */ | 255 | /* success. all failures return above */ |
249 | req->rl_nchunks = nchunks; | 256 | req->rl_nchunks = nchunks; |
250 | 257 | ||
251 | BUG_ON(nchunks == 0); | ||
252 | BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) | ||
253 | && (nchunks > 3)); | ||
254 | |||
255 | /* | 258 | /* |
256 | * finish off header. If write, marshal discrim and nchunks. | 259 | * finish off header. If write, marshal discrim and nchunks. |
257 | */ | 260 | */ |
@@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, | |||
278 | out: | 281 | out: |
279 | for (pos = 0; nchunks--;) | 282 | for (pos = 0; nchunks--;) |
280 | pos += rpcrdma_deregister_external( | 283 | pos += rpcrdma_deregister_external( |
281 | &req->rl_segments[pos], r_xprt, NULL); | 284 | &req->rl_segments[pos], r_xprt); |
282 | return 0; | 285 | return n; |
283 | } | 286 | } |
284 | 287 | ||
285 | /* | 288 | /* |
@@ -361,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) | |||
361 | * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. | 364 | * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. |
362 | * [2] -- optional padding. | 365 | * [2] -- optional padding. |
363 | * [3] -- if padded, header only in [1] and data here. | 366 | * [3] -- if padded, header only in [1] and data here. |
367 | * | ||
368 | * Returns zero on success, otherwise a negative errno. | ||
364 | */ | 369 | */ |
365 | 370 | ||
366 | int | 371 | int |
@@ -370,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
370 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); | 375 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); |
371 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); | 376 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); |
372 | char *base; | 377 | char *base; |
373 | size_t hdrlen, rpclen, padlen; | 378 | size_t rpclen, padlen; |
379 | ssize_t hdrlen; | ||
374 | enum rpcrdma_chunktype rtype, wtype; | 380 | enum rpcrdma_chunktype rtype, wtype; |
375 | struct rpcrdma_msg *headerp; | 381 | struct rpcrdma_msg *headerp; |
376 | 382 | ||
@@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
441 | /* The following simplification is not true forever */ | 447 | /* The following simplification is not true forever */ |
442 | if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) | 448 | if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) |
443 | wtype = rpcrdma_noch; | 449 | wtype = rpcrdma_noch; |
444 | BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); | 450 | if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { |
445 | 451 | dprintk("RPC: %s: cannot marshal multiple chunk lists\n", | |
446 | if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && | 452 | __func__); |
447 | (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { | 453 | return -EIO; |
448 | /* forced to "pure inline"? */ | ||
449 | dprintk("RPC: %s: too much data (%d/%d) for inline\n", | ||
450 | __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); | ||
451 | return -1; | ||
452 | } | 454 | } |
453 | 455 | ||
454 | hdrlen = 28; /*sizeof *headerp;*/ | 456 | hdrlen = 28; /*sizeof *headerp;*/ |
@@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
474 | headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; | 476 | headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; |
475 | headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; | 477 | headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; |
476 | hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ | 478 | hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ |
477 | BUG_ON(wtype != rpcrdma_noch); | 479 | if (wtype != rpcrdma_noch) { |
478 | 480 | dprintk("RPC: %s: invalid chunk list\n", | |
481 | __func__); | ||
482 | return -EIO; | ||
483 | } | ||
479 | } else { | 484 | } else { |
480 | headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; | 485 | headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; |
481 | headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; | 486 | headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; |
@@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
492 | * on receive. Therefore, we request a reply chunk | 497 | * on receive. Therefore, we request a reply chunk |
493 | * for non-writes wherever feasible and efficient. | 498 | * for non-writes wherever feasible and efficient. |
494 | */ | 499 | */ |
495 | if (wtype == rpcrdma_noch && | 500 | if (wtype == rpcrdma_noch) |
496 | r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) | ||
497 | wtype = rpcrdma_replych; | 501 | wtype = rpcrdma_replych; |
498 | } | 502 | } |
499 | } | 503 | } |
@@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
511 | hdrlen = rpcrdma_create_chunks(rqst, | 515 | hdrlen = rpcrdma_create_chunks(rqst, |
512 | &rqst->rq_rcv_buf, headerp, wtype); | 516 | &rqst->rq_rcv_buf, headerp, wtype); |
513 | } | 517 | } |
514 | 518 | if (hdrlen < 0) | |
515 | if (hdrlen == 0) | 519 | return hdrlen; |
516 | return -1; | ||
517 | 520 | ||
518 | dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" | 521 | dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" |
519 | " headerp 0x%p base 0x%p lkey 0x%x\n", | 522 | " headerp 0x%p base 0x%p lkey 0x%x\n", |
@@ -680,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) | |||
680 | rqst->rq_private_buf = rqst->rq_rcv_buf; | 683 | rqst->rq_private_buf = rqst->rq_rcv_buf; |
681 | } | 684 | } |
682 | 685 | ||
683 | /* | ||
684 | * This function is called when an async event is posted to | ||
685 | * the connection which changes the connection state. All it | ||
686 | * does at this point is mark the connection up/down, the rpc | ||
687 | * timers do the rest. | ||
688 | */ | ||
689 | void | 686 | void |
690 | rpcrdma_conn_func(struct rpcrdma_ep *ep) | 687 | rpcrdma_connect_worker(struct work_struct *work) |
691 | { | 688 | { |
689 | struct rpcrdma_ep *ep = | ||
690 | container_of(work, struct rpcrdma_ep, rep_connect_worker.work); | ||
692 | struct rpc_xprt *xprt = ep->rep_xprt; | 691 | struct rpc_xprt *xprt = ep->rep_xprt; |
693 | 692 | ||
694 | spin_lock_bh(&xprt->transport_lock); | 693 | spin_lock_bh(&xprt->transport_lock); |
@@ -705,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) | |||
705 | } | 704 | } |
706 | 705 | ||
707 | /* | 706 | /* |
708 | * This function is called when memory window unbind which we are waiting | 707 | * This function is called when an async event is posted to |
709 | * for completes. Just use rr_func (zeroed by upcall) to signal completion. | 708 | * the connection which changes the connection state. All it |
709 | * does at this point is mark the connection up/down, the rpc | ||
710 | * timers do the rest. | ||
710 | */ | 711 | */ |
711 | static void | 712 | void |
712 | rpcrdma_unbind_func(struct rpcrdma_rep *rep) | 713 | rpcrdma_conn_func(struct rpcrdma_ep *ep) |
713 | { | 714 | { |
714 | wake_up(&rep->rr_unbind); | 715 | schedule_delayed_work(&ep->rep_connect_worker, 0); |
715 | } | 716 | } |
716 | 717 | ||
717 | /* | 718 | /* |
@@ -728,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
728 | struct rpc_xprt *xprt = rep->rr_xprt; | 729 | struct rpc_xprt *xprt = rep->rr_xprt; |
729 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); | 730 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); |
730 | __be32 *iptr; | 731 | __be32 *iptr; |
731 | int i, rdmalen, status; | 732 | int rdmalen, status; |
733 | unsigned long cwnd; | ||
732 | 734 | ||
733 | /* Check status. If bad, signal disconnect and return rep to pool */ | 735 | /* Check status. If bad, signal disconnect and return rep to pool */ |
734 | if (rep->rr_len == ~0U) { | 736 | if (rep->rr_len == ~0U) { |
@@ -783,6 +785,7 @@ repost: | |||
783 | 785 | ||
784 | /* from here on, the reply is no longer an orphan */ | 786 | /* from here on, the reply is no longer an orphan */ |
785 | req->rl_reply = rep; | 787 | req->rl_reply = rep; |
788 | xprt->reestablish_timeout = 0; | ||
786 | 789 | ||
787 | /* check for expected message types */ | 790 | /* check for expected message types */ |
788 | /* The order of some of these tests is important. */ | 791 | /* The order of some of these tests is important. */ |
@@ -857,26 +860,10 @@ badheader: | |||
857 | break; | 860 | break; |
858 | } | 861 | } |
859 | 862 | ||
860 | /* If using mw bind, start the deregister process now. */ | 863 | cwnd = xprt->cwnd; |
861 | /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ | 864 | xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; |
862 | if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { | 865 | if (xprt->cwnd > cwnd) |
863 | case RPCRDMA_MEMWINDOWS: | 866 | xprt_release_rqst_cong(rqst->rq_task); |
864 | for (i = 0; req->rl_nchunks-- > 1;) | ||
865 | i += rpcrdma_deregister_external( | ||
866 | &req->rl_segments[i], r_xprt, NULL); | ||
867 | /* Optionally wait (not here) for unbinds to complete */ | ||
868 | rep->rr_func = rpcrdma_unbind_func; | ||
869 | (void) rpcrdma_deregister_external(&req->rl_segments[i], | ||
870 | r_xprt, rep); | ||
871 | break; | ||
872 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
873 | for (i = 0; req->rl_nchunks--;) | ||
874 | i += rpcrdma_deregister_external(&req->rl_segments[i], | ||
875 | r_xprt, NULL); | ||
876 | break; | ||
877 | default: | ||
878 | break; | ||
879 | } | ||
880 | 867 | ||
881 | dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", | 868 | dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", |
882 | __func__, xprt, rqst, status); | 869 | __func__, xprt, rqst, status); |
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 1eb9c468d0c9..66f91f0d071a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c | |||
@@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = { | |||
149 | 149 | ||
150 | #endif | 150 | #endif |
151 | 151 | ||
152 | #define RPCRDMA_BIND_TO (60U * HZ) | ||
153 | #define RPCRDMA_INIT_REEST_TO (5U * HZ) | ||
154 | #define RPCRDMA_MAX_REEST_TO (30U * HZ) | ||
155 | #define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) | ||
156 | |||
152 | static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ | 157 | static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ |
153 | 158 | ||
154 | static void | 159 | static void |
@@ -229,7 +234,6 @@ static void | |||
229 | xprt_rdma_destroy(struct rpc_xprt *xprt) | 234 | xprt_rdma_destroy(struct rpc_xprt *xprt) |
230 | { | 235 | { |
231 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); | 236 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); |
232 | int rc; | ||
233 | 237 | ||
234 | dprintk("RPC: %s: called\n", __func__); | 238 | dprintk("RPC: %s: called\n", __func__); |
235 | 239 | ||
@@ -238,10 +242,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) | |||
238 | xprt_clear_connected(xprt); | 242 | xprt_clear_connected(xprt); |
239 | 243 | ||
240 | rpcrdma_buffer_destroy(&r_xprt->rx_buf); | 244 | rpcrdma_buffer_destroy(&r_xprt->rx_buf); |
241 | rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); | 245 | rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); |
242 | if (rc) | ||
243 | dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", | ||
244 | __func__, rc); | ||
245 | rpcrdma_ia_close(&r_xprt->rx_ia); | 246 | rpcrdma_ia_close(&r_xprt->rx_ia); |
246 | 247 | ||
247 | xprt_rdma_free_addresses(xprt); | 248 | xprt_rdma_free_addresses(xprt); |
@@ -289,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args) | |||
289 | 290 | ||
290 | /* 60 second timeout, no retries */ | 291 | /* 60 second timeout, no retries */ |
291 | xprt->timeout = &xprt_rdma_default_timeout; | 292 | xprt->timeout = &xprt_rdma_default_timeout; |
292 | xprt->bind_timeout = (60U * HZ); | 293 | xprt->bind_timeout = RPCRDMA_BIND_TO; |
293 | xprt->reestablish_timeout = (5U * HZ); | 294 | xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; |
294 | xprt->idle_timeout = (5U * 60 * HZ); | 295 | xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; |
295 | 296 | ||
296 | xprt->resvport = 0; /* privileged port not needed */ | 297 | xprt->resvport = 0; /* privileged port not needed */ |
297 | xprt->tsh_size = 0; /* RPC-RDMA handles framing */ | 298 | xprt->tsh_size = 0; /* RPC-RDMA handles framing */ |
@@ -391,7 +392,7 @@ out4: | |||
391 | xprt_rdma_free_addresses(xprt); | 392 | xprt_rdma_free_addresses(xprt); |
392 | rc = -EINVAL; | 393 | rc = -EINVAL; |
393 | out3: | 394 | out3: |
394 | (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); | 395 | rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); |
395 | out2: | 396 | out2: |
396 | rpcrdma_ia_close(&new_xprt->rx_ia); | 397 | rpcrdma_ia_close(&new_xprt->rx_ia); |
397 | out1: | 398 | out1: |
@@ -436,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) | |||
436 | schedule_delayed_work(&r_xprt->rdma_connect, | 437 | schedule_delayed_work(&r_xprt->rdma_connect, |
437 | xprt->reestablish_timeout); | 438 | xprt->reestablish_timeout); |
438 | xprt->reestablish_timeout <<= 1; | 439 | xprt->reestablish_timeout <<= 1; |
439 | if (xprt->reestablish_timeout > (30 * HZ)) | 440 | if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) |
440 | xprt->reestablish_timeout = (30 * HZ); | 441 | xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; |
441 | else if (xprt->reestablish_timeout < (5 * HZ)) | 442 | else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) |
442 | xprt->reestablish_timeout = (5 * HZ); | 443 | xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; |
443 | } else { | 444 | } else { |
444 | schedule_delayed_work(&r_xprt->rdma_connect, 0); | 445 | schedule_delayed_work(&r_xprt->rdma_connect, 0); |
445 | if (!RPC_IS_ASYNC(task)) | 446 | if (!RPC_IS_ASYNC(task)) |
@@ -447,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) | |||
447 | } | 448 | } |
448 | } | 449 | } |
449 | 450 | ||
450 | static int | ||
451 | xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) | ||
452 | { | ||
453 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); | ||
454 | int credits = atomic_read(&r_xprt->rx_buf.rb_credits); | ||
455 | |||
456 | /* == RPC_CWNDSCALE @ init, but *after* setup */ | ||
457 | if (r_xprt->rx_buf.rb_cwndscale == 0UL) { | ||
458 | r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; | ||
459 | dprintk("RPC: %s: cwndscale %lu\n", __func__, | ||
460 | r_xprt->rx_buf.rb_cwndscale); | ||
461 | BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); | ||
462 | } | ||
463 | xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; | ||
464 | return xprt_reserve_xprt_cong(xprt, task); | ||
465 | } | ||
466 | |||
467 | /* | 451 | /* |
468 | * The RDMA allocate/free functions need the task structure as a place | 452 | * The RDMA allocate/free functions need the task structure as a place |
469 | * to hide the struct rpcrdma_req, which is necessary for the actual send/recv | 453 | * to hide the struct rpcrdma_req, which is necessary for the actual send/recv |
@@ -479,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) | |||
479 | struct rpcrdma_req *req, *nreq; | 463 | struct rpcrdma_req *req, *nreq; |
480 | 464 | ||
481 | req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); | 465 | req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); |
482 | BUG_ON(NULL == req); | 466 | if (req == NULL) |
467 | return NULL; | ||
483 | 468 | ||
484 | if (size > req->rl_size) { | 469 | if (size > req->rl_size) { |
485 | dprintk("RPC: %s: size %zd too large for buffer[%zd]: " | 470 | dprintk("RPC: %s: size %zd too large for buffer[%zd]: " |
@@ -503,18 +488,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) | |||
503 | * If the allocation or registration fails, the RPC framework | 488 | * If the allocation or registration fails, the RPC framework |
504 | * will (doggedly) retry. | 489 | * will (doggedly) retry. |
505 | */ | 490 | */ |
506 | if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == | ||
507 | RPCRDMA_BOUNCEBUFFERS) { | ||
508 | /* forced to "pure inline" */ | ||
509 | dprintk("RPC: %s: too much data (%zd) for inline " | ||
510 | "(r/w max %d/%d)\n", __func__, size, | ||
511 | rpcx_to_rdmad(xprt).inline_rsize, | ||
512 | rpcx_to_rdmad(xprt).inline_wsize); | ||
513 | size = req->rl_size; | ||
514 | rpc_exit(task, -EIO); /* fail the operation */ | ||
515 | rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; | ||
516 | goto out; | ||
517 | } | ||
518 | if (task->tk_flags & RPC_TASK_SWAPPER) | 491 | if (task->tk_flags & RPC_TASK_SWAPPER) |
519 | nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); | 492 | nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); |
520 | else | 493 | else |
@@ -543,7 +516,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) | |||
543 | req = nreq; | 516 | req = nreq; |
544 | } | 517 | } |
545 | dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); | 518 | dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); |
546 | out: | ||
547 | req->rl_connect_cookie = 0; /* our reserved value */ | 519 | req->rl_connect_cookie = 0; /* our reserved value */ |
548 | return req->rl_xdr_buf; | 520 | return req->rl_xdr_buf; |
549 | 521 | ||
@@ -579,9 +551,7 @@ xprt_rdma_free(void *buffer) | |||
579 | __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); | 551 | __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); |
580 | 552 | ||
581 | /* | 553 | /* |
582 | * Finish the deregistration. When using mw bind, this was | 554 | * Finish the deregistration. The process is considered |
583 | * begun in rpcrdma_reply_handler(). In all other modes, we | ||
584 | * do it here, in thread context. The process is considered | ||
585 | * complete when the rr_func vector becomes NULL - this | 555 | * complete when the rr_func vector becomes NULL - this |
586 | * was put in place during rpcrdma_reply_handler() - the wait | 556 | * was put in place during rpcrdma_reply_handler() - the wait |
587 | * call below will not block if the dereg is "done". If | 557 | * call below will not block if the dereg is "done". If |
@@ -590,12 +560,7 @@ xprt_rdma_free(void *buffer) | |||
590 | for (i = 0; req->rl_nchunks;) { | 560 | for (i = 0; req->rl_nchunks;) { |
591 | --req->rl_nchunks; | 561 | --req->rl_nchunks; |
592 | i += rpcrdma_deregister_external( | 562 | i += rpcrdma_deregister_external( |
593 | &req->rl_segments[i], r_xprt, NULL); | 563 | &req->rl_segments[i], r_xprt); |
594 | } | ||
595 | |||
596 | if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { | ||
597 | rep->rr_func = NULL; /* abandon the callback */ | ||
598 | req->rl_reply = NULL; | ||
599 | } | 564 | } |
600 | 565 | ||
601 | if (req->rl_iov.length == 0) { /* see allocate above */ | 566 | if (req->rl_iov.length == 0) { /* see allocate above */ |
@@ -630,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task) | |||
630 | struct rpc_xprt *xprt = rqst->rq_xprt; | 595 | struct rpc_xprt *xprt = rqst->rq_xprt; |
631 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); | 596 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); |
632 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); | 597 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); |
598 | int rc; | ||
633 | 599 | ||
634 | /* marshal the send itself */ | 600 | if (req->rl_niovs == 0) { |
635 | if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { | 601 | rc = rpcrdma_marshal_req(rqst); |
636 | r_xprt->rx_stats.failed_marshal_count++; | 602 | if (rc < 0) |
637 | dprintk("RPC: %s: rpcrdma_marshal_req failed\n", | 603 | goto failed_marshal; |
638 | __func__); | ||
639 | return -EIO; | ||
640 | } | 604 | } |
641 | 605 | ||
642 | if (req->rl_reply == NULL) /* e.g. reconnection */ | 606 | if (req->rl_reply == NULL) /* e.g. reconnection */ |
@@ -660,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task) | |||
660 | rqst->rq_bytes_sent = 0; | 624 | rqst->rq_bytes_sent = 0; |
661 | return 0; | 625 | return 0; |
662 | 626 | ||
627 | failed_marshal: | ||
628 | r_xprt->rx_stats.failed_marshal_count++; | ||
629 | dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", | ||
630 | __func__, rc); | ||
631 | if (rc == -EIO) | ||
632 | return -EIO; | ||
663 | drop_connection: | 633 | drop_connection: |
664 | xprt_disconnect_done(xprt); | 634 | xprt_disconnect_done(xprt); |
665 | return -ENOTCONN; /* implies disconnect */ | 635 | return -ENOTCONN; /* implies disconnect */ |
@@ -705,7 +675,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) | |||
705 | */ | 675 | */ |
706 | 676 | ||
707 | static struct rpc_xprt_ops xprt_rdma_procs = { | 677 | static struct rpc_xprt_ops xprt_rdma_procs = { |
708 | .reserve_xprt = xprt_rdma_reserve_xprt, | 678 | .reserve_xprt = xprt_reserve_xprt_cong, |
709 | .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ | 679 | .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ |
710 | .alloc_slot = xprt_alloc_slot, | 680 | .alloc_slot = xprt_alloc_slot, |
711 | .release_request = xprt_release_rqst_cong, /* ditto */ | 681 | .release_request = xprt_release_rqst_cong, /* ditto */ |
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 93726560eaa8..13dbd1c389ff 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c | |||
@@ -48,8 +48,8 @@ | |||
48 | */ | 48 | */ |
49 | 49 | ||
50 | #include <linux/interrupt.h> | 50 | #include <linux/interrupt.h> |
51 | #include <linux/pci.h> /* for Tavor hack below */ | ||
52 | #include <linux/slab.h> | 51 | #include <linux/slab.h> |
52 | #include <asm/bitops.h> | ||
53 | 53 | ||
54 | #include "xprt_rdma.h" | 54 | #include "xprt_rdma.h" |
55 | 55 | ||
@@ -142,98 +142,139 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) | |||
142 | } | 142 | } |
143 | } | 143 | } |
144 | 144 | ||
145 | static inline | 145 | static void |
146 | void rpcrdma_event_process(struct ib_wc *wc) | 146 | rpcrdma_sendcq_process_wc(struct ib_wc *wc) |
147 | { | 147 | { |
148 | struct rpcrdma_mw *frmr; | 148 | struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; |
149 | struct rpcrdma_rep *rep = | ||
150 | (struct rpcrdma_rep *)(unsigned long) wc->wr_id; | ||
151 | 149 | ||
152 | dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", | 150 | dprintk("RPC: %s: frmr %p status %X opcode %d\n", |
153 | __func__, rep, wc->status, wc->opcode, wc->byte_len); | 151 | __func__, frmr, wc->status, wc->opcode); |
154 | 152 | ||
155 | if (!rep) /* send or bind completion that we don't care about */ | 153 | if (wc->wr_id == 0ULL) |
156 | return; | 154 | return; |
157 | 155 | if (wc->status != IB_WC_SUCCESS) | |
158 | if (IB_WC_SUCCESS != wc->status) { | ||
159 | dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", | ||
160 | __func__, wc->opcode, wc->status); | ||
161 | rep->rr_len = ~0U; | ||
162 | if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) | ||
163 | rpcrdma_schedule_tasklet(rep); | ||
164 | return; | 156 | return; |
165 | } | ||
166 | 157 | ||
167 | switch (wc->opcode) { | 158 | if (wc->opcode == IB_WC_FAST_REG_MR) |
168 | case IB_WC_FAST_REG_MR: | ||
169 | frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; | ||
170 | frmr->r.frmr.state = FRMR_IS_VALID; | 159 | frmr->r.frmr.state = FRMR_IS_VALID; |
171 | break; | 160 | else if (wc->opcode == IB_WC_LOCAL_INV) |
172 | case IB_WC_LOCAL_INV: | ||
173 | frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; | ||
174 | frmr->r.frmr.state = FRMR_IS_INVALID; | 161 | frmr->r.frmr.state = FRMR_IS_INVALID; |
175 | break; | ||
176 | case IB_WC_RECV: | ||
177 | rep->rr_len = wc->byte_len; | ||
178 | ib_dma_sync_single_for_cpu( | ||
179 | rdmab_to_ia(rep->rr_buffer)->ri_id->device, | ||
180 | rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); | ||
181 | /* Keep (only) the most recent credits, after check validity */ | ||
182 | if (rep->rr_len >= 16) { | ||
183 | struct rpcrdma_msg *p = | ||
184 | (struct rpcrdma_msg *) rep->rr_base; | ||
185 | unsigned int credits = ntohl(p->rm_credit); | ||
186 | if (credits == 0) { | ||
187 | dprintk("RPC: %s: server" | ||
188 | " dropped credits to 0!\n", __func__); | ||
189 | /* don't deadlock */ | ||
190 | credits = 1; | ||
191 | } else if (credits > rep->rr_buffer->rb_max_requests) { | ||
192 | dprintk("RPC: %s: server" | ||
193 | " over-crediting: %d (%d)\n", | ||
194 | __func__, credits, | ||
195 | rep->rr_buffer->rb_max_requests); | ||
196 | credits = rep->rr_buffer->rb_max_requests; | ||
197 | } | ||
198 | atomic_set(&rep->rr_buffer->rb_credits, credits); | ||
199 | } | ||
200 | /* fall through */ | ||
201 | case IB_WC_BIND_MW: | ||
202 | rpcrdma_schedule_tasklet(rep); | ||
203 | break; | ||
204 | default: | ||
205 | dprintk("RPC: %s: unexpected WC event %X\n", | ||
206 | __func__, wc->opcode); | ||
207 | break; | ||
208 | } | ||
209 | } | 162 | } |
210 | 163 | ||
211 | static inline int | 164 | static int |
212 | rpcrdma_cq_poll(struct ib_cq *cq) | 165 | rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) |
213 | { | 166 | { |
214 | struct ib_wc wc; | 167 | struct ib_wc *wcs; |
215 | int rc; | 168 | int budget, count, rc; |
216 | 169 | ||
217 | for (;;) { | 170 | budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; |
218 | rc = ib_poll_cq(cq, 1, &wc); | 171 | do { |
219 | if (rc < 0) { | 172 | wcs = ep->rep_send_wcs; |
220 | dprintk("RPC: %s: ib_poll_cq failed %i\n", | 173 | |
221 | __func__, rc); | 174 | rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); |
175 | if (rc <= 0) | ||
222 | return rc; | 176 | return rc; |
223 | } | ||
224 | if (rc == 0) | ||
225 | break; | ||
226 | 177 | ||
227 | rpcrdma_event_process(&wc); | 178 | count = rc; |
179 | while (count-- > 0) | ||
180 | rpcrdma_sendcq_process_wc(wcs++); | ||
181 | } while (rc == RPCRDMA_POLLSIZE && --budget); | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * Handle send, fast_reg_mr, and local_inv completions. | ||
187 | * | ||
188 | * Send events are typically suppressed and thus do not result | ||
189 | * in an upcall. Occasionally one is signaled, however. This | ||
190 | * prevents the provider's completion queue from wrapping and | ||
191 | * losing a completion. | ||
192 | */ | ||
193 | static void | ||
194 | rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) | ||
195 | { | ||
196 | struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; | ||
197 | int rc; | ||
198 | |||
199 | rc = rpcrdma_sendcq_poll(cq, ep); | ||
200 | if (rc) { | ||
201 | dprintk("RPC: %s: ib_poll_cq failed: %i\n", | ||
202 | __func__, rc); | ||
203 | return; | ||
228 | } | 204 | } |
229 | 205 | ||
206 | rc = ib_req_notify_cq(cq, | ||
207 | IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); | ||
208 | if (rc == 0) | ||
209 | return; | ||
210 | if (rc < 0) { | ||
211 | dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", | ||
212 | __func__, rc); | ||
213 | return; | ||
214 | } | ||
215 | |||
216 | rpcrdma_sendcq_poll(cq, ep); | ||
217 | } | ||
218 | |||
219 | static void | ||
220 | rpcrdma_recvcq_process_wc(struct ib_wc *wc) | ||
221 | { | ||
222 | struct rpcrdma_rep *rep = | ||
223 | (struct rpcrdma_rep *)(unsigned long)wc->wr_id; | ||
224 | |||
225 | dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", | ||
226 | __func__, rep, wc->status, wc->opcode, wc->byte_len); | ||
227 | |||
228 | if (wc->status != IB_WC_SUCCESS) { | ||
229 | rep->rr_len = ~0U; | ||
230 | goto out_schedule; | ||
231 | } | ||
232 | if (wc->opcode != IB_WC_RECV) | ||
233 | return; | ||
234 | |||
235 | rep->rr_len = wc->byte_len; | ||
236 | ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, | ||
237 | rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); | ||
238 | |||
239 | if (rep->rr_len >= 16) { | ||
240 | struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base; | ||
241 | unsigned int credits = ntohl(p->rm_credit); | ||
242 | |||
243 | if (credits == 0) | ||
244 | credits = 1; /* don't deadlock */ | ||
245 | else if (credits > rep->rr_buffer->rb_max_requests) | ||
246 | credits = rep->rr_buffer->rb_max_requests; | ||
247 | atomic_set(&rep->rr_buffer->rb_credits, credits); | ||
248 | } | ||
249 | |||
250 | out_schedule: | ||
251 | rpcrdma_schedule_tasklet(rep); | ||
252 | } | ||
253 | |||
254 | static int | ||
255 | rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) | ||
256 | { | ||
257 | struct ib_wc *wcs; | ||
258 | int budget, count, rc; | ||
259 | |||
260 | budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; | ||
261 | do { | ||
262 | wcs = ep->rep_recv_wcs; | ||
263 | |||
264 | rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); | ||
265 | if (rc <= 0) | ||
266 | return rc; | ||
267 | |||
268 | count = rc; | ||
269 | while (count-- > 0) | ||
270 | rpcrdma_recvcq_process_wc(wcs++); | ||
271 | } while (rc == RPCRDMA_POLLSIZE && --budget); | ||
230 | return 0; | 272 | return 0; |
231 | } | 273 | } |
232 | 274 | ||
233 | /* | 275 | /* |
234 | * rpcrdma_cq_event_upcall | 276 | * Handle receive completions. |
235 | * | 277 | * |
236 | * This upcall handles recv, send, bind and unbind events. | ||
237 | * It is reentrant but processes single events in order to maintain | 278 | * It is reentrant but processes single events in order to maintain |
238 | * ordering of receives to keep server credits. | 279 | * ordering of receives to keep server credits. |
239 | * | 280 | * |
@@ -242,26 +283,31 @@ rpcrdma_cq_poll(struct ib_cq *cq) | |||
242 | * connection shutdown. That is, the structures required for | 283 | * connection shutdown. That is, the structures required for |
243 | * the completion of the reply handler must remain intact until | 284 | * the completion of the reply handler must remain intact until |
244 | * all memory has been reclaimed. | 285 | * all memory has been reclaimed. |
245 | * | ||
246 | * Note that send events are suppressed and do not result in an upcall. | ||
247 | */ | 286 | */ |
248 | static void | 287 | static void |
249 | rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) | 288 | rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) |
250 | { | 289 | { |
290 | struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; | ||
251 | int rc; | 291 | int rc; |
252 | 292 | ||
253 | rc = rpcrdma_cq_poll(cq); | 293 | rc = rpcrdma_recvcq_poll(cq, ep); |
254 | if (rc) | 294 | if (rc) { |
295 | dprintk("RPC: %s: ib_poll_cq failed: %i\n", | ||
296 | __func__, rc); | ||
255 | return; | 297 | return; |
298 | } | ||
256 | 299 | ||
257 | rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); | 300 | rc = ib_req_notify_cq(cq, |
258 | if (rc) { | 301 | IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); |
259 | dprintk("RPC: %s: ib_req_notify_cq failed %i\n", | 302 | if (rc == 0) |
303 | return; | ||
304 | if (rc < 0) { | ||
305 | dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", | ||
260 | __func__, rc); | 306 | __func__, rc); |
261 | return; | 307 | return; |
262 | } | 308 | } |
263 | 309 | ||
264 | rpcrdma_cq_poll(cq); | 310 | rpcrdma_recvcq_poll(cq, ep); |
265 | } | 311 | } |
266 | 312 | ||
267 | #ifdef RPC_DEBUG | 313 | #ifdef RPC_DEBUG |
@@ -493,54 +539,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) | |||
493 | ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; | 539 | ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; |
494 | } | 540 | } |
495 | 541 | ||
496 | switch (memreg) { | 542 | if (memreg == RPCRDMA_FRMR) { |
497 | case RPCRDMA_MEMWINDOWS: | ||
498 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
499 | if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { | ||
500 | dprintk("RPC: %s: MEMWINDOWS registration " | ||
501 | "specified but not supported by adapter, " | ||
502 | "using slower RPCRDMA_REGISTER\n", | ||
503 | __func__); | ||
504 | memreg = RPCRDMA_REGISTER; | ||
505 | } | ||
506 | break; | ||
507 | case RPCRDMA_MTHCAFMR: | ||
508 | if (!ia->ri_id->device->alloc_fmr) { | ||
509 | #if RPCRDMA_PERSISTENT_REGISTRATION | ||
510 | dprintk("RPC: %s: MTHCAFMR registration " | ||
511 | "specified but not supported by adapter, " | ||
512 | "using riskier RPCRDMA_ALLPHYSICAL\n", | ||
513 | __func__); | ||
514 | memreg = RPCRDMA_ALLPHYSICAL; | ||
515 | #else | ||
516 | dprintk("RPC: %s: MTHCAFMR registration " | ||
517 | "specified but not supported by adapter, " | ||
518 | "using slower RPCRDMA_REGISTER\n", | ||
519 | __func__); | ||
520 | memreg = RPCRDMA_REGISTER; | ||
521 | #endif | ||
522 | } | ||
523 | break; | ||
524 | case RPCRDMA_FRMR: | ||
525 | /* Requires both frmr reg and local dma lkey */ | 543 | /* Requires both frmr reg and local dma lkey */ |
526 | if ((devattr.device_cap_flags & | 544 | if ((devattr.device_cap_flags & |
527 | (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != | 545 | (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != |
528 | (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { | 546 | (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { |
529 | #if RPCRDMA_PERSISTENT_REGISTRATION | ||
530 | dprintk("RPC: %s: FRMR registration " | 547 | dprintk("RPC: %s: FRMR registration " |
531 | "specified but not supported by adapter, " | 548 | "not supported by HCA\n", __func__); |
532 | "using riskier RPCRDMA_ALLPHYSICAL\n", | 549 | memreg = RPCRDMA_MTHCAFMR; |
533 | __func__); | 550 | } else { |
551 | /* Mind the ia limit on FRMR page list depth */ | ||
552 | ia->ri_max_frmr_depth = min_t(unsigned int, | ||
553 | RPCRDMA_MAX_DATA_SEGS, | ||
554 | devattr.max_fast_reg_page_list_len); | ||
555 | } | ||
556 | } | ||
557 | if (memreg == RPCRDMA_MTHCAFMR) { | ||
558 | if (!ia->ri_id->device->alloc_fmr) { | ||
559 | dprintk("RPC: %s: MTHCAFMR registration " | ||
560 | "not supported by HCA\n", __func__); | ||
561 | #if RPCRDMA_PERSISTENT_REGISTRATION | ||
534 | memreg = RPCRDMA_ALLPHYSICAL; | 562 | memreg = RPCRDMA_ALLPHYSICAL; |
535 | #else | 563 | #else |
536 | dprintk("RPC: %s: FRMR registration " | 564 | rc = -ENOMEM; |
537 | "specified but not supported by adapter, " | 565 | goto out2; |
538 | "using slower RPCRDMA_REGISTER\n", | ||
539 | __func__); | ||
540 | memreg = RPCRDMA_REGISTER; | ||
541 | #endif | 566 | #endif |
542 | } | 567 | } |
543 | break; | ||
544 | } | 568 | } |
545 | 569 | ||
546 | /* | 570 | /* |
@@ -552,8 +576,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) | |||
552 | * adapter. | 576 | * adapter. |
553 | */ | 577 | */ |
554 | switch (memreg) { | 578 | switch (memreg) { |
555 | case RPCRDMA_BOUNCEBUFFERS: | ||
556 | case RPCRDMA_REGISTER: | ||
557 | case RPCRDMA_FRMR: | 579 | case RPCRDMA_FRMR: |
558 | break; | 580 | break; |
559 | #if RPCRDMA_PERSISTENT_REGISTRATION | 581 | #if RPCRDMA_PERSISTENT_REGISTRATION |
@@ -563,30 +585,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) | |||
563 | IB_ACCESS_REMOTE_READ; | 585 | IB_ACCESS_REMOTE_READ; |
564 | goto register_setup; | 586 | goto register_setup; |
565 | #endif | 587 | #endif |
566 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
567 | case RPCRDMA_MEMWINDOWS: | ||
568 | mem_priv = IB_ACCESS_LOCAL_WRITE | | ||
569 | IB_ACCESS_MW_BIND; | ||
570 | goto register_setup; | ||
571 | case RPCRDMA_MTHCAFMR: | 588 | case RPCRDMA_MTHCAFMR: |
572 | if (ia->ri_have_dma_lkey) | 589 | if (ia->ri_have_dma_lkey) |
573 | break; | 590 | break; |
574 | mem_priv = IB_ACCESS_LOCAL_WRITE; | 591 | mem_priv = IB_ACCESS_LOCAL_WRITE; |
592 | #if RPCRDMA_PERSISTENT_REGISTRATION | ||
575 | register_setup: | 593 | register_setup: |
594 | #endif | ||
576 | ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); | 595 | ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); |
577 | if (IS_ERR(ia->ri_bind_mem)) { | 596 | if (IS_ERR(ia->ri_bind_mem)) { |
578 | printk(KERN_ALERT "%s: ib_get_dma_mr for " | 597 | printk(KERN_ALERT "%s: ib_get_dma_mr for " |
579 | "phys register failed with %lX\n\t" | 598 | "phys register failed with %lX\n", |
580 | "Will continue with degraded performance\n", | ||
581 | __func__, PTR_ERR(ia->ri_bind_mem)); | 599 | __func__, PTR_ERR(ia->ri_bind_mem)); |
582 | memreg = RPCRDMA_REGISTER; | 600 | rc = -ENOMEM; |
583 | ia->ri_bind_mem = NULL; | 601 | goto out2; |
584 | } | 602 | } |
585 | break; | 603 | break; |
586 | default: | 604 | default: |
587 | printk(KERN_ERR "%s: invalid memory registration mode %d\n", | 605 | printk(KERN_ERR "RPC: Unsupported memory " |
588 | __func__, memreg); | 606 | "registration mode: %d\n", memreg); |
589 | rc = -EINVAL; | 607 | rc = -ENOMEM; |
590 | goto out2; | 608 | goto out2; |
591 | } | 609 | } |
592 | dprintk("RPC: %s: memory registration strategy is %d\n", | 610 | dprintk("RPC: %s: memory registration strategy is %d\n", |
@@ -640,6 +658,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, | |||
640 | struct rpcrdma_create_data_internal *cdata) | 658 | struct rpcrdma_create_data_internal *cdata) |
641 | { | 659 | { |
642 | struct ib_device_attr devattr; | 660 | struct ib_device_attr devattr; |
661 | struct ib_cq *sendcq, *recvcq; | ||
643 | int rc, err; | 662 | int rc, err; |
644 | 663 | ||
645 | rc = ib_query_device(ia->ri_id->device, &devattr); | 664 | rc = ib_query_device(ia->ri_id->device, &devattr); |
@@ -659,32 +678,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, | |||
659 | ep->rep_attr.srq = NULL; | 678 | ep->rep_attr.srq = NULL; |
660 | ep->rep_attr.cap.max_send_wr = cdata->max_requests; | 679 | ep->rep_attr.cap.max_send_wr = cdata->max_requests; |
661 | switch (ia->ri_memreg_strategy) { | 680 | switch (ia->ri_memreg_strategy) { |
662 | case RPCRDMA_FRMR: | 681 | case RPCRDMA_FRMR: { |
682 | int depth = 7; | ||
683 | |||
663 | /* Add room for frmr register and invalidate WRs. | 684 | /* Add room for frmr register and invalidate WRs. |
664 | * 1. FRMR reg WR for head | 685 | * 1. FRMR reg WR for head |
665 | * 2. FRMR invalidate WR for head | 686 | * 2. FRMR invalidate WR for head |
666 | * 3. FRMR reg WR for pagelist | 687 | * 3. N FRMR reg WRs for pagelist |
667 | * 4. FRMR invalidate WR for pagelist | 688 | * 4. N FRMR invalidate WRs for pagelist |
668 | * 5. FRMR reg WR for tail | 689 | * 5. FRMR reg WR for tail |
669 | * 6. FRMR invalidate WR for tail | 690 | * 6. FRMR invalidate WR for tail |
670 | * 7. The RDMA_SEND WR | 691 | * 7. The RDMA_SEND WR |
671 | */ | 692 | */ |
672 | ep->rep_attr.cap.max_send_wr *= 7; | 693 | |
694 | /* Calculate N if the device max FRMR depth is smaller than | ||
695 | * RPCRDMA_MAX_DATA_SEGS. | ||
696 | */ | ||
697 | if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { | ||
698 | int delta = RPCRDMA_MAX_DATA_SEGS - | ||
699 | ia->ri_max_frmr_depth; | ||
700 | |||
701 | do { | ||
702 | depth += 2; /* FRMR reg + invalidate */ | ||
703 | delta -= ia->ri_max_frmr_depth; | ||
704 | } while (delta > 0); | ||
705 | |||
706 | } | ||
707 | ep->rep_attr.cap.max_send_wr *= depth; | ||
673 | if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { | 708 | if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { |
674 | cdata->max_requests = devattr.max_qp_wr / 7; | 709 | cdata->max_requests = devattr.max_qp_wr / depth; |
675 | if (!cdata->max_requests) | 710 | if (!cdata->max_requests) |
676 | return -EINVAL; | 711 | return -EINVAL; |
677 | ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; | 712 | ep->rep_attr.cap.max_send_wr = cdata->max_requests * |
713 | depth; | ||
678 | } | 714 | } |
679 | break; | 715 | break; |
680 | case RPCRDMA_MEMWINDOWS_ASYNC: | 716 | } |
681 | case RPCRDMA_MEMWINDOWS: | ||
682 | /* Add room for mw_binds+unbinds - overkill! */ | ||
683 | ep->rep_attr.cap.max_send_wr++; | ||
684 | ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); | ||
685 | if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) | ||
686 | return -EINVAL; | ||
687 | break; | ||
688 | default: | 717 | default: |
689 | break; | 718 | break; |
690 | } | 719 | } |
@@ -705,46 +734,51 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, | |||
705 | ep->rep_attr.cap.max_recv_sge); | 734 | ep->rep_attr.cap.max_recv_sge); |
706 | 735 | ||
707 | /* set trigger for requesting send completion */ | 736 | /* set trigger for requesting send completion */ |
708 | ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; | 737 | ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; |
709 | switch (ia->ri_memreg_strategy) { | ||
710 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
711 | case RPCRDMA_MEMWINDOWS: | ||
712 | ep->rep_cqinit -= RPCRDMA_MAX_SEGS; | ||
713 | break; | ||
714 | default: | ||
715 | break; | ||
716 | } | ||
717 | if (ep->rep_cqinit <= 2) | 738 | if (ep->rep_cqinit <= 2) |
718 | ep->rep_cqinit = 0; | 739 | ep->rep_cqinit = 0; |
719 | INIT_CQCOUNT(ep); | 740 | INIT_CQCOUNT(ep); |
720 | ep->rep_ia = ia; | 741 | ep->rep_ia = ia; |
721 | init_waitqueue_head(&ep->rep_connect_wait); | 742 | init_waitqueue_head(&ep->rep_connect_wait); |
743 | INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); | ||
722 | 744 | ||
723 | /* | 745 | sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, |
724 | * Create a single cq for receive dto and mw_bind (only ever | 746 | rpcrdma_cq_async_error_upcall, ep, |
725 | * care about unbind, really). Send completions are suppressed. | ||
726 | * Use single threaded tasklet upcalls to maintain ordering. | ||
727 | */ | ||
728 | ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, | ||
729 | rpcrdma_cq_async_error_upcall, NULL, | ||
730 | ep->rep_attr.cap.max_recv_wr + | ||
731 | ep->rep_attr.cap.max_send_wr + 1, 0); | 747 | ep->rep_attr.cap.max_send_wr + 1, 0); |
732 | if (IS_ERR(ep->rep_cq)) { | 748 | if (IS_ERR(sendcq)) { |
733 | rc = PTR_ERR(ep->rep_cq); | 749 | rc = PTR_ERR(sendcq); |
734 | dprintk("RPC: %s: ib_create_cq failed: %i\n", | 750 | dprintk("RPC: %s: failed to create send CQ: %i\n", |
735 | __func__, rc); | 751 | __func__, rc); |
736 | goto out1; | 752 | goto out1; |
737 | } | 753 | } |
738 | 754 | ||
739 | rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); | 755 | rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP); |
756 | if (rc) { | ||
757 | dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", | ||
758 | __func__, rc); | ||
759 | goto out2; | ||
760 | } | ||
761 | |||
762 | recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, | ||
763 | rpcrdma_cq_async_error_upcall, ep, | ||
764 | ep->rep_attr.cap.max_recv_wr + 1, 0); | ||
765 | if (IS_ERR(recvcq)) { | ||
766 | rc = PTR_ERR(recvcq); | ||
767 | dprintk("RPC: %s: failed to create recv CQ: %i\n", | ||
768 | __func__, rc); | ||
769 | goto out2; | ||
770 | } | ||
771 | |||
772 | rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); | ||
740 | if (rc) { | 773 | if (rc) { |
741 | dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", | 774 | dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", |
742 | __func__, rc); | 775 | __func__, rc); |
776 | ib_destroy_cq(recvcq); | ||
743 | goto out2; | 777 | goto out2; |
744 | } | 778 | } |
745 | 779 | ||
746 | ep->rep_attr.send_cq = ep->rep_cq; | 780 | ep->rep_attr.send_cq = sendcq; |
747 | ep->rep_attr.recv_cq = ep->rep_cq; | 781 | ep->rep_attr.recv_cq = recvcq; |
748 | 782 | ||
749 | /* Initialize cma parameters */ | 783 | /* Initialize cma parameters */ |
750 | 784 | ||
@@ -754,9 +788,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, | |||
754 | 788 | ||
755 | /* Client offers RDMA Read but does not initiate */ | 789 | /* Client offers RDMA Read but does not initiate */ |
756 | ep->rep_remote_cma.initiator_depth = 0; | 790 | ep->rep_remote_cma.initiator_depth = 0; |
757 | if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) | 791 | if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ |
758 | ep->rep_remote_cma.responder_resources = 0; | ||
759 | else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ | ||
760 | ep->rep_remote_cma.responder_resources = 32; | 792 | ep->rep_remote_cma.responder_resources = 32; |
761 | else | 793 | else |
762 | ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; | 794 | ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; |
@@ -768,7 +800,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, | |||
768 | return 0; | 800 | return 0; |
769 | 801 | ||
770 | out2: | 802 | out2: |
771 | err = ib_destroy_cq(ep->rep_cq); | 803 | err = ib_destroy_cq(sendcq); |
772 | if (err) | 804 | if (err) |
773 | dprintk("RPC: %s: ib_destroy_cq returned %i\n", | 805 | dprintk("RPC: %s: ib_destroy_cq returned %i\n", |
774 | __func__, err); | 806 | __func__, err); |
@@ -782,11 +814,8 @@ out1: | |||
782 | * Disconnect and destroy endpoint. After this, the only | 814 | * Disconnect and destroy endpoint. After this, the only |
783 | * valid operations on the ep are to free it (if dynamically | 815 | * valid operations on the ep are to free it (if dynamically |
784 | * allocated) or re-create it. | 816 | * allocated) or re-create it. |
785 | * | ||
786 | * The caller's error handling must be sure to not leak the endpoint | ||
787 | * if this function fails. | ||
788 | */ | 817 | */ |
789 | int | 818 | void |
790 | rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | 819 | rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) |
791 | { | 820 | { |
792 | int rc; | 821 | int rc; |
@@ -794,6 +823,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
794 | dprintk("RPC: %s: entering, connected is %d\n", | 823 | dprintk("RPC: %s: entering, connected is %d\n", |
795 | __func__, ep->rep_connected); | 824 | __func__, ep->rep_connected); |
796 | 825 | ||
826 | cancel_delayed_work_sync(&ep->rep_connect_worker); | ||
827 | |||
797 | if (ia->ri_id->qp) { | 828 | if (ia->ri_id->qp) { |
798 | rc = rpcrdma_ep_disconnect(ep, ia); | 829 | rc = rpcrdma_ep_disconnect(ep, ia); |
799 | if (rc) | 830 | if (rc) |
@@ -809,13 +840,17 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
809 | ep->rep_pad_mr = NULL; | 840 | ep->rep_pad_mr = NULL; |
810 | } | 841 | } |
811 | 842 | ||
812 | rpcrdma_clean_cq(ep->rep_cq); | 843 | rpcrdma_clean_cq(ep->rep_attr.recv_cq); |
813 | rc = ib_destroy_cq(ep->rep_cq); | 844 | rc = ib_destroy_cq(ep->rep_attr.recv_cq); |
814 | if (rc) | 845 | if (rc) |
815 | dprintk("RPC: %s: ib_destroy_cq returned %i\n", | 846 | dprintk("RPC: %s: ib_destroy_cq returned %i\n", |
816 | __func__, rc); | 847 | __func__, rc); |
817 | 848 | ||
818 | return rc; | 849 | rpcrdma_clean_cq(ep->rep_attr.send_cq); |
850 | rc = ib_destroy_cq(ep->rep_attr.send_cq); | ||
851 | if (rc) | ||
852 | dprintk("RPC: %s: ib_destroy_cq returned %i\n", | ||
853 | __func__, rc); | ||
819 | } | 854 | } |
820 | 855 | ||
821 | /* | 856 | /* |
@@ -831,17 +866,20 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
831 | if (ep->rep_connected != 0) { | 866 | if (ep->rep_connected != 0) { |
832 | struct rpcrdma_xprt *xprt; | 867 | struct rpcrdma_xprt *xprt; |
833 | retry: | 868 | retry: |
869 | dprintk("RPC: %s: reconnecting...\n", __func__); | ||
834 | rc = rpcrdma_ep_disconnect(ep, ia); | 870 | rc = rpcrdma_ep_disconnect(ep, ia); |
835 | if (rc && rc != -ENOTCONN) | 871 | if (rc && rc != -ENOTCONN) |
836 | dprintk("RPC: %s: rpcrdma_ep_disconnect" | 872 | dprintk("RPC: %s: rpcrdma_ep_disconnect" |
837 | " status %i\n", __func__, rc); | 873 | " status %i\n", __func__, rc); |
838 | rpcrdma_clean_cq(ep->rep_cq); | 874 | |
875 | rpcrdma_clean_cq(ep->rep_attr.recv_cq); | ||
876 | rpcrdma_clean_cq(ep->rep_attr.send_cq); | ||
839 | 877 | ||
840 | xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); | 878 | xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); |
841 | id = rpcrdma_create_id(xprt, ia, | 879 | id = rpcrdma_create_id(xprt, ia, |
842 | (struct sockaddr *)&xprt->rx_data.addr); | 880 | (struct sockaddr *)&xprt->rx_data.addr); |
843 | if (IS_ERR(id)) { | 881 | if (IS_ERR(id)) { |
844 | rc = PTR_ERR(id); | 882 | rc = -EHOSTUNREACH; |
845 | goto out; | 883 | goto out; |
846 | } | 884 | } |
847 | /* TEMP TEMP TEMP - fail if new device: | 885 | /* TEMP TEMP TEMP - fail if new device: |
@@ -855,35 +893,32 @@ retry: | |||
855 | printk("RPC: %s: can't reconnect on " | 893 | printk("RPC: %s: can't reconnect on " |
856 | "different device!\n", __func__); | 894 | "different device!\n", __func__); |
857 | rdma_destroy_id(id); | 895 | rdma_destroy_id(id); |
858 | rc = -ENETDOWN; | 896 | rc = -ENETUNREACH; |
859 | goto out; | 897 | goto out; |
860 | } | 898 | } |
861 | /* END TEMP */ | 899 | /* END TEMP */ |
900 | rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); | ||
901 | if (rc) { | ||
902 | dprintk("RPC: %s: rdma_create_qp failed %i\n", | ||
903 | __func__, rc); | ||
904 | rdma_destroy_id(id); | ||
905 | rc = -ENETUNREACH; | ||
906 | goto out; | ||
907 | } | ||
862 | rdma_destroy_qp(ia->ri_id); | 908 | rdma_destroy_qp(ia->ri_id); |
863 | rdma_destroy_id(ia->ri_id); | 909 | rdma_destroy_id(ia->ri_id); |
864 | ia->ri_id = id; | 910 | ia->ri_id = id; |
911 | } else { | ||
912 | dprintk("RPC: %s: connecting...\n", __func__); | ||
913 | rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); | ||
914 | if (rc) { | ||
915 | dprintk("RPC: %s: rdma_create_qp failed %i\n", | ||
916 | __func__, rc); | ||
917 | /* do not update ep->rep_connected */ | ||
918 | return -ENETUNREACH; | ||
919 | } | ||
865 | } | 920 | } |
866 | 921 | ||
867 | rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); | ||
868 | if (rc) { | ||
869 | dprintk("RPC: %s: rdma_create_qp failed %i\n", | ||
870 | __func__, rc); | ||
871 | goto out; | ||
872 | } | ||
873 | |||
874 | /* XXX Tavor device performs badly with 2K MTU! */ | ||
875 | if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { | ||
876 | struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); | ||
877 | if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && | ||
878 | (pcid->vendor == PCI_VENDOR_ID_MELLANOX || | ||
879 | pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { | ||
880 | struct ib_qp_attr attr = { | ||
881 | .path_mtu = IB_MTU_1024 | ||
882 | }; | ||
883 | rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); | ||
884 | } | ||
885 | } | ||
886 | |||
887 | ep->rep_connected = 0; | 922 | ep->rep_connected = 0; |
888 | 923 | ||
889 | rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); | 924 | rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); |
@@ -944,7 +979,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
944 | { | 979 | { |
945 | int rc; | 980 | int rc; |
946 | 981 | ||
947 | rpcrdma_clean_cq(ep->rep_cq); | 982 | rpcrdma_clean_cq(ep->rep_attr.recv_cq); |
983 | rpcrdma_clean_cq(ep->rep_attr.send_cq); | ||
948 | rc = rdma_disconnect(ia->ri_id); | 984 | rc = rdma_disconnect(ia->ri_id); |
949 | if (!rc) { | 985 | if (!rc) { |
950 | /* returns without wait if not connected */ | 986 | /* returns without wait if not connected */ |
@@ -967,7 +1003,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, | |||
967 | struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) | 1003 | struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) |
968 | { | 1004 | { |
969 | char *p; | 1005 | char *p; |
970 | size_t len; | 1006 | size_t len, rlen, wlen; |
971 | int i, rc; | 1007 | int i, rc; |
972 | struct rpcrdma_mw *r; | 1008 | struct rpcrdma_mw *r; |
973 | 1009 | ||
@@ -997,11 +1033,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, | |||
997 | len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * | 1033 | len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * |
998 | sizeof(struct rpcrdma_mw); | 1034 | sizeof(struct rpcrdma_mw); |
999 | break; | 1035 | break; |
1000 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
1001 | case RPCRDMA_MEMWINDOWS: | ||
1002 | len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * | ||
1003 | sizeof(struct rpcrdma_mw); | ||
1004 | break; | ||
1005 | default: | 1036 | default: |
1006 | break; | 1037 | break; |
1007 | } | 1038 | } |
@@ -1032,32 +1063,29 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, | |||
1032 | } | 1063 | } |
1033 | p += cdata->padding; | 1064 | p += cdata->padding; |
1034 | 1065 | ||
1035 | /* | ||
1036 | * Allocate the fmr's, or mw's for mw_bind chunk registration. | ||
1037 | * We "cycle" the mw's in order to minimize rkey reuse, | ||
1038 | * and also reduce unbind-to-bind collision. | ||
1039 | */ | ||
1040 | INIT_LIST_HEAD(&buf->rb_mws); | 1066 | INIT_LIST_HEAD(&buf->rb_mws); |
1041 | r = (struct rpcrdma_mw *)p; | 1067 | r = (struct rpcrdma_mw *)p; |
1042 | switch (ia->ri_memreg_strategy) { | 1068 | switch (ia->ri_memreg_strategy) { |
1043 | case RPCRDMA_FRMR: | 1069 | case RPCRDMA_FRMR: |
1044 | for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { | 1070 | for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { |
1045 | r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, | 1071 | r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, |
1046 | RPCRDMA_MAX_SEGS); | 1072 | ia->ri_max_frmr_depth); |
1047 | if (IS_ERR(r->r.frmr.fr_mr)) { | 1073 | if (IS_ERR(r->r.frmr.fr_mr)) { |
1048 | rc = PTR_ERR(r->r.frmr.fr_mr); | 1074 | rc = PTR_ERR(r->r.frmr.fr_mr); |
1049 | dprintk("RPC: %s: ib_alloc_fast_reg_mr" | 1075 | dprintk("RPC: %s: ib_alloc_fast_reg_mr" |
1050 | " failed %i\n", __func__, rc); | 1076 | " failed %i\n", __func__, rc); |
1051 | goto out; | 1077 | goto out; |
1052 | } | 1078 | } |
1053 | r->r.frmr.fr_pgl = | 1079 | r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( |
1054 | ib_alloc_fast_reg_page_list(ia->ri_id->device, | 1080 | ia->ri_id->device, |
1055 | RPCRDMA_MAX_SEGS); | 1081 | ia->ri_max_frmr_depth); |
1056 | if (IS_ERR(r->r.frmr.fr_pgl)) { | 1082 | if (IS_ERR(r->r.frmr.fr_pgl)) { |
1057 | rc = PTR_ERR(r->r.frmr.fr_pgl); | 1083 | rc = PTR_ERR(r->r.frmr.fr_pgl); |
1058 | dprintk("RPC: %s: " | 1084 | dprintk("RPC: %s: " |
1059 | "ib_alloc_fast_reg_page_list " | 1085 | "ib_alloc_fast_reg_page_list " |
1060 | "failed %i\n", __func__, rc); | 1086 | "failed %i\n", __func__, rc); |
1087 | |||
1088 | ib_dereg_mr(r->r.frmr.fr_mr); | ||
1061 | goto out; | 1089 | goto out; |
1062 | } | 1090 | } |
1063 | list_add(&r->mw_list, &buf->rb_mws); | 1091 | list_add(&r->mw_list, &buf->rb_mws); |
@@ -1082,21 +1110,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, | |||
1082 | ++r; | 1110 | ++r; |
1083 | } | 1111 | } |
1084 | break; | 1112 | break; |
1085 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
1086 | case RPCRDMA_MEMWINDOWS: | ||
1087 | /* Allocate one extra request's worth, for full cycling */ | ||
1088 | for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { | ||
1089 | r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1); | ||
1090 | if (IS_ERR(r->r.mw)) { | ||
1091 | rc = PTR_ERR(r->r.mw); | ||
1092 | dprintk("RPC: %s: ib_alloc_mw" | ||
1093 | " failed %i\n", __func__, rc); | ||
1094 | goto out; | ||
1095 | } | ||
1096 | list_add(&r->mw_list, &buf->rb_mws); | ||
1097 | ++r; | ||
1098 | } | ||
1099 | break; | ||
1100 | default: | 1113 | default: |
1101 | break; | 1114 | break; |
1102 | } | 1115 | } |
@@ -1105,16 +1118,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, | |||
1105 | * Allocate/init the request/reply buffers. Doing this | 1118 | * Allocate/init the request/reply buffers. Doing this |
1106 | * using kmalloc for now -- one for each buf. | 1119 | * using kmalloc for now -- one for each buf. |
1107 | */ | 1120 | */ |
1121 | wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); | ||
1122 | rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); | ||
1123 | dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", | ||
1124 | __func__, wlen, rlen); | ||
1125 | |||
1108 | for (i = 0; i < buf->rb_max_requests; i++) { | 1126 | for (i = 0; i < buf->rb_max_requests; i++) { |
1109 | struct rpcrdma_req *req; | 1127 | struct rpcrdma_req *req; |
1110 | struct rpcrdma_rep *rep; | 1128 | struct rpcrdma_rep *rep; |
1111 | 1129 | ||
1112 | len = cdata->inline_wsize + sizeof(struct rpcrdma_req); | 1130 | req = kmalloc(wlen, GFP_KERNEL); |
1113 | /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */ | ||
1114 | /* Typical ~2400b, so rounding up saves work later */ | ||
1115 | if (len < 4096) | ||
1116 | len = 4096; | ||
1117 | req = kmalloc(len, GFP_KERNEL); | ||
1118 | if (req == NULL) { | 1131 | if (req == NULL) { |
1119 | dprintk("RPC: %s: request buffer %d alloc" | 1132 | dprintk("RPC: %s: request buffer %d alloc" |
1120 | " failed\n", __func__, i); | 1133 | " failed\n", __func__, i); |
@@ -1126,16 +1139,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, | |||
1126 | buf->rb_send_bufs[i]->rl_buffer = buf; | 1139 | buf->rb_send_bufs[i]->rl_buffer = buf; |
1127 | 1140 | ||
1128 | rc = rpcrdma_register_internal(ia, req->rl_base, | 1141 | rc = rpcrdma_register_internal(ia, req->rl_base, |
1129 | len - offsetof(struct rpcrdma_req, rl_base), | 1142 | wlen - offsetof(struct rpcrdma_req, rl_base), |
1130 | &buf->rb_send_bufs[i]->rl_handle, | 1143 | &buf->rb_send_bufs[i]->rl_handle, |
1131 | &buf->rb_send_bufs[i]->rl_iov); | 1144 | &buf->rb_send_bufs[i]->rl_iov); |
1132 | if (rc) | 1145 | if (rc) |
1133 | goto out; | 1146 | goto out; |
1134 | 1147 | ||
1135 | buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); | 1148 | buf->rb_send_bufs[i]->rl_size = wlen - |
1149 | sizeof(struct rpcrdma_req); | ||
1136 | 1150 | ||
1137 | len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); | 1151 | rep = kmalloc(rlen, GFP_KERNEL); |
1138 | rep = kmalloc(len, GFP_KERNEL); | ||
1139 | if (rep == NULL) { | 1152 | if (rep == NULL) { |
1140 | dprintk("RPC: %s: reply buffer %d alloc failed\n", | 1153 | dprintk("RPC: %s: reply buffer %d alloc failed\n", |
1141 | __func__, i); | 1154 | __func__, i); |
@@ -1145,10 +1158,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, | |||
1145 | memset(rep, 0, sizeof(struct rpcrdma_rep)); | 1158 | memset(rep, 0, sizeof(struct rpcrdma_rep)); |
1146 | buf->rb_recv_bufs[i] = rep; | 1159 | buf->rb_recv_bufs[i] = rep; |
1147 | buf->rb_recv_bufs[i]->rr_buffer = buf; | 1160 | buf->rb_recv_bufs[i]->rr_buffer = buf; |
1148 | init_waitqueue_head(&rep->rr_unbind); | ||
1149 | 1161 | ||
1150 | rc = rpcrdma_register_internal(ia, rep->rr_base, | 1162 | rc = rpcrdma_register_internal(ia, rep->rr_base, |
1151 | len - offsetof(struct rpcrdma_rep, rr_base), | 1163 | rlen - offsetof(struct rpcrdma_rep, rr_base), |
1152 | &buf->rb_recv_bufs[i]->rr_handle, | 1164 | &buf->rb_recv_bufs[i]->rr_handle, |
1153 | &buf->rb_recv_bufs[i]->rr_iov); | 1165 | &buf->rb_recv_bufs[i]->rr_iov); |
1154 | if (rc) | 1166 | if (rc) |
@@ -1179,7 +1191,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) | |||
1179 | 1191 | ||
1180 | /* clean up in reverse order from create | 1192 | /* clean up in reverse order from create |
1181 | * 1. recv mr memory (mr free, then kfree) | 1193 | * 1. recv mr memory (mr free, then kfree) |
1182 | * 1a. bind mw memory | ||
1183 | * 2. send mr memory (mr free, then kfree) | 1194 | * 2. send mr memory (mr free, then kfree) |
1184 | * 3. padding (if any) [moved to rpcrdma_ep_destroy] | 1195 | * 3. padding (if any) [moved to rpcrdma_ep_destroy] |
1185 | * 4. arrays | 1196 | * 4. arrays |
@@ -1194,41 +1205,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) | |||
1194 | kfree(buf->rb_recv_bufs[i]); | 1205 | kfree(buf->rb_recv_bufs[i]); |
1195 | } | 1206 | } |
1196 | if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { | 1207 | if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { |
1197 | while (!list_empty(&buf->rb_mws)) { | ||
1198 | r = list_entry(buf->rb_mws.next, | ||
1199 | struct rpcrdma_mw, mw_list); | ||
1200 | list_del(&r->mw_list); | ||
1201 | switch (ia->ri_memreg_strategy) { | ||
1202 | case RPCRDMA_FRMR: | ||
1203 | rc = ib_dereg_mr(r->r.frmr.fr_mr); | ||
1204 | if (rc) | ||
1205 | dprintk("RPC: %s:" | ||
1206 | " ib_dereg_mr" | ||
1207 | " failed %i\n", | ||
1208 | __func__, rc); | ||
1209 | ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); | ||
1210 | break; | ||
1211 | case RPCRDMA_MTHCAFMR: | ||
1212 | rc = ib_dealloc_fmr(r->r.fmr); | ||
1213 | if (rc) | ||
1214 | dprintk("RPC: %s:" | ||
1215 | " ib_dealloc_fmr" | ||
1216 | " failed %i\n", | ||
1217 | __func__, rc); | ||
1218 | break; | ||
1219 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
1220 | case RPCRDMA_MEMWINDOWS: | ||
1221 | rc = ib_dealloc_mw(r->r.mw); | ||
1222 | if (rc) | ||
1223 | dprintk("RPC: %s:" | ||
1224 | " ib_dealloc_mw" | ||
1225 | " failed %i\n", | ||
1226 | __func__, rc); | ||
1227 | break; | ||
1228 | default: | ||
1229 | break; | ||
1230 | } | ||
1231 | } | ||
1232 | rpcrdma_deregister_internal(ia, | 1208 | rpcrdma_deregister_internal(ia, |
1233 | buf->rb_send_bufs[i]->rl_handle, | 1209 | buf->rb_send_bufs[i]->rl_handle, |
1234 | &buf->rb_send_bufs[i]->rl_iov); | 1210 | &buf->rb_send_bufs[i]->rl_iov); |
@@ -1236,6 +1212,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) | |||
1236 | } | 1212 | } |
1237 | } | 1213 | } |
1238 | 1214 | ||
1215 | while (!list_empty(&buf->rb_mws)) { | ||
1216 | r = list_entry(buf->rb_mws.next, | ||
1217 | struct rpcrdma_mw, mw_list); | ||
1218 | list_del(&r->mw_list); | ||
1219 | switch (ia->ri_memreg_strategy) { | ||
1220 | case RPCRDMA_FRMR: | ||
1221 | rc = ib_dereg_mr(r->r.frmr.fr_mr); | ||
1222 | if (rc) | ||
1223 | dprintk("RPC: %s:" | ||
1224 | " ib_dereg_mr" | ||
1225 | " failed %i\n", | ||
1226 | __func__, rc); | ||
1227 | ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); | ||
1228 | break; | ||
1229 | case RPCRDMA_MTHCAFMR: | ||
1230 | rc = ib_dealloc_fmr(r->r.fmr); | ||
1231 | if (rc) | ||
1232 | dprintk("RPC: %s:" | ||
1233 | " ib_dealloc_fmr" | ||
1234 | " failed %i\n", | ||
1235 | __func__, rc); | ||
1236 | break; | ||
1237 | default: | ||
1238 | break; | ||
1239 | } | ||
1240 | } | ||
1241 | |||
1239 | kfree(buf->rb_pool); | 1242 | kfree(buf->rb_pool); |
1240 | } | 1243 | } |
1241 | 1244 | ||
@@ -1299,21 +1302,17 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) | |||
1299 | int i; | 1302 | int i; |
1300 | unsigned long flags; | 1303 | unsigned long flags; |
1301 | 1304 | ||
1302 | BUG_ON(req->rl_nchunks != 0); | ||
1303 | spin_lock_irqsave(&buffers->rb_lock, flags); | 1305 | spin_lock_irqsave(&buffers->rb_lock, flags); |
1304 | buffers->rb_send_bufs[--buffers->rb_send_index] = req; | 1306 | buffers->rb_send_bufs[--buffers->rb_send_index] = req; |
1305 | req->rl_niovs = 0; | 1307 | req->rl_niovs = 0; |
1306 | if (req->rl_reply) { | 1308 | if (req->rl_reply) { |
1307 | buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; | 1309 | buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; |
1308 | init_waitqueue_head(&req->rl_reply->rr_unbind); | ||
1309 | req->rl_reply->rr_func = NULL; | 1310 | req->rl_reply->rr_func = NULL; |
1310 | req->rl_reply = NULL; | 1311 | req->rl_reply = NULL; |
1311 | } | 1312 | } |
1312 | switch (ia->ri_memreg_strategy) { | 1313 | switch (ia->ri_memreg_strategy) { |
1313 | case RPCRDMA_FRMR: | 1314 | case RPCRDMA_FRMR: |
1314 | case RPCRDMA_MTHCAFMR: | 1315 | case RPCRDMA_MTHCAFMR: |
1315 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
1316 | case RPCRDMA_MEMWINDOWS: | ||
1317 | /* | 1316 | /* |
1318 | * Cycle mw's back in reverse order, and "spin" them. | 1317 | * Cycle mw's back in reverse order, and "spin" them. |
1319 | * This delays and scrambles reuse as much as possible. | 1318 | * This delays and scrambles reuse as much as possible. |
@@ -1358,8 +1357,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) | |||
1358 | 1357 | ||
1359 | /* | 1358 | /* |
1360 | * Put reply buffers back into pool when not attached to | 1359 | * Put reply buffers back into pool when not attached to |
1361 | * request. This happens in error conditions, and when | 1360 | * request. This happens in error conditions. |
1362 | * aborting unbinds. Pre-decrement counter/array index. | ||
1363 | */ | 1361 | */ |
1364 | void | 1362 | void |
1365 | rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) | 1363 | rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) |
@@ -1498,8 +1496,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, | |||
1498 | seg1->mr_offset -= pageoff; /* start of page */ | 1496 | seg1->mr_offset -= pageoff; /* start of page */ |
1499 | seg1->mr_len += pageoff; | 1497 | seg1->mr_len += pageoff; |
1500 | len = -pageoff; | 1498 | len = -pageoff; |
1501 | if (*nsegs > RPCRDMA_MAX_DATA_SEGS) | 1499 | if (*nsegs > ia->ri_max_frmr_depth) |
1502 | *nsegs = RPCRDMA_MAX_DATA_SEGS; | 1500 | *nsegs = ia->ri_max_frmr_depth; |
1503 | for (page_no = i = 0; i < *nsegs;) { | 1501 | for (page_no = i = 0; i < *nsegs;) { |
1504 | rpcrdma_map_one(ia, seg, writing); | 1502 | rpcrdma_map_one(ia, seg, writing); |
1505 | pa = seg->mr_dma; | 1503 | pa = seg->mr_dma; |
@@ -1536,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, | |||
1536 | } else | 1534 | } else |
1537 | post_wr = &frmr_wr; | 1535 | post_wr = &frmr_wr; |
1538 | 1536 | ||
1539 | /* Bump the key */ | ||
1540 | key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); | ||
1541 | ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); | ||
1542 | |||
1543 | /* Prepare FRMR WR */ | 1537 | /* Prepare FRMR WR */ |
1544 | memset(&frmr_wr, 0, sizeof frmr_wr); | 1538 | memset(&frmr_wr, 0, sizeof frmr_wr); |
1545 | frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; | 1539 | frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; |
@@ -1550,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, | |||
1550 | frmr_wr.wr.fast_reg.page_list_len = page_no; | 1544 | frmr_wr.wr.fast_reg.page_list_len = page_no; |
1551 | frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; | 1545 | frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; |
1552 | frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; | 1546 | frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; |
1553 | BUG_ON(frmr_wr.wr.fast_reg.length < len); | 1547 | if (frmr_wr.wr.fast_reg.length < len) { |
1548 | while (seg1->mr_nsegs--) | ||
1549 | rpcrdma_unmap_one(ia, seg++); | ||
1550 | return -EIO; | ||
1551 | } | ||
1552 | |||
1553 | /* Bump the key */ | ||
1554 | key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); | ||
1555 | ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); | ||
1556 | |||
1554 | frmr_wr.wr.fast_reg.access_flags = (writing ? | 1557 | frmr_wr.wr.fast_reg.access_flags = (writing ? |
1555 | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : | 1558 | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : |
1556 | IB_ACCESS_REMOTE_READ); | 1559 | IB_ACCESS_REMOTE_READ); |
@@ -1661,135 +1664,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, | |||
1661 | return rc; | 1664 | return rc; |
1662 | } | 1665 | } |
1663 | 1666 | ||
1664 | static int | ||
1665 | rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, | ||
1666 | int *nsegs, int writing, struct rpcrdma_ia *ia, | ||
1667 | struct rpcrdma_xprt *r_xprt) | ||
1668 | { | ||
1669 | int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : | ||
1670 | IB_ACCESS_REMOTE_READ); | ||
1671 | struct ib_mw_bind param; | ||
1672 | int rc; | ||
1673 | |||
1674 | *nsegs = 1; | ||
1675 | rpcrdma_map_one(ia, seg, writing); | ||
1676 | param.bind_info.mr = ia->ri_bind_mem; | ||
1677 | param.wr_id = 0ULL; /* no send cookie */ | ||
1678 | param.bind_info.addr = seg->mr_dma; | ||
1679 | param.bind_info.length = seg->mr_len; | ||
1680 | param.send_flags = 0; | ||
1681 | param.bind_info.mw_access_flags = mem_priv; | ||
1682 | |||
1683 | DECR_CQCOUNT(&r_xprt->rx_ep); | ||
1684 | rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param); | ||
1685 | if (rc) { | ||
1686 | dprintk("RPC: %s: failed ib_bind_mw " | ||
1687 | "%u@0x%llx status %i\n", | ||
1688 | __func__, seg->mr_len, | ||
1689 | (unsigned long long)seg->mr_dma, rc); | ||
1690 | rpcrdma_unmap_one(ia, seg); | ||
1691 | } else { | ||
1692 | seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; | ||
1693 | seg->mr_base = param.bind_info.addr; | ||
1694 | seg->mr_nsegs = 1; | ||
1695 | } | ||
1696 | return rc; | ||
1697 | } | ||
1698 | |||
1699 | static int | ||
1700 | rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, | ||
1701 | struct rpcrdma_ia *ia, | ||
1702 | struct rpcrdma_xprt *r_xprt, void **r) | ||
1703 | { | ||
1704 | struct ib_mw_bind param; | ||
1705 | LIST_HEAD(l); | ||
1706 | int rc; | ||
1707 | |||
1708 | BUG_ON(seg->mr_nsegs != 1); | ||
1709 | param.bind_info.mr = ia->ri_bind_mem; | ||
1710 | param.bind_info.addr = 0ULL; /* unbind */ | ||
1711 | param.bind_info.length = 0; | ||
1712 | param.bind_info.mw_access_flags = 0; | ||
1713 | if (*r) { | ||
1714 | param.wr_id = (u64) (unsigned long) *r; | ||
1715 | param.send_flags = IB_SEND_SIGNALED; | ||
1716 | INIT_CQCOUNT(&r_xprt->rx_ep); | ||
1717 | } else { | ||
1718 | param.wr_id = 0ULL; | ||
1719 | param.send_flags = 0; | ||
1720 | DECR_CQCOUNT(&r_xprt->rx_ep); | ||
1721 | } | ||
1722 | rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param); | ||
1723 | rpcrdma_unmap_one(ia, seg); | ||
1724 | if (rc) | ||
1725 | dprintk("RPC: %s: failed ib_(un)bind_mw," | ||
1726 | " status %i\n", __func__, rc); | ||
1727 | else | ||
1728 | *r = NULL; /* will upcall on completion */ | ||
1729 | return rc; | ||
1730 | } | ||
1731 | |||
1732 | static int | ||
1733 | rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, | ||
1734 | int *nsegs, int writing, struct rpcrdma_ia *ia) | ||
1735 | { | ||
1736 | int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : | ||
1737 | IB_ACCESS_REMOTE_READ); | ||
1738 | struct rpcrdma_mr_seg *seg1 = seg; | ||
1739 | struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; | ||
1740 | int len, i, rc = 0; | ||
1741 | |||
1742 | if (*nsegs > RPCRDMA_MAX_DATA_SEGS) | ||
1743 | *nsegs = RPCRDMA_MAX_DATA_SEGS; | ||
1744 | for (len = 0, i = 0; i < *nsegs;) { | ||
1745 | rpcrdma_map_one(ia, seg, writing); | ||
1746 | ipb[i].addr = seg->mr_dma; | ||
1747 | ipb[i].size = seg->mr_len; | ||
1748 | len += seg->mr_len; | ||
1749 | ++seg; | ||
1750 | ++i; | ||
1751 | /* Check for holes */ | ||
1752 | if ((i < *nsegs && offset_in_page(seg->mr_offset)) || | ||
1753 | offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) | ||
1754 | break; | ||
1755 | } | ||
1756 | seg1->mr_base = seg1->mr_dma; | ||
1757 | seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, | ||
1758 | ipb, i, mem_priv, &seg1->mr_base); | ||
1759 | if (IS_ERR(seg1->mr_chunk.rl_mr)) { | ||
1760 | rc = PTR_ERR(seg1->mr_chunk.rl_mr); | ||
1761 | dprintk("RPC: %s: failed ib_reg_phys_mr " | ||
1762 | "%u@0x%llx (%d)... status %i\n", | ||
1763 | __func__, len, | ||
1764 | (unsigned long long)seg1->mr_dma, i, rc); | ||
1765 | while (i--) | ||
1766 | rpcrdma_unmap_one(ia, --seg); | ||
1767 | } else { | ||
1768 | seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; | ||
1769 | seg1->mr_nsegs = i; | ||
1770 | seg1->mr_len = len; | ||
1771 | } | ||
1772 | *nsegs = i; | ||
1773 | return rc; | ||
1774 | } | ||
1775 | |||
1776 | static int | ||
1777 | rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, | ||
1778 | struct rpcrdma_ia *ia) | ||
1779 | { | ||
1780 | struct rpcrdma_mr_seg *seg1 = seg; | ||
1781 | int rc; | ||
1782 | |||
1783 | rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); | ||
1784 | seg1->mr_chunk.rl_mr = NULL; | ||
1785 | while (seg1->mr_nsegs--) | ||
1786 | rpcrdma_unmap_one(ia, seg++); | ||
1787 | if (rc) | ||
1788 | dprintk("RPC: %s: failed ib_dereg_mr," | ||
1789 | " status %i\n", __func__, rc); | ||
1790 | return rc; | ||
1791 | } | ||
1792 | |||
1793 | int | 1667 | int |
1794 | rpcrdma_register_external(struct rpcrdma_mr_seg *seg, | 1668 | rpcrdma_register_external(struct rpcrdma_mr_seg *seg, |
1795 | int nsegs, int writing, struct rpcrdma_xprt *r_xprt) | 1669 | int nsegs, int writing, struct rpcrdma_xprt *r_xprt) |
@@ -1819,16 +1693,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, | |||
1819 | rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); | 1693 | rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); |
1820 | break; | 1694 | break; |
1821 | 1695 | ||
1822 | /* Registration using memory windows */ | ||
1823 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
1824 | case RPCRDMA_MEMWINDOWS: | ||
1825 | rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); | ||
1826 | break; | ||
1827 | |||
1828 | /* Default registration each time */ | ||
1829 | default: | 1696 | default: |
1830 | rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); | 1697 | return -1; |
1831 | break; | ||
1832 | } | 1698 | } |
1833 | if (rc) | 1699 | if (rc) |
1834 | return -1; | 1700 | return -1; |
@@ -1838,7 +1704,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, | |||
1838 | 1704 | ||
1839 | int | 1705 | int |
1840 | rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, | 1706 | rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, |
1841 | struct rpcrdma_xprt *r_xprt, void *r) | 1707 | struct rpcrdma_xprt *r_xprt) |
1842 | { | 1708 | { |
1843 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | 1709 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
1844 | int nsegs = seg->mr_nsegs, rc; | 1710 | int nsegs = seg->mr_nsegs, rc; |
@@ -1847,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, | |||
1847 | 1713 | ||
1848 | #if RPCRDMA_PERSISTENT_REGISTRATION | 1714 | #if RPCRDMA_PERSISTENT_REGISTRATION |
1849 | case RPCRDMA_ALLPHYSICAL: | 1715 | case RPCRDMA_ALLPHYSICAL: |
1850 | BUG_ON(nsegs != 1); | ||
1851 | rpcrdma_unmap_one(ia, seg); | 1716 | rpcrdma_unmap_one(ia, seg); |
1852 | rc = 0; | ||
1853 | break; | 1717 | break; |
1854 | #endif | 1718 | #endif |
1855 | 1719 | ||
@@ -1861,21 +1725,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, | |||
1861 | rc = rpcrdma_deregister_fmr_external(seg, ia); | 1725 | rc = rpcrdma_deregister_fmr_external(seg, ia); |
1862 | break; | 1726 | break; |
1863 | 1727 | ||
1864 | case RPCRDMA_MEMWINDOWS_ASYNC: | ||
1865 | case RPCRDMA_MEMWINDOWS: | ||
1866 | rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); | ||
1867 | break; | ||
1868 | |||
1869 | default: | 1728 | default: |
1870 | rc = rpcrdma_deregister_default_external(seg, ia); | ||
1871 | break; | 1729 | break; |
1872 | } | 1730 | } |
1873 | if (r) { | ||
1874 | struct rpcrdma_rep *rep = r; | ||
1875 | void (*func)(struct rpcrdma_rep *) = rep->rr_func; | ||
1876 | rep->rr_func = NULL; | ||
1877 | func(rep); /* dereg done, callback now */ | ||
1878 | } | ||
1879 | return nsegs; | 1731 | return nsegs; |
1880 | } | 1732 | } |
1881 | 1733 | ||
@@ -1950,7 +1802,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, | |||
1950 | ib_dma_sync_single_for_cpu(ia->ri_id->device, | 1802 | ib_dma_sync_single_for_cpu(ia->ri_id->device, |
1951 | rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); | 1803 | rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); |
1952 | 1804 | ||
1953 | DECR_CQCOUNT(ep); | ||
1954 | rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); | 1805 | rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); |
1955 | 1806 | ||
1956 | if (rc) | 1807 | if (rc) |
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cc1445dc1d1a..89e7cd479705 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/wait.h> /* wait_queue_head_t, etc */ | 43 | #include <linux/wait.h> /* wait_queue_head_t, etc */ |
44 | #include <linux/spinlock.h> /* spinlock_t, etc */ | 44 | #include <linux/spinlock.h> /* spinlock_t, etc */ |
45 | #include <linux/atomic.h> /* atomic_t, etc */ | 45 | #include <linux/atomic.h> /* atomic_t, etc */ |
46 | #include <linux/workqueue.h> /* struct work_struct */ | ||
46 | 47 | ||
47 | #include <rdma/rdma_cm.h> /* RDMA connection api */ | 48 | #include <rdma/rdma_cm.h> /* RDMA connection api */ |
48 | #include <rdma/ib_verbs.h> /* RDMA verbs api */ | 49 | #include <rdma/ib_verbs.h> /* RDMA verbs api */ |
@@ -66,18 +67,21 @@ struct rpcrdma_ia { | |||
66 | struct completion ri_done; | 67 | struct completion ri_done; |
67 | int ri_async_rc; | 68 | int ri_async_rc; |
68 | enum rpcrdma_memreg ri_memreg_strategy; | 69 | enum rpcrdma_memreg ri_memreg_strategy; |
70 | unsigned int ri_max_frmr_depth; | ||
69 | }; | 71 | }; |
70 | 72 | ||
71 | /* | 73 | /* |
72 | * RDMA Endpoint -- one per transport instance | 74 | * RDMA Endpoint -- one per transport instance |
73 | */ | 75 | */ |
74 | 76 | ||
77 | #define RPCRDMA_WC_BUDGET (128) | ||
78 | #define RPCRDMA_POLLSIZE (16) | ||
79 | |||
75 | struct rpcrdma_ep { | 80 | struct rpcrdma_ep { |
76 | atomic_t rep_cqcount; | 81 | atomic_t rep_cqcount; |
77 | int rep_cqinit; | 82 | int rep_cqinit; |
78 | int rep_connected; | 83 | int rep_connected; |
79 | struct rpcrdma_ia *rep_ia; | 84 | struct rpcrdma_ia *rep_ia; |
80 | struct ib_cq *rep_cq; | ||
81 | struct ib_qp_init_attr rep_attr; | 85 | struct ib_qp_init_attr rep_attr; |
82 | wait_queue_head_t rep_connect_wait; | 86 | wait_queue_head_t rep_connect_wait; |
83 | struct ib_sge rep_pad; /* holds zeroed pad */ | 87 | struct ib_sge rep_pad; /* holds zeroed pad */ |
@@ -86,6 +90,9 @@ struct rpcrdma_ep { | |||
86 | struct rpc_xprt *rep_xprt; /* for rep_func */ | 90 | struct rpc_xprt *rep_xprt; /* for rep_func */ |
87 | struct rdma_conn_param rep_remote_cma; | 91 | struct rdma_conn_param rep_remote_cma; |
88 | struct sockaddr_storage rep_remote_addr; | 92 | struct sockaddr_storage rep_remote_addr; |
93 | struct delayed_work rep_connect_worker; | ||
94 | struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; | ||
95 | struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; | ||
89 | }; | 96 | }; |
90 | 97 | ||
91 | #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) | 98 | #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) |
@@ -124,7 +131,6 @@ struct rpcrdma_rep { | |||
124 | struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ | 131 | struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ |
125 | void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ | 132 | void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ |
126 | struct list_head rr_list; /* tasklet list */ | 133 | struct list_head rr_list; /* tasklet list */ |
127 | wait_queue_head_t rr_unbind; /* optional unbind wait */ | ||
128 | struct ib_sge rr_iov; /* for posting */ | 134 | struct ib_sge rr_iov; /* for posting */ |
129 | struct ib_mr *rr_handle; /* handle for mem in rr_iov */ | 135 | struct ib_mr *rr_handle; /* handle for mem in rr_iov */ |
130 | char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ | 136 | char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ |
@@ -159,7 +165,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ | |||
159 | struct ib_mr *rl_mr; /* if registered directly */ | 165 | struct ib_mr *rl_mr; /* if registered directly */ |
160 | struct rpcrdma_mw { /* if registered from region */ | 166 | struct rpcrdma_mw { /* if registered from region */ |
161 | union { | 167 | union { |
162 | struct ib_mw *mw; | ||
163 | struct ib_fmr *fmr; | 168 | struct ib_fmr *fmr; |
164 | struct { | 169 | struct { |
165 | struct ib_fast_reg_page_list *fr_pgl; | 170 | struct ib_fast_reg_page_list *fr_pgl; |
@@ -207,7 +212,6 @@ struct rpcrdma_req { | |||
207 | struct rpcrdma_buffer { | 212 | struct rpcrdma_buffer { |
208 | spinlock_t rb_lock; /* protects indexes */ | 213 | spinlock_t rb_lock; /* protects indexes */ |
209 | atomic_t rb_credits; /* most recent server credits */ | 214 | atomic_t rb_credits; /* most recent server credits */ |
210 | unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ | ||
211 | int rb_max_requests;/* client max requests */ | 215 | int rb_max_requests;/* client max requests */ |
212 | struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ | 216 | struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ |
213 | int rb_send_index; | 217 | int rb_send_index; |
@@ -300,7 +304,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); | |||
300 | */ | 304 | */ |
301 | int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, | 305 | int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, |
302 | struct rpcrdma_create_data_internal *); | 306 | struct rpcrdma_create_data_internal *); |
303 | int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); | 307 | void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); |
304 | int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); | 308 | int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); |
305 | int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); | 309 | int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); |
306 | 310 | ||
@@ -330,11 +334,12 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *, | |||
330 | int rpcrdma_register_external(struct rpcrdma_mr_seg *, | 334 | int rpcrdma_register_external(struct rpcrdma_mr_seg *, |
331 | int, int, struct rpcrdma_xprt *); | 335 | int, int, struct rpcrdma_xprt *); |
332 | int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, | 336 | int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, |
333 | struct rpcrdma_xprt *, void *); | 337 | struct rpcrdma_xprt *); |
334 | 338 | ||
335 | /* | 339 | /* |
336 | * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c | 340 | * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c |
337 | */ | 341 | */ |
342 | void rpcrdma_connect_worker(struct work_struct *); | ||
338 | void rpcrdma_conn_func(struct rpcrdma_ep *); | 343 | void rpcrdma_conn_func(struct rpcrdma_ep *); |
339 | void rpcrdma_reply_handler(struct rpcrdma_rep *); | 344 | void rpcrdma_reply_handler(struct rpcrdma_rep *); |
340 | 345 | ||