author		Chuck Lever <chuck.lever@oracle.com>		2015-08-03 13:03:39 -0400
committer	Anna Schumaker <Anna.Schumaker@Netapp.com>	2015-08-05 16:21:27 -0400
commit		b3221d6a53c44cd572a3a400abdd1e2a24bea587
tree		52a48a8a88cd0b9c1a5b92f5797421f1aa5e642c
parent		d1ed857e5707e073973cfb1b8df801053a356518
xprtrdma: Remove logic that constructs RDMA_MSGP type calls
RDMA_MSGP type calls insert a zero pad in the middle of the RPC
message to align the RPC request's data payload to the server's
alignment preferences. A server can then "page flip" the payload
into place to avoid a data copy in certain circumstances. However:
 1. The client has to have a priori knowledge of the server's
    preferred alignment

 2. Requests eligible for RDMA_MSGP are requests that are small
    enough to have been sent inline, and convey a data payload
    at the _end_ of the RPC message
Today, criterion 1 is handled with a sysctl, a global setting that
is copied during mount. Linux does not support the Connection
Configuration Protocol (CCP, RFC 5666, Section 6) for querying the
server's preferences.
A small-ish NFSv3 WRITE might use RDMA_MSGP, but no NFSv4
compound fits criterion 2.
Thus the Linux client currently leaves RDMA_MSGP disabled. The
Linux server handles RDMA_MSGP, but does not use any special
page flipping, so it confers no benefit.
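For reference, the part of the transport header that differs between
the two call types looks roughly like the sketch below. This is an
abbreviated, illustrative declaration only: the field names (rm_type,
rm_nochunks.rm_empty, rm_padded.rm_align/rm_thresh/rm_pempty) are
taken from the code removed by this patch, while the full rpcrdma_msg
layout in xprt_rdma.h is elided and the struct name is invented for
the sketch.

	/* Illustrative sketch, not the real definition (see xprt_rdma.h). */
	struct rpcrdma_hdr_sketch {
		__be32 rm_xid;
		__be32 rm_vers;
		__be32 rm_credit;
		__be32 rm_type;			/* rdma_msg or rdma_msgp */
		union {
			struct {
				__be32 rm_empty[3];	/* empty chunk lists */
			} rm_nochunks;			/* RDMA_MSG */
			struct {
				__be32 rm_align;	/* server's preferred alignment */
				__be32 rm_thresh;	/* padding threshold */
				__be32 rm_pempty[3];	/* empty chunk lists */
			} rm_padded;			/* RDMA_MSGP */
		} rm_body;
	};

The two extra words (rm_align, rm_thresh) are why the removed code
added "2 * sizeof(u32)" to hdrlen when building an RDMA_MSGP call.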
Clean up the marshaling code by removing the logic that constructs
RDMA_MSGP type calls. This also reduces the maximum send iovec size
from four to just two elements.
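After this change, an inline call posts exactly two SGEs: the
RPC-over-RDMA header and the pulled-up RPC message. A minimal sketch
of what the marshaling code now sets up, using names from the patch
below (the header SGE lines are diff context not shown here, so their
exact form is an assumption; chunk-list marshaling and error handling
are elided):

	/* Sketch only: the two remaining send SGEs. */
	req->rl_send_iov[0].addr   = rdmab_addr(req->rl_rdmabuf);	/* transport header */
	req->rl_send_iov[0].length = hdrlen;
	req->rl_send_iov[0].lkey   = rdmab_lkey(req->rl_rdmabuf);

	req->rl_send_iov[1].addr   = rdmab_addr(req->rl_sendbuf);	/* pulled-up RPC message */
	req->rl_send_iov[1].length = rpclen;
	req->rl_send_iov[1].lkey   = rdmab_lkey(req->rl_sendbuf);

	req->rl_niovs = 2;	/* RPCRDMA_MAX_IOVS */

rpcrdma_ep_post() then DMA-syncs each of these SGEs in a short loop
instead of open-coding up to four syncs.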
/proc/sys/sunrpc/rdma_inline_write_padding is a kernel API, and
thus is left in place.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
-rw-r--r--	net/sunrpc/xprtrdma/rpc_rdma.c	|  92
-rw-r--r--	net/sunrpc/xprtrdma/verbs.c	|  47
-rw-r--r--	net/sunrpc/xprtrdma/xprt_rdma.h	|  19
3 files changed, 51 insertions(+), 107 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 84ea37daef36..8e9c56429ada 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -297,8 +297,7 @@ out:
  * pre-registered memory buffer for this request. For small amounts
  * of data, this is efficient. The cutoff value is tunable.
  */
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
 {
 	int i, npages, curlen;
 	int copy_len;
@@ -310,16 +309,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 	destp = rqst->rq_svec[0].iov_base;
 	curlen = rqst->rq_svec[0].iov_len;
 	destp += curlen;
-	/*
-	 * Do optional padding where it makes sense. Alignment of write
-	 * payload can help the server, if our setting is accurate.
-	 */
-	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
-	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
-		pad = 0;	/* don't pad this request */
 
-	dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
-		__func__, pad, destp, rqst->rq_slen, curlen);
+	dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
+		__func__, destp, rqst->rq_slen, curlen);
 
 	copy_len = rqst->rq_snd_buf.page_len;
 
@@ -355,7 +347,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 		page_base = 0;
 	}
 	/* header now contains entire send message */
-	return pad;
 }
 
 /*
@@ -380,7 +371,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	char *base;
-	size_t rpclen, padlen;
+	size_t rpclen;
 	ssize_t hdrlen;
 	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
@@ -458,7 +449,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	}
 
 	hdrlen = RPCRDMA_HDRLEN_MIN;
-	padlen = 0;
 
 	/*
 	 * Pull up any extra send data into the preregistered buffer.
@@ -467,43 +457,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 */
 	if (rtype == rpcrdma_noch) {
 
-		padlen = rpcrdma_inline_pullup(rqst,
-					       RPCRDMA_INLINE_PAD_VALUE(rqst));
-
-		if (padlen) {
-			headerp->rm_type = rdma_msgp;
-			headerp->rm_body.rm_padded.rm_align =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
-			headerp->rm_body.rm_padded.rm_thresh =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
-			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
-			hdrlen += 2 * sizeof(u32);	/* extra words in padhdr */
-			if (wtype != rpcrdma_noch) {
-				dprintk("RPC: %s: invalid chunk list\n",
-					__func__);
-				return -EIO;
-			}
-		} else {
-			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-			/* new length after pullup */
-			rpclen = rqst->rq_svec[0].iov_len;
-			/*
-			 * Currently we try to not actually use read inline.
-			 * Reply chunks have the desirable property that
-			 * they land, packed, directly in the target buffers
-			 * without headers, so they require no fixup. The
-			 * additional RDMA Write op sends the same amount
-			 * of data, streams on-the-wire and adds no overhead
-			 * on receive. Therefore, we request a reply chunk
-			 * for non-writes wherever feasible and efficient.
-			 */
-			if (wtype == rpcrdma_noch)
-				wtype = rpcrdma_replych;
-		}
+		rpcrdma_inline_pullup(rqst);
+
+		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+		/* new length after pullup */
+		rpclen = rqst->rq_svec[0].iov_len;
+		/* Currently we try to not actually use read inline.
+		 * Reply chunks have the desirable property that
+		 * they land, packed, directly in the target buffers
+		 * without headers, so they require no fixup. The
+		 * additional RDMA Write op sends the same amount
+		 * of data, streams on-the-wire and adds no overhead
+		 * on receive. Therefore, we request a reply chunk
+		 * for non-writes wherever feasible and efficient.
+		 */
+		if (wtype == rpcrdma_noch)
+			wtype = rpcrdma_replych;
 	}
 
 	if (rtype != rpcrdma_noch) {
@@ -518,9 +489,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	if (hdrlen < 0)
 		return hdrlen;
 
-	dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+	dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		__func__, transfertypes[wtype], hdrlen, rpclen,
 		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
 	/*
@@ -539,21 +510,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
 
 	req->rl_niovs = 2;
-
-	if (padlen) {
-		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
-		req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
-		req->rl_send_iov[2].length = padlen;
-		req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
-		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
-		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
-		req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
-		req->rl_niovs = 4;
-	}
-
 	return 0;
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8516d9894599..b4d4f6300fbc 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -605,6 +605,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	struct ib_cq_init_attr cq_attr = {};
 	int rc, err;
 
+	if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+		dprintk("RPC: %s: insufficient sge's available\n",
+			__func__);
+		return -ENOMEM;
+	}
+
 	/* check provider's send/recv wr limits */
 	if (cdata->max_requests > devattr->max_qp_wr)
 		cdata->max_requests = devattr->max_qp_wr;
@@ -617,23 +623,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	if (rc)
 		return rc;
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
-	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	ep->rep_attr.qp_type = IB_QPT_RC;
 	ep->rep_attr.port_num = ~0;
 
-	if (cdata->padding) {
-		ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
-						      GFP_KERNEL);
-		if (IS_ERR(ep->rep_padbuf)) {
-			rc = PTR_ERR(ep->rep_padbuf);
-			goto out0;
-		}
-	} else
-		ep->rep_padbuf = NULL;
-
 	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
 		"iovs: send %d recv %d\n",
 		__func__,
@@ -716,8 +712,6 @@ out2:
 		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
 			__func__, err);
 out1:
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-out0:
 	if (ia->ri_dma_mr)
 		ib_dereg_mr(ia->ri_dma_mr);
 	return rc;
@@ -746,8 +740,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		ia->ri_id->qp = NULL;
 	}
 
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
 	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
 	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
 	if (rc)
@@ -1279,9 +1271,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 		struct rpcrdma_ep *ep,
 		struct rpcrdma_req *req)
 {
+	struct ib_device *device = ia->ri_device;
 	struct ib_send_wr send_wr, *send_wr_fail;
 	struct rpcrdma_rep *rep = req->rl_reply;
-	int rc;
+	struct ib_sge *iov = req->rl_send_iov;
+	int i, rc;
 
 	if (rep) {
 		rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1292,22 +1286,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 
 	send_wr.next = NULL;
 	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
-	send_wr.sg_list = req->rl_send_iov;
+	send_wr.sg_list = iov;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
-	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
-		ib_dma_sync_single_for_device(ia->ri_device,
-					      req->rl_send_iov[3].addr,
-					      req->rl_send_iov[3].length,
-					      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[1].addr,
-				      req->rl_send_iov[1].length,
-				      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[0].addr,
-				      req->rl_send_iov[0].length,
-				      DMA_TO_DEVICE);
+
+	for (i = 0; i < send_wr.num_sge; i++)
+		ib_dma_sync_single_for_device(device, iov[i].addr,
+					      iov[i].length, DMA_TO_DEVICE);
+	dprintk("RPC: %s: posting %d s/g entries\n",
+		__func__, send_wr.num_sge);
 
 	if (DECR_CQCOUNT(ep) > 0)
 		send_wr.send_flags = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 82190118b8d9..8422c09043b0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -88,7 +88,6 @@ struct rpcrdma_ep {
 	int			rep_connected;
 	struct ib_qp_init_attr	rep_attr;
 	wait_queue_head_t	rep_connect_wait;
-	struct rpcrdma_regbuf	*rep_padbuf;
 	struct rdma_conn_param	rep_remote_cma;
 	struct sockaddr_storage	rep_remote_addr;
 	struct delayed_work	rep_connect_worker;
@@ -255,16 +254,18 @@ struct rpcrdma_mr_seg {	/* chunk descriptors */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 
+#define RPCRDMA_MAX_IOVS	(2)
+
 struct rpcrdma_req {
-	unsigned int	rl_niovs;	/* 0, 2 or 4 */
-	unsigned int	rl_nchunks;	/* non-zero if chunks */
-	unsigned int	rl_connect_cookie;	/* retry detection */
-	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	unsigned int		rl_niovs;
+	unsigned int		rl_nchunks;
+	unsigned int		rl_connect_cookie;
+	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
-	struct ib_sge	rl_send_iov[4];	/* for active requests */
+	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
 	struct rpcrdma_regbuf	*rl_rdmabuf;
 	struct rpcrdma_regbuf	*rl_sendbuf;
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 
 static inline struct rpcrdma_req *