author		Chuck Lever <chuck.lever@oracle.com>	2015-08-03 13:03:39 -0400
committer	Anna Schumaker <Anna.Schumaker@Netapp.com>	2015-08-05 16:21:27 -0400
commit		b3221d6a53c44cd572a3a400abdd1e2a24bea587
tree		52a48a8a88cd0b9c1a5b92f5797421f1aa5e642c
parent		d1ed857e5707e073973cfb1b8df801053a356518
xprtrdma: Remove logic that constructs RDMA_MSGP type calls
RDMA_MSGP type calls insert a zero pad in the middle of the RPC message
to align the RPC request's data payload to the server's alignment
preferences. A server can then "page flip" the payload into place to
avoid a data copy in certain circumstances. However:

1. The client has to have a priori knowledge of the server's
   preferred alignment

2. Requests eligible for RDMA_MSGP are requests that are small enough
   to have been sent inline, and convey a data payload at the _end_
   of the RPC message

Today 1. is done with a sysctl, and is a global setting that is copied
during mount. Linux does not support CCP to query the server's
preferences (RFC 5666, Section 6).

A small-ish NFSv3 WRITE might use RDMA_MSGP, but no NFSv4 compound
fits bullet 2.

Thus the Linux client currently leaves RDMA_MSGP disabled. The Linux
server handles RDMA_MSGP, but does not use any special page flipping,
so it confers no benefit.

Clean up the marshaling code by removing the logic that constructs
RDMA_MSGP type calls. This also reduces the maximum send iovec size
from four to just two elements.

/proc/sys/sunrpc/rdma_inline_write_padding is a kernel API, and thus
is left in place.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c    92
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c       47
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h   19
3 files changed, 51 insertions(+), 107 deletions(-)
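Background note (illustrative, not part of the patch): the difference between the two header types described in the commit message is what the rpc_rdma.c hunks below remove. An RDMA_MSG header body carries three empty chunk lists, while an RDMA_MSGP body prepends an alignment value and a threshold, which is why the removed marshaling code added 2 * sizeof(u32) to hdrlen. The standalone C sketch below models only that layout difference; the type and field names (wire_u32, msg_body, msgp_body) are invented for this sketch and are not the kernel's definitions.

	/*
	 * Illustrative only -- simplified model of the two RPC-over-RDMA
	 * header bodies referenced by this patch (see RFC 5666). Names
	 * here are made up; the real definitions live elsewhere in the
	 * kernel tree.
	 */
	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t wire_u32;	/* one XDR word */

	struct msg_body {		/* RDMA_MSG: three empty chunk lists */
		wire_u32 empty[3];
	};

	struct msgp_body {		/* RDMA_MSGP: alignment info, then chunk lists */
		wire_u32 align;		/* server's preferred payload alignment */
		wire_u32 thresh;	/* minimum payload size worth padding */
		wire_u32 empty[3];
	};

	int main(void)
	{
		/* The two extra words are what "hdrlen += 2 * sizeof(u32)" covered. */
		printf("RDMA_MSG body:  %zu bytes\n", sizeof(struct msg_body));
		printf("RDMA_MSGP body: %zu bytes\n", sizeof(struct msgp_body));
		return 0;
	}

Compiled and run, this prints 12 and 20 bytes; the 8-byte difference is the pair of alignment words the client stops emitting once RDMA_MSGP construction is removed.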
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 84ea37daef36..8e9c56429ada 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -297,8 +297,7 @@ out:
  * pre-registered memory buffer for this request. For small amounts
  * of data, this is efficient. The cutoff value is tunable.
  */
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
 {
 	int i, npages, curlen;
 	int copy_len;
@@ -310,16 +309,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 	destp = rqst->rq_svec[0].iov_base;
 	curlen = rqst->rq_svec[0].iov_len;
 	destp += curlen;
-	/*
-	 * Do optional padding where it makes sense. Alignment of write
-	 * payload can help the server, if our setting is accurate.
-	 */
-	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
-	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
-		pad = 0;	/* don't pad this request */
 
-	dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
-		__func__, pad, destp, rqst->rq_slen, curlen);
+	dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
+		__func__, destp, rqst->rq_slen, curlen);
 
 	copy_len = rqst->rq_snd_buf.page_len;
 
@@ -355,7 +347,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 		page_base = 0;
 	}
 	/* header now contains entire send message */
-	return pad;
 }
 
 /*
@@ -380,7 +371,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	char *base;
-	size_t rpclen, padlen;
+	size_t rpclen;
 	ssize_t hdrlen;
 	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
@@ -458,7 +449,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	}
 
 	hdrlen = RPCRDMA_HDRLEN_MIN;
-	padlen = 0;
 
 	/*
 	 * Pull up any extra send data into the preregistered buffer.
@@ -467,43 +457,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 */
 	if (rtype == rpcrdma_noch) {
 
-		padlen = rpcrdma_inline_pullup(rqst,
-						RPCRDMA_INLINE_PAD_VALUE(rqst));
-
-		if (padlen) {
-			headerp->rm_type = rdma_msgp;
-			headerp->rm_body.rm_padded.rm_align =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
-			headerp->rm_body.rm_padded.rm_thresh =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
-			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
-			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-			if (wtype != rpcrdma_noch) {
-				dprintk("RPC: %s: invalid chunk list\n",
-					__func__);
-				return -EIO;
-			}
-		} else {
-			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-			/* new length after pullup */
-			rpclen = rqst->rq_svec[0].iov_len;
-			/*
-			 * Currently we try to not actually use read inline.
-			 * Reply chunks have the desirable property that
-			 * they land, packed, directly in the target buffers
-			 * without headers, so they require no fixup. The
-			 * additional RDMA Write op sends the same amount
-			 * of data, streams on-the-wire and adds no overhead
-			 * on receive. Therefore, we request a reply chunk
-			 * for non-writes wherever feasible and efficient.
-			 */
-			if (wtype == rpcrdma_noch)
-				wtype = rpcrdma_replych;
-		}
+		rpcrdma_inline_pullup(rqst);
+
+		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+		/* new length after pullup */
+		rpclen = rqst->rq_svec[0].iov_len;
+		/* Currently we try to not actually use read inline.
+		 * Reply chunks have the desirable property that
+		 * they land, packed, directly in the target buffers
+		 * without headers, so they require no fixup. The
+		 * additional RDMA Write op sends the same amount
+		 * of data, streams on-the-wire and adds no overhead
+		 * on receive. Therefore, we request a reply chunk
+		 * for non-writes wherever feasible and efficient.
+		 */
+		if (wtype == rpcrdma_noch)
+			wtype = rpcrdma_replych;
 	}
 
 	if (rtype != rpcrdma_noch) {
@@ -518,9 +489,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	if (hdrlen < 0)
 		return hdrlen;
 
-	dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+	dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		__func__, transfertypes[wtype], hdrlen, rpclen,
 		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
 	/*
@@ -539,21 +510,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
 
 	req->rl_niovs = 2;
-
-	if (padlen) {
-		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
-		req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
-		req->rl_send_iov[2].length = padlen;
-		req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
-		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
-		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
-		req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
-		req->rl_niovs = 4;
-	}
-
 	return 0;
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8516d9894599..b4d4f6300fbc 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -605,6 +605,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	struct ib_cq_init_attr cq_attr = {};
 	int rc, err;
 
+	if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+		dprintk("RPC: %s: insufficient sge's available\n",
+			__func__);
+		return -ENOMEM;
+	}
+
 	/* check provider's send/recv wr limits */
 	if (cdata->max_requests > devattr->max_qp_wr)
 		cdata->max_requests = devattr->max_qp_wr;
@@ -617,23 +623,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	if (rc)
 		return rc;
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
-	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	ep->rep_attr.qp_type = IB_QPT_RC;
 	ep->rep_attr.port_num = ~0;
 
-	if (cdata->padding) {
-		ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
-						      GFP_KERNEL);
-		if (IS_ERR(ep->rep_padbuf)) {
-			rc = PTR_ERR(ep->rep_padbuf);
-			goto out0;
-		}
-	} else
-		ep->rep_padbuf = NULL;
-
 	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
 		"iovs: send %d recv %d\n",
 		__func__,
@@ -716,8 +712,6 @@ out2:
 	dprintk("RPC: %s: ib_destroy_cq returned %i\n",
 		__func__, err);
 out1:
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-out0:
 	if (ia->ri_dma_mr)
 		ib_dereg_mr(ia->ri_dma_mr);
 	return rc;
@@ -746,8 +740,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		ia->ri_id->qp = NULL;
 	}
 
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
 	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
 	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
 	if (rc)
@@ -1279,9 +1271,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 		 struct rpcrdma_ep *ep,
 		 struct rpcrdma_req *req)
 {
+	struct ib_device *device = ia->ri_device;
 	struct ib_send_wr send_wr, *send_wr_fail;
 	struct rpcrdma_rep *rep = req->rl_reply;
-	int rc;
+	struct ib_sge *iov = req->rl_send_iov;
+	int i, rc;
 
 	if (rep) {
 		rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1292,22 +1286,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 
 	send_wr.next = NULL;
 	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
-	send_wr.sg_list = req->rl_send_iov;
+	send_wr.sg_list = iov;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
-	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
-		ib_dma_sync_single_for_device(ia->ri_device,
-					      req->rl_send_iov[3].addr,
-					      req->rl_send_iov[3].length,
-					      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[1].addr,
-				      req->rl_send_iov[1].length,
-				      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[0].addr,
-				      req->rl_send_iov[0].length,
-				      DMA_TO_DEVICE);
+
+	for (i = 0; i < send_wr.num_sge; i++)
+		ib_dma_sync_single_for_device(device, iov[i].addr,
+					      iov[i].length, DMA_TO_DEVICE);
+	dprintk("RPC: %s: posting %d s/g entries\n",
+		__func__, send_wr.num_sge);
 
 	if (DECR_CQCOUNT(ep) > 0)
 		send_wr.send_flags = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 82190118b8d9..8422c09043b0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -88,7 +88,6 @@ struct rpcrdma_ep {
 	int			rep_connected;
 	struct ib_qp_init_attr	rep_attr;
 	wait_queue_head_t	rep_connect_wait;
-	struct rpcrdma_regbuf	*rep_padbuf;
 	struct rdma_conn_param	rep_remote_cma;
 	struct sockaddr_storage	rep_remote_addr;
 	struct delayed_work	rep_connect_worker;
@@ -255,16 +254,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 
+#define RPCRDMA_MAX_IOVS	(2)
+
 struct rpcrdma_req {
-	unsigned int	rl_niovs;	/* 0, 2 or 4 */
-	unsigned int	rl_nchunks;	/* non-zero if chunks */
-	unsigned int	rl_connect_cookie;	/* retry detection */
-	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	unsigned int		rl_niovs;
+	unsigned int		rl_nchunks;
+	unsigned int		rl_connect_cookie;
+	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
-	struct ib_sge	rl_send_iov[4];	/* for active requests */
+	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
 	struct rpcrdma_regbuf	*rl_rdmabuf;
 	struct rpcrdma_regbuf	*rl_sendbuf;
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 
 static inline struct rpcrdma_req *
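Closing note (illustrative, not part of the patch): with the pad SGE and the trailing-payload SGE gone, a marshaled request always fits in RPCRDMA_MAX_IOVS send SGEs, the RPC-over-RDMA header plus the pulled-up RPC message. The standalone sketch below models only that two-element shape; "sge" and "build_send_list" are invented names, not kernel code.

	/*
	 * Illustrative only: models the two-element send list that the
	 * marshaling path now always builds (header + pulled-up RPC
	 * message). Names are made up for this sketch.
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define MAX_IOVS 2	/* mirrors RPCRDMA_MAX_IOVS in the patch */

	struct sge {
		uint64_t addr;
		uint32_t length;
	};

	static int build_send_list(struct sge out[MAX_IOVS],
				   uint64_t hdr_addr, uint32_t hdrlen,
				   uint64_t rpc_addr, uint32_t rpclen)
	{
		out[0].addr = hdr_addr;		/* RPC-over-RDMA header */
		out[0].length = hdrlen;
		out[1].addr = rpc_addr;		/* entire RPC message after pullup */
		out[1].length = rpclen;
		return MAX_IOVS;		/* rl_niovs is now never 4 */
	}

	int main(void)
	{
		struct sge list[MAX_IOVS];
		int i, n = build_send_list(list, 0x1000, 28, 0x2000, 512);

		for (i = 0; i < n; i++)
			printf("sge[%d]: addr=0x%llx len=%u\n", i,
			       (unsigned long long)list[i].addr,
			       (unsigned)list[i].length);
		return 0;
	}

Tying the pieces together: the same constant sizes rl_send_iov[], caps the QP's max_send_sge, and gates endpoint creation when the provider cannot supply at least two send SGEs.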