author		Chuck Lever <chuck.lever@oracle.com>	2016-09-15 10:55:53 -0400
committer	Anna Schumaker <Anna.Schumaker@Netapp.com>	2016-09-19 13:08:37 -0400
commit		9c40c49f145f8999ecbf81683aeb31d92b61b966
tree		8a21617f37ce7c8b41208c87df7a77f52705b2f5 /net/sunrpc
parent		5a6d1db4556940533f1a5b6521e522f3e46508ed
xprtrdma: Initialize separate RPC call and reply buffers
RPC-over-RDMA needs to separate its RPC call and reply buffers.

o When an RPC Call is sent, rq_snd_buf is DMA mapped for an RDMA
  Send operation using DMA_TO_DEVICE

o If the client expects a large RPC reply, it DMA maps rq_rcv_buf
  as part of a Reply chunk using DMA_FROM_DEVICE

The two mappings are for data movement in opposite directions.

DMA-API.txt suggests that if these mappings share a DMA cacheline,
bad things can happen. This could occur in the final bytes of
rq_snd_buf and the first bytes of rq_rcv_buf if the two buffers
happen to share a DMA cacheline.

On x86_64 the cacheline size is typically 8 bytes, and RPC call
messages are usually much smaller than the send buffer, so this
hasn't been a noticeable problem. But the DMA cacheline size can be
larger on other platforms.

Also, often rq_rcv_buf starts most of the way into a page, thus an
additional RDMA segment is needed to map and register the end of
that buffer. Try to avoid that scenario to reduce the cost of
registering and invalidating Reply chunks.

Instead of carrying a single regbuf that covers both rq_snd_buf and
rq_rcv_buf, each struct rpcrdma_req now carries one regbuf for
rq_snd_buf and one regbuf for rq_rcv_buf.

Some incidental changes worth noting:

- To clear out some spaghetti, refactor xprt_rdma_allocate.
- The value stored in rg_size is the same as the value stored in
  the iov.length field, so eliminate rg_size.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
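To make the DMA-API constraint above concrete, here is a minimal, self-contained
sketch (hypothetical code, not part of this patch): two separately kmalloc'd
buffers are mapped in opposite directions with the core DMA API, so the tail of
the send buffer and the head of the receive buffer can no longer fall in the
same DMA cacheline, which is the hazard described above for rq_snd_buf and
rq_rcv_buf when both live in one contiguous allocation.

/* Hypothetical illustration of the DMA-API.txt constraint discussed
 * above; names and structure are invented for this sketch and are not
 * xprtrdma code.
 */
#include <linux/slab.h>
#include <linux/dma-mapping.h>

struct demo_bufs {
	void		*snd;		/* mapped DMA_TO_DEVICE when a Call is sent */
	void		*rcv;		/* mapped DMA_FROM_DEVICE for a Reply chunk */
	dma_addr_t	snd_dma;
	dma_addr_t	rcv_dma;
};

static int demo_alloc_bufs(struct device *dev, struct demo_bufs *bufs,
			   size_t callsize, size_t rcvsize)
{
	/* Two separate allocations: distinct slab objects cannot overlap,
	 * and kmalloc aligns each to at least ARCH_KMALLOC_MINALIGN (which
	 * tracks the DMA alignment requirement on platforms that need it),
	 * so the end of snd and the start of rcv cannot share a DMA
	 * cacheline the way the tail and head of one contiguous buffer can.
	 */
	bufs->snd = kmalloc(callsize, GFP_KERNEL);
	bufs->rcv = kmalloc(rcvsize, GFP_KERNEL);
	if (!bufs->snd || !bufs->rcv)
		goto out_free;

	/* The two mappings move data in opposite directions */
	bufs->snd_dma = dma_map_single(dev, bufs->snd, callsize, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, bufs->snd_dma))
		goto out_free;
	bufs->rcv_dma = dma_map_single(dev, bufs->rcv, rcvsize, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, bufs->rcv_dma))
		goto out_unmap;
	return 0;

out_unmap:
	dma_unmap_single(dev, bufs->snd_dma, callsize, DMA_TO_DEVICE);
out_free:
	kfree(bufs->rcv);
	kfree(bufs->snd);
	return -ENOMEM;
}

The patch itself achieves the same separation with rpcrdma_alloc_regbuf, which
also carries the ib_sge (including the local DMA lkey) needed when the buffers
are posted for Send or registered as a Reply chunk.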
Diffstat (limited to 'net/sunrpc')
-rw-r--r--	net/sunrpc/xprtrdma/transport.c	150
-rw-r--r--	net/sunrpc/xprtrdma/verbs.c	  2
-rw-r--r--	net/sunrpc/xprtrdma/xprt_rdma.h	  6
3 files changed, 99 insertions(+), 59 deletions(-)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index d83bffa92dfc..ecdc3ad7dbb6 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -477,6 +477,86 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 	}
 }
 
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
+ */
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    gfp_t flags)
+{
+	size_t size = r_xprt->rx_data.inline_wsize;
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_rdmabuf)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_rdmabuf = rb;
+	return true;
+}
+
+/* RPC/RDMA marshaling may choose to send payload bearing ops inline,
+ * if the resulting Call message is smaller than the inline threshold.
+ * The value of the "rq_callsize" argument accounts for RPC header
+ * requirements, but not for the data payload in these cases.
+ *
+ * See rpcrdma_inline_pullup.
+ */
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+	size_t min_size;
+
+	if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+		return true;
+
+	min_size = max_t(size_t, size, r_xprt->rx_data.inline_wsize);
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
+	r_xprt->rx_stats.hardway_register_count += min_size;
+	req->rl_sendbuf = rb;
+	return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
+ *
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
+ *
+ * A regbuf is used here to remember the buffer size.
+ */
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_recvbuf);
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_recvbuf = rb;
+	return true;
+}
+
 /**
  * xprt_rdma_allocate - allocate transport resources for an RPC
  * @task: RPC task
@@ -487,22 +567,18 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
  * EIO: A permanent error occurred, do not retry
  *
  * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
  *
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+ * xprt_rdma_allocate provides buffers that are already mapped for
+ * DMA, and a local DMA lkey is provided for each.
  */
 static int
 xprt_rdma_allocate(struct rpc_task *task)
 {
 	struct rpc_rqst *rqst = task->tk_rqstp;
-	size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-	struct rpcrdma_regbuf *rb;
 	struct rpcrdma_req *req;
-	size_t min_size;
 	gfp_t flags;
 
 	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
@@ -513,59 +589,23 @@ xprt_rdma_allocate(struct rpc_task *task)
 	if (RPC_IS_SWAPPER(task))
 		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
 
-	if (req->rl_rdmabuf == NULL)
-		goto out_rdmabuf;
-	if (req->rl_sendbuf == NULL)
-		goto out_sendbuf;
-	if (size > req->rl_sendbuf->rg_size)
-		goto out_sendbuf;
+	if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
+		goto out_fail;
+	if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+		goto out_fail;
+	if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
+		goto out_fail;
+
+	dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+		task->tk_pid, __func__, rqst->rq_callsize,
+		rqst->rq_rcvsize, req);
 
-out:
-	dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
 	req->rl_connect_cookie = 0;	/* our reserved value */
 	rpcrdma_set_xprtdata(rqst, req);
 	rqst->rq_buffer = req->rl_sendbuf->rg_base;
-	rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize;
+	rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
 	return 0;
 
-out_rdmabuf:
-	min_size = r_xprt->rx_data.inline_wsize;
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
-	if (IS_ERR(rb))
-		goto out_fail;
-	req->rl_rdmabuf = rb;
-
-out_sendbuf:
-	/* XDR encoding and RPC/RDMA marshaling of this request has not
-	 * yet occurred. Thus a lower bound is needed to prevent buffer
-	 * overrun during marshaling.
-	 *
-	 * RPC/RDMA marshaling may choose to send payload bearing ops
-	 * inline, if the result is smaller than the inline threshold.
-	 * The value of the "size" argument accounts for header
-	 * requirements but not for the payload in these cases.
-	 *
-	 * Likewise, allocate enough space to receive a reply up to the
-	 * size of the inline threshold.
-	 *
-	 * It's unlikely that both the send header and the received
-	 * reply will be large, but slush is provided here to allow
-	 * flexibility when marshaling.
-	 */
-	min_size = r_xprt->rx_data.inline_rsize;
-	min_size += r_xprt->rx_data.inline_wsize;
-	if (size < min_size)
-		size = min_size;
-
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
-	if (IS_ERR(rb))
-		goto out_fail;
-
-	r_xprt->rx_stats.hardway_register_count += size;
-	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
-	req->rl_sendbuf = rb;
-	goto out;
-
 out_fail:
 	rpcrdma_buffer_put(req);
 	return -ENOMEM;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 93def0bf07af..fc6b4ea8b7ec 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -975,6 +975,7 @@ rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
 void
 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 {
+	rpcrdma_free_regbuf(ia, req->rl_recvbuf);
 	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
 	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
 	kfree(req);
@@ -1209,7 +1210,6 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
 
 	iov->length = size;
 	iov->lkey = ia->ri_pd->local_dma_lkey;
-	rb->rg_size = size;
 	return rb;
 
 out_free:
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 484855eddb85..444f6370d46c 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -112,7 +112,6 @@ struct rpcrdma_ep {
  */
 
 struct rpcrdma_regbuf {
-	size_t			rg_size;
 	struct ib_sge		rg_iov;
 	__be32			rg_base[0] __attribute__ ((aligned(256)));
 };
@@ -285,8 +284,9 @@ struct rpcrdma_req {
 	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
-	struct rpcrdma_regbuf	*rl_rdmabuf;
-	struct rpcrdma_regbuf	*rl_sendbuf;
+	struct rpcrdma_regbuf	*rl_rdmabuf;	/* xprt header */
+	struct rpcrdma_regbuf	*rl_sendbuf;	/* rq_snd_buf */
+	struct rpcrdma_regbuf	*rl_recvbuf;	/* rq_rcv_buf */
 
 	struct ib_cqe		rl_cqe;
 	struct list_head	rl_all;