summary | refs | log | tree | commit | diff | stats
path: root/net/sunrpc
diff options
context:
space:
mode:
authorChuck Lever <chuck.lever@oracle.com>2017-10-20 10:48:36 -0400
committerAnna Schumaker <Anna.Schumaker@Netapp.com>2017-11-17 13:47:57 -0500
commit01bb35c89d90abe6fd1c0be001f84bbdfa7fa7d1 (patch)
tree527e65074e175980f7ec757f189bd77f11f1b694 /net/sunrpc
parent0ba6f37012db2f88f881cd818aec6e1886f61abb (diff)
xprtrdma: RPC completion should wait for Send completion
When an RPC Call includes a file data payload, that payload can come from pages in the page cache, or a user buffer (for direct I/O). If the payload can fit inline, xprtrdma includes it in the Send using a scatter-gather technique. xprtrdma mustn't allow the RPC consumer to re-use the memory where that payload resides before the Send completes. Otherwise, the new contents of that memory would be exposed by an HCA retransmit of the Send operation. So, block RPC completion on Send completion, but only in the case where a separate file data payload is part of the Send. This prevents the reuse of that memory while it is still part of a Send operation without an undue cost to other cases. Waiting is avoided in the common case because typically the Send will have completed long before the RPC Reply arrives. These days, an RPC timeout will trigger a disconnect, which tears down the QP. The disconnect flushes all waiting Sends. This bounds the amount of time the reply handler has to wait for a Send completion. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c26
-rw-r--r--net/sunrpc/xprtrdma/transport.c5
-rw-r--r--net/sunrpc/xprtrdma/verbs.c3
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h4
4 files changed, 34 insertions, 4 deletions
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 853dede38900..4fdeaac6ebe6 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -534,6 +534,11 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc)
534 for (count = sc->sc_unmap_count; count; ++sge, --count) 534 for (count = sc->sc_unmap_count; count; ++sge, --count)
535 ib_dma_unmap_page(ia->ri_device, 535 ib_dma_unmap_page(ia->ri_device,
536 sge->addr, sge->length, DMA_TO_DEVICE); 536 sge->addr, sge->length, DMA_TO_DEVICE);
537
538 if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) {
539 smp_mb__after_atomic();
540 wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES);
541 }
537} 542}
538 543
539/* Prepare an SGE for the RPC-over-RDMA transport header. 544/* Prepare an SGE for the RPC-over-RDMA transport header.
@@ -667,6 +672,8 @@ map_tail:
667 672
668out: 673out:
669 sc->sc_wr.num_sge += sge_no; 674 sc->sc_wr.num_sge += sge_no;
675 if (sc->sc_unmap_count)
676 __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
670 return true; 677 return true;
671 678
672out_regbuf: 679out_regbuf:
@@ -704,6 +711,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
704 return -ENOBUFS; 711 return -ENOBUFS;
705 req->rl_sendctx->sc_wr.num_sge = 0; 712 req->rl_sendctx->sc_wr.num_sge = 0;
706 req->rl_sendctx->sc_unmap_count = 0; 713 req->rl_sendctx->sc_unmap_count = 0;
714 req->rl_sendctx->sc_req = req;
715 __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
707 716
708 if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) 717 if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen))
709 return -EIO; 718 return -EIO;
@@ -1305,6 +1314,20 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
1305 if (!list_empty(&req->rl_registered)) 1314 if (!list_empty(&req->rl_registered))
1306 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, 1315 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
1307 &req->rl_registered); 1316 &req->rl_registered);
1317
1318 /* Ensure that any DMA mapped pages associated with
1319 * the Send of the RPC Call have been unmapped before
1320 * allowing the RPC to complete. This protects argument
1321 * memory not controlled by the RPC client from being
1322 * re-used before we're done with it.
1323 */
1324 if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
1325 r_xprt->rx_stats.reply_waits_for_send++;
1326 out_of_line_wait_on_bit(&req->rl_flags,
1327 RPCRDMA_REQ_F_TX_RESOURCES,
1328 bit_wait,
1329 TASK_UNINTERRUPTIBLE);
1330 }
1308} 1331}
1309 1332
1310/* Reply handling runs in the poll worker thread. Anything that 1333/* Reply handling runs in the poll worker thread. Anything that
@@ -1384,7 +1407,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1384 dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", 1407 dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
1385 __func__, rep, req, be32_to_cpu(rep->rr_xid)); 1408 __func__, rep, req, be32_to_cpu(rep->rr_xid));
1386 1409
1387 if (list_empty(&req->rl_registered)) 1410 if (list_empty(&req->rl_registered) &&
1411 !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags))
1388 rpcrdma_complete_rqst(rep); 1412 rpcrdma_complete_rqst(rep);
1389 else 1413 else
1390 queue_work(rpcrdma_receive_wq, &rep->rr_work); 1414 queue_work(rpcrdma_receive_wq, &rep->rr_work);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 35aefe201848..9fdd11e4758c 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -789,12 +789,13 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
789 r_xprt->rx_stats.failed_marshal_count, 789 r_xprt->rx_stats.failed_marshal_count,
790 r_xprt->rx_stats.bad_reply_count, 790 r_xprt->rx_stats.bad_reply_count,
791 r_xprt->rx_stats.nomsg_call_count); 791 r_xprt->rx_stats.nomsg_call_count);
792 seq_printf(seq, "%lu %lu %lu %lu %lu\n", 792 seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n",
793 r_xprt->rx_stats.mrs_recovered, 793 r_xprt->rx_stats.mrs_recovered,
794 r_xprt->rx_stats.mrs_orphaned, 794 r_xprt->rx_stats.mrs_orphaned,
795 r_xprt->rx_stats.mrs_allocated, 795 r_xprt->rx_stats.mrs_allocated,
796 r_xprt->rx_stats.local_inv_needed, 796 r_xprt->rx_stats.local_inv_needed,
797 r_xprt->rx_stats.empty_sendctx_q); 797 r_xprt->rx_stats.empty_sendctx_q,
798 r_xprt->rx_stats.reply_waits_for_send);
798} 799}
799 800
800static int 801static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index bab63adf070b..9a824fe8ffc2 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1526,7 +1526,8 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1526 dprintk("RPC: %s: posting %d s/g entries\n", 1526 dprintk("RPC: %s: posting %d s/g entries\n",
1527 __func__, send_wr->num_sge); 1527 __func__, send_wr->num_sge);
1528 1528
1529 if (!ep->rep_send_count) { 1529 if (!ep->rep_send_count ||
1530 test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
1530 send_wr->send_flags |= IB_SEND_SIGNALED; 1531 send_wr->send_flags |= IB_SEND_SIGNALED;
1531 ep->rep_send_count = ep->rep_send_batch; 1532 ep->rep_send_count = ep->rep_send_batch;
1532 } else { 1533 } else {
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c260475baa36..bccd5d8b9384 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -236,11 +236,13 @@ struct rpcrdma_rep {
236 236
237/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes 237/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
238 */ 238 */
239struct rpcrdma_req;
239struct rpcrdma_xprt; 240struct rpcrdma_xprt;
240struct rpcrdma_sendctx { 241struct rpcrdma_sendctx {
241 struct ib_send_wr sc_wr; 242 struct ib_send_wr sc_wr;
242 struct ib_cqe sc_cqe; 243 struct ib_cqe sc_cqe;
243 struct rpcrdma_xprt *sc_xprt; 244 struct rpcrdma_xprt *sc_xprt;
245 struct rpcrdma_req *sc_req;
244 unsigned int sc_unmap_count; 246 unsigned int sc_unmap_count;
245 struct ib_sge sc_sges[]; 247 struct ib_sge sc_sges[];
246}; 248};
@@ -387,6 +389,7 @@ struct rpcrdma_req {
387enum { 389enum {
388 RPCRDMA_REQ_F_BACKCHANNEL = 0, 390 RPCRDMA_REQ_F_BACKCHANNEL = 0,
389 RPCRDMA_REQ_F_PENDING, 391 RPCRDMA_REQ_F_PENDING,
392 RPCRDMA_REQ_F_TX_RESOURCES,
390}; 393};
391 394
392static inline void 395static inline void
@@ -492,6 +495,7 @@ struct rpcrdma_stats {
492 /* accessed when receiving a reply */ 495 /* accessed when receiving a reply */
493 unsigned long long total_rdma_reply; 496 unsigned long long total_rdma_reply;
494 unsigned long long fixup_copy_count; 497 unsigned long long fixup_copy_count;
498 unsigned long reply_waits_for_send;
495 unsigned long local_inv_needed; 499 unsigned long local_inv_needed;
496 unsigned long nomsg_call_count; 500 unsigned long nomsg_call_count;
497 unsigned long bcall_count; 501 unsigned long bcall_count;