author    Chuck Lever <chuck.lever@oracle.com>    2017-12-04 14:04:04 -0500
committer Anna Schumaker <Anna.Schumaker@Netapp.com>    2017-12-15 14:31:50 -0500
commit    ccede7598588ae344143f82fb763912535648d58 (patch)
tree      293d68c7698b8545a6ef064c6ab9d8138e855fdd
parent    c156618e15101a9cc8c815108fec0300a0ec6637 (diff)
xprtrdma: Spread reply processing over more CPUs
Commit d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler directly from RECV completion") introduced a performance regression for NFS I/O small enough to not need memory registration. In multi-threaded benchmarks that generate primarily small I/O requests, IOPS throughput is reduced by nearly a third. This patch restores the previous level of throughput.

Because workqueues are typically BOUND (in particular ib_comp_wq, nfsiod_workqueue, and rpciod_workqueue), NFS/RDMA workloads tend to aggregate on the CPU that is handling Receive completions.

The usual approach to addressing this problem is to create a QP and CQ for each CPU, and then schedule transactions on the QP for the CPU where you want the transaction to complete. The transaction then does not require an extra context switch during completion to end up on the same CPU where the transaction was started.

This approach doesn't work for the Linux NFS/RDMA client because currently the Linux NFS client does not support multiple connections per client-server pair, and the RDMA core API does not make it straightforward for ULPs to determine which CPU is responsible for handling Receive completions for a CQ.

So for the moment, record the CPU number in the rpcrdma_req before the transport sends each RPC Call. Then during Receive completion, queue the RPC completion on that same CPU.

Additionally, move all RPC completion processing to the deferred handler so that even RPCs with simple small replies complete on the CPU that sent the corresponding RPC Call.

Fixes: d8f532d20ee4 ("xprtrdma: Invoke rpcrdma_reply_handler ...")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
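The essence of the change is a small, reusable pattern: record the submitting CPU in the request when the RPC Call is sent, then use queue_work_on() rather than queue_work() so deferred reply processing runs on that same CPU. The sketch below illustrates the pattern in isolation; all demo_* names are hypothetical, not xprtrdma code, and the real change is in the diff that follows.

/*
 * Minimal sketch of the CPU-steering pattern this patch applies.
 * Assumes a kernel build environment; all demo_* names are
 * illustrative only.
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

struct demo_req {
	int			cpu;	/* CPU that issued the RPC Call */
	struct work_struct	work;	/* deferred reply processing */
};

/*
 * The workqueue must be BOUND (no WQ_UNBOUND) so that
 * queue_work_on() can target a specific CPU's worker pool --
 * the same reason this patch drops WQ_UNBOUND in verbs.c.
 */
static struct workqueue_struct *demo_wq;

static void demo_reply_work(struct work_struct *work)
{
	struct demo_req *req = container_of(work, struct demo_req, work);

	/* Runs on req->cpu, where the call was originally submitted. */
	pr_info("reply handled on CPU %d (sent from CPU %d)\n",
		smp_processor_id(), req->cpu);
}

static int demo_wq_alloc(void)
{
	demo_wq = alloc_workqueue("demo", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	return demo_wq ? 0 : -ENOMEM;
}

/* Send side: remember the submitting CPU. It is only a locality
 * hint, so an occasional stale value is harmless. */
static void demo_send(struct demo_req *req)
{
	INIT_WORK(&req->work, demo_reply_work);
	req->cpu = smp_processor_id();
	/* ... post the RPC Call ... */
}

/* Receive completion: steer reply processing back to the sending
 * CPU instead of whichever CPU took the RECV completion. */
static void demo_receive_done(struct demo_req *req)
{
	queue_work_on(req->cpu, demo_wq, &req->work);
}

The recorded CPU is only a hint: if the submitting thread has since migrated, the reply is still processed correctly, just on the recorded CPU rather than the thread's current one.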
-rw-r--r--    net/sunrpc/xprtrdma/rpc_rdma.c    6
-rw-r--r--    net/sunrpc/xprtrdma/transport.c   2
-rw-r--r--    net/sunrpc/xprtrdma/verbs.c       2
-rw-r--r--    net/sunrpc/xprtrdma/xprt_rdma.h   1
4 files changed, 5 insertions, 6 deletions
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index ed34dc0f144c..a3f2ab283aeb 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1408,11 +1408,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
 		__func__, rep, req, be32_to_cpu(rep->rr_xid));
 
-	if (list_empty(&req->rl_registered) &&
-	    !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags))
-		rpcrdma_complete_rqst(rep);
-	else
-		queue_work(rpcrdma_receive_wq, &rep->rr_work);
+	queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work);
 	return;
 
 out_badstatus:
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 646c24494ea7..6ee1ad8978f3 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -52,6 +52,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/sunrpc/addr.h>
+#include <linux/smp.h>
 
 #include "xprt_rdma.h"
 
@@ -656,6 +657,7 @@ xprt_rdma_allocate(struct rpc_task *task)
 		task->tk_pid, __func__, rqst->rq_callsize,
 		rqst->rq_rcvsize, req);
 
+	req->rl_cpu = smp_processor_id();
 	req->rl_connect_cookie = 0;	/* our reserved value */
 	rpcrdma_set_xprtdata(rqst, req);
 	rqst->rq_buffer = req->rl_sendbuf->rg_base;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 710b3f77db82..8607c029c0dd 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -83,7 +83,7 @@ rpcrdma_alloc_wq(void)
 	struct workqueue_struct *recv_wq;
 
 	recv_wq = alloc_workqueue("xprtrdma_receive",
-				  WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
+				  WQ_MEM_RECLAIM | WQ_HIGHPRI,
 				  0);
 	if (!recv_wq)
 		return -ENOMEM;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 51686d9eac5f..1342f743f1c4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -342,6 +342,7 @@ enum {
 struct rpcrdma_buffer;
 struct rpcrdma_req {
 	struct list_head rl_list;
+	int rl_cpu;
 	unsigned int rl_connect_cookie;
 	struct rpcrdma_buffer *rl_buffer;
 	struct rpcrdma_rep *rl_reply;