author	Chuck Lever <chuck.lever@oracle.com>	2017-10-20 10:48:12 -0400
committer	Anna Schumaker <Anna.Schumaker@Netapp.com>	2017-11-17 13:47:56 -0500
commit	ae72950abf99fb250aca972b3451b6e06a096c68 (patch)
tree	101982d10909e16b45ff8491881e6490b5b18635 /net/sunrpc/xprtrdma/xprt_rdma.h
parent	a062a2a3efc5fece106d96d4a5165f3f23b5cbda (diff)
xprtrdma: Add data structure to manage RDMA Send arguments
Problem statement:

Recently Sagi Grimberg <sagi@grimberg.me> observed that kernel RDMA-enabled storage initiators don't handle delayed Send completion correctly. If Send completion is delayed beyond the end of a ULP transaction, the ULP may release resources that are still being used by the HCA to complete a long-running Send operation.

This is a common design trait amongst our initiators. Most Send operations are faster than the ULP transaction they are part of. Waiting for a completion for these is typically unnecessary.

Infrequently, a network partition or some other problem crops up where an ordering problem can occur. In NFS parlance, the RPC Reply arrives and completes the RPC, but the HCA is still retrying the Send WR that conveyed the RPC Call. In this case, the HCA can try to use memory that has been invalidated or DMA unmapped, and the connection is lost. If that memory has been re-used for something else (possibly not related to NFS), the Send retransmission exposes that data on the wire.

Thus we cannot assume that it is safe to release Send-related resources just because a ULP reply has arrived.

After some analysis, we have determined that the completion housekeeping will not be difficult for xprtrdma:

- Inline Send buffers are registered via the local DMA key, and are already left DMA mapped for the lifetime of a transport connection, thus no additional handling is necessary for those

- Gathered Sends involving page cache pages _will_ need to DMA unmap those pages after the Send completes. But like inline Send buffers, they are registered via the local DMA key, and thus will not need to be invalidated

In addition, RPC completion will need to wait for Send completion in the latter case. However, nearly always, the Send that conveys the RPC Call will have completed long before the RPC Reply arrives, and thus no additional latency will be accrued.

Design notes:

In this patch, the rpcrdma_sendctx object is introduced, and a lock-free circular queue is added to manage a set of them per transport.

The RPC client's send path already prevents sending more than one RPC Call at the same time. This allows us to treat the consumer side of the queue (rpcrdma_sendctx_get_locked) as if there is a single consumer thread.

The producer side of the queue (rpcrdma_sendctx_put_locked) is invoked only from the Send completion handler, which is a single thread of execution (soft IRQ).

The only care that needs to be taken is with the tail index, which is shared between the producer and consumer. Only the producer updates the tail index. The consumer compares the head with the tail to ensure that a sendctx that is still in use is never handed out again (or, expressed more conventionally, that the queue is empty).

When the sendctx queue empties completely, there are enough Sends outstanding that posting more Send operations can result in a Send Queue overflow. In this case, the ULP is told to wait and try again. This introduces strong Send Queue accounting to xprtrdma.

As a final touch, Jason Gunthorpe <jgunthorpe@obsidianresearch.com> suggested a mechanism that does not require signaling every Send: we signal once every N Sends, and perform the SGE unmapping for all N Send operations during that one completion.

Reported-by: Sagi Grimberg <sagi@grimberg.me>
Suggested-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
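To make the design notes concrete, below is a minimal user-space sketch of the single-producer/single-consumer ring described above. The queue fields are modeled on the rb_sc_head, rb_sc_tail and rb_sc_ctxs members introduced by this patch, but everything else here is illustrative, not the transport's actual code: the sendctx_queue type and helper names are invented for the example, and C11 atomics stand in for the kernel's memory-ordering primitives.

/* Illustrative sketch only -- not the kernel implementation. */
#include <stdatomic.h>
#include <stddef.h>

struct sendctx;				/* stands in for struct rpcrdma_sendctx */

struct sendctx_queue {
	unsigned long		head;	/* advanced only by the consumer (RPC send path) */
	_Atomic unsigned long	tail;	/* advanced only by the producer (Send completion) */
	unsigned long		last;	/* highest valid index into ctxs[] */
	struct sendctx		**ctxs;
};

static unsigned long queue_next(const struct sendctx_queue *q, unsigned long i)
{
	return (i == q->last) ? 0 : i + 1;
}

/* Consumer side: grab a free sendctx before posting a Send.
 * Returns NULL when the ring is empty, i.e. every sendctx is still
 * owned by the HCA; the caller must back off and try again.
 */
static struct sendctx *sendctx_get(struct sendctx_queue *q)
{
	unsigned long next_head = queue_next(q, q->head);

	if (next_head == atomic_load_explicit(&q->tail, memory_order_acquire))
		return NULL;

	q->head = next_head;
	return q->ctxs[next_head];
}

/* Producer side: called from the (single-threaded) Send completion
 * handler for a signaled Send. Releases every sendctx up to and
 * including @sc, which also covers the unsignaled Sends posted since
 * the previous signaled completion.
 */
static void sendctx_put(struct sendctx_queue *q, struct sendctx *sc)
{
	unsigned long tail = atomic_load_explicit(&q->tail, memory_order_relaxed);

	do {
		tail = queue_next(q, tail);
		/* ... DMA unmap the SGEs recorded in q->ctxs[tail] ... */
	} while (q->ctxs[tail] != sc);

	/* Publish the new tail only after the unmap work is done. */
	atomic_store_explicit(&q->tail, tail, memory_order_release);
}

Because only the send path advances the head and only the completion handler advances the tail, no lock is required; the acquire/release pair on the tail supplies the ordering the kernel code obtains by its own means. A NULL return from the get side corresponds to the empty-queue case described above, which is presumably what the new empty_sendctx_q counter records.

The completion-batching idea can be sketched the same way. The rep_send_count and rep_send_batch fields added to rpcrdma_ep suggest a simple countdown that signals one Send out of every batch; the helper below is an invented illustration of such a policy (the real decision is made when the Send WR is posted, which is outside this header).

#include <stdbool.h>

/* Decide whether the Send about to be posted should be signaled.
 * Assumed policy: signal one Send out of every (batch + 1), so a
 * single completion can clean up the whole run of unsignaled Sends.
 */
static bool send_should_signal(unsigned int *send_count, unsigned int batch)
{
	if (*send_count == 0) {		/* batch exhausted, or batching disabled (batch == 0) */
		*send_count = batch;	/* start a new batch */
		return true;		/* set IB_SEND_SIGNALED on this WR */
	}
	--(*send_count);
	return false;			/* leave this Send unsignaled */
}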
Diffstat (limited to 'net/sunrpc/xprtrdma/xprt_rdma.h')
-rw-r--r--	net/sunrpc/xprtrdma/xprt_rdma.h	38
1 file changed, 33 insertions(+), 5 deletions(-)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0b8ca5e5c706..537cfabe47d1 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -93,6 +93,8 @@ enum {
  */
 
 struct rpcrdma_ep {
+	unsigned int		rep_send_count;
+	unsigned int		rep_send_batch;
 	atomic_t		rep_cqcount;
 	int			rep_cqinit;
 	int			rep_connected;
@@ -232,6 +234,27 @@ struct rpcrdma_rep {
 	struct ib_recv_wr	rr_recv_wr;
 };
 
+/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
+ */
+struct rpcrdma_xprt;
+struct rpcrdma_sendctx {
+	struct ib_send_wr	sc_wr;
+	struct ib_cqe		sc_cqe;
+	struct rpcrdma_xprt	*sc_xprt;
+	unsigned int		sc_unmap_count;
+	struct ib_sge		sc_sges[];
+};
+
+/* Limit the number of SGEs that can be unmapped during one
+ * Send completion. This caps the amount of work a single
+ * completion can do before returning to the provider.
+ *
+ * Setting this to zero disables Send completion batching.
+ */
+enum {
+	RPCRDMA_MAX_SEND_BATCH = 7,
+};
+
 /*
  * struct rpcrdma_mw - external memory region metadata
  *
@@ -343,19 +366,16 @@ enum {
 struct rpcrdma_buffer;
 struct rpcrdma_req {
 	struct list_head	rl_list;
-	unsigned int		rl_mapped_sges;
 	unsigned int		rl_connect_cookie;
 	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;
 	struct xdr_stream	rl_stream;
 	struct xdr_buf		rl_hdrbuf;
-	struct ib_send_wr	rl_send_wr;
-	struct ib_sge		rl_send_sge[RPCRDMA_MAX_SEND_SGES];
+	struct rpcrdma_sendctx	*rl_sendctx;
 	struct rpcrdma_regbuf	*rl_rdmabuf;	/* xprt header */
 	struct rpcrdma_regbuf	*rl_sendbuf;	/* rq_snd_buf */
 	struct rpcrdma_regbuf	*rl_recvbuf;	/* rq_rcv_buf */
 
-	struct ib_cqe		rl_cqe;
 	struct list_head	rl_all;
 	bool			rl_backchannel;
 
@@ -402,6 +422,11 @@ struct rpcrdma_buffer {
 	struct list_head	rb_mws;
 	struct list_head	rb_all;
 
+	unsigned long		rb_sc_head;
+	unsigned long		rb_sc_tail;
+	unsigned long		rb_sc_last;
+	struct rpcrdma_sendctx	**rb_sc_ctxs;
+
 	spinlock_t		rb_lock;	/* protect buf lists */
 	int			rb_send_count, rb_recv_count;
 	struct list_head	rb_send_bufs;
@@ -456,6 +481,7 @@ struct rpcrdma_stats {
 	unsigned long		mrs_recovered;
 	unsigned long		mrs_orphaned;
 	unsigned long		mrs_allocated;
+	unsigned long		empty_sendctx_q;
 
 	/* accessed when receiving a reply */
 	unsigned long long	total_rdma_reply;
@@ -557,6 +583,8 @@ struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
 void rpcrdma_destroy_req(struct rpcrdma_req *);
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf);
+void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
 
 struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
 void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
@@ -617,7 +645,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 			      struct rpcrdma_req *req, u32 hdrlen,
 			      struct xdr_buf *xdr,
 			      enum rpcrdma_chunktype rtype);
-void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
+void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc);
 int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);