author		Chuck Lever <chuck.lever@oracle.com>		2015-08-03 13:03:39 -0400
committer	Anna Schumaker <Anna.Schumaker@Netapp.com>	2015-08-05 16:21:27 -0400
commit		b3221d6a53c44cd572a3a400abdd1e2a24bea587
tree		52a48a8a88cd0b9c1a5b92f5797421f1aa5e642c
parent		d1ed857e5707e073973cfb1b8df801053a356518
xprtrdma: Remove logic that constructs RDMA_MSGP type calls
RDMA_MSGP type calls insert a zero pad in the middle of the RPC
message to align the RPC request's data payload to the server's
alignment preferences. A server can then "page flip" the payload
into place to avoid a data copy in certain circumstances. However:
 1. The client has to have a priori knowledge of the server's
    preferred alignment

 2. Requests eligible for RDMA_MSGP are requests that are small
    enough to have been sent inline, and convey a data payload
    at the _end_ of the RPC message
Today, criterion 1 is handled with a sysctl, a global setting that
is copied during mount. Linux does not support the Connection
Configuration Protocol (CCP, RFC 5666, Section 6) for querying the
server's preferences.
A small-ish NFSv3 WRITE might use RDMA_MSGP, but no NFSv4
compound fits criterion 2.
Thus the Linux client currently leaves RDMA_MSGP disabled. The
Linux server handles RDMA_MSGP, but does not use any special
page flipping, so it confers no benefit.
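For reference, the part of the transport header that differs between
the two call types looks roughly like the sketch below. This is an
abbreviated, illustrative declaration only: the field names (rm_type,
rm_nochunks.rm_empty, rm_padded.rm_align/rm_thresh/rm_pempty) are
taken from the code removed by this patch, while the full rpcrdma_msg
layout in xprt_rdma.h is elided and the struct name is invented for
the sketch.

	/* Illustrative sketch, not the real definition (see xprt_rdma.h). */
	struct rpcrdma_hdr_sketch {
		__be32 rm_xid;
		__be32 rm_vers;
		__be32 rm_credit;
		__be32 rm_type;			/* rdma_msg or rdma_msgp */
		union {
			struct {
				__be32 rm_empty[3];	/* empty chunk lists */
			} rm_nochunks;			/* RDMA_MSG */
			struct {
				__be32 rm_align;	/* server's preferred alignment */
				__be32 rm_thresh;	/* padding threshold */
				__be32 rm_pempty[3];	/* empty chunk lists */
			} rm_padded;			/* RDMA_MSGP */
		} rm_body;
	};

The two extra words (rm_align, rm_thresh) are why the removed code
added "2 * sizeof(u32)" to hdrlen when building an RDMA_MSGP call.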
Clean up the marshaling code by removing the logic that constructs
RDMA_MSGP type calls. This also reduces the maximum send iovec size
from four to just two elements.
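After this change, an inline call posts exactly two SGEs: the
RPC-over-RDMA header and the pulled-up RPC message. A minimal sketch
of what the marshaling code now sets up, using names from the patch
below (the header SGE lines are diff context not shown here, so their
exact form is an assumption; chunk-list marshaling and error handling
are elided):

	/* Sketch only: the two remaining send SGEs. */
	req->rl_send_iov[0].addr   = rdmab_addr(req->rl_rdmabuf);	/* transport header */
	req->rl_send_iov[0].length = hdrlen;
	req->rl_send_iov[0].lkey   = rdmab_lkey(req->rl_rdmabuf);

	req->rl_send_iov[1].addr   = rdmab_addr(req->rl_sendbuf);	/* pulled-up RPC message */
	req->rl_send_iov[1].length = rpclen;
	req->rl_send_iov[1].lkey   = rdmab_lkey(req->rl_sendbuf);

	req->rl_niovs = 2;	/* RPCRDMA_MAX_IOVS */

rpcrdma_ep_post() then DMA-syncs each of these SGEs in a short loop
instead of open-coding up to four syncs.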
/proc/sys/sunrpc/rdma_inline_write_padding is a kernel API, and
thus is left in place.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Devesh Sharma <devesh.sharma@avagotech.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
-rw-r--r--	net/sunrpc/xprtrdma/rpc_rdma.c	|  92
-rw-r--r--	net/sunrpc/xprtrdma/verbs.c	|  47
-rw-r--r--	net/sunrpc/xprtrdma/xprt_rdma.h	|  19
3 files changed, 51 insertions(+), 107 deletions(-)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 84ea37daef36..8e9c56429ada 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -297,8 +297,7 @@ out:
  * pre-registered memory buffer for this request. For small amounts
  * of data, this is efficient. The cutoff value is tunable.
  */
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
 {
 	int i, npages, curlen;
 	int copy_len;
@@ -310,16 +309,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 	destp = rqst->rq_svec[0].iov_base;
 	curlen = rqst->rq_svec[0].iov_len;
 	destp += curlen;
-	/*
-	 * Do optional padding where it makes sense. Alignment of write
-	 * payload can help the server, if our setting is accurate.
-	 */
-	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
-	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
-		pad = 0;	/* don't pad this request */
 
-	dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
-		__func__, pad, destp, rqst->rq_slen, curlen);
+	dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
+		__func__, destp, rqst->rq_slen, curlen);
 
 	copy_len = rqst->rq_snd_buf.page_len;
 
@@ -355,7 +347,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
 		page_base = 0;
 	}
 	/* header now contains entire send message */
-	return pad;
 }
 
 /*
@@ -380,7 +371,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	char *base;
-	size_t rpclen, padlen;
+	size_t rpclen;
 	ssize_t hdrlen;
 	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
@@ -458,7 +449,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	}
 
 	hdrlen = RPCRDMA_HDRLEN_MIN;
-	padlen = 0;
 
 	/*
 	 * Pull up any extra send data into the preregistered buffer.
@@ -467,43 +457,24 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 */
 	if (rtype == rpcrdma_noch) {
 
-		padlen = rpcrdma_inline_pullup(rqst,
-					       RPCRDMA_INLINE_PAD_VALUE(rqst));
-
-		if (padlen) {
-			headerp->rm_type = rdma_msgp;
-			headerp->rm_body.rm_padded.rm_align =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
-			headerp->rm_body.rm_padded.rm_thresh =
-				cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
-			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
-			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
-			hdrlen += 2 * sizeof(u32);	/* extra words in padhdr */
-			if (wtype != rpcrdma_noch) {
-				dprintk("RPC: %s: invalid chunk list\n",
-					__func__);
-				return -EIO;
-			}
-		} else {
-			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-			/* new length after pullup */
-			rpclen = rqst->rq_svec[0].iov_len;
-			/*
-			 * Currently we try to not actually use read inline.
-			 * Reply chunks have the desirable property that
-			 * they land, packed, directly in the target buffers
-			 * without headers, so they require no fixup. The
-			 * additional RDMA Write op sends the same amount
-			 * of data, streams on-the-wire and adds no overhead
-			 * on receive. Therefore, we request a reply chunk
-			 * for non-writes wherever feasible and efficient.
-			 */
-			if (wtype == rpcrdma_noch)
-				wtype = rpcrdma_replych;
-		}
+		rpcrdma_inline_pullup(rqst);
+
+		headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+		headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+		/* new length after pullup */
+		rpclen = rqst->rq_svec[0].iov_len;
+		/* Currently we try to not actually use read inline.
+		 * Reply chunks have the desirable property that
+		 * they land, packed, directly in the target buffers
+		 * without headers, so they require no fixup. The
+		 * additional RDMA Write op sends the same amount
+		 * of data, streams on-the-wire and adds no overhead
+		 * on receive. Therefore, we request a reply chunk
+		 * for non-writes wherever feasible and efficient.
+		 */
+		if (wtype == rpcrdma_noch)
+			wtype = rpcrdma_replych;
 	}
 
 	if (rtype != rpcrdma_noch) {
@@ -518,9 +489,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	if (hdrlen < 0)
 		return hdrlen;
 
-	dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+	dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		__func__, transfertypes[wtype], hdrlen, rpclen,
 		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
 	/*
@@ -539,21 +510,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
 
 	req->rl_niovs = 2;
-
-	if (padlen) {
-		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
-		req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
-		req->rl_send_iov[2].length = padlen;
-		req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
-		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
-		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
-		req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
-		req->rl_niovs = 4;
-	}
-
 	return 0;
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 8516d9894599..b4d4f6300fbc 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -605,6 +605,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	struct ib_cq_init_attr cq_attr = {};
 	int rc, err;
 
+	if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+		dprintk("RPC: %s: insufficient sge's available\n",
+			__func__);
+		return -ENOMEM;
+	}
+
 	/* check provider's send/recv wr limits */
 	if (cdata->max_requests > devattr->max_qp_wr)
 		cdata->max_requests = devattr->max_qp_wr;
@@ -617,23 +623,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	if (rc)
 		return rc;
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
-	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	ep->rep_attr.qp_type = IB_QPT_RC;
 	ep->rep_attr.port_num = ~0;
 
-	if (cdata->padding) {
-		ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
-						      GFP_KERNEL);
-		if (IS_ERR(ep->rep_padbuf)) {
-			rc = PTR_ERR(ep->rep_padbuf);
-			goto out0;
-		}
-	} else
-		ep->rep_padbuf = NULL;
-
 	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
 		"iovs: send %d recv %d\n",
 		__func__,
@@ -716,8 +712,6 @@ out2:
 		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
 			__func__, err);
 out1:
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-out0:
 	if (ia->ri_dma_mr)
 		ib_dereg_mr(ia->ri_dma_mr);
 	return rc;
@@ -746,8 +740,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 		ia->ri_id->qp = NULL;
 	}
 
-	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
 	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
 	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
 	if (rc)
@@ -1279,9 +1271,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 		struct rpcrdma_ep *ep,
 		struct rpcrdma_req *req)
 {
+	struct ib_device *device = ia->ri_device;
 	struct ib_send_wr send_wr, *send_wr_fail;
 	struct rpcrdma_rep *rep = req->rl_reply;
-	int rc;
+	struct ib_sge *iov = req->rl_send_iov;
+	int i, rc;
 
 	if (rep) {
 		rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1292,22 +1286,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 
 	send_wr.next = NULL;
 	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
-	send_wr.sg_list = req->rl_send_iov;
+	send_wr.sg_list = iov;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
-	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
-		ib_dma_sync_single_for_device(ia->ri_device,
-					      req->rl_send_iov[3].addr,
-					      req->rl_send_iov[3].length,
-					      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[1].addr,
-				      req->rl_send_iov[1].length,
-				      DMA_TO_DEVICE);
-	ib_dma_sync_single_for_device(ia->ri_device,
-				      req->rl_send_iov[0].addr,
-				      req->rl_send_iov[0].length,
-				      DMA_TO_DEVICE);
+
+	for (i = 0; i < send_wr.num_sge; i++)
+		ib_dma_sync_single_for_device(device, iov[i].addr,
+					      iov[i].length, DMA_TO_DEVICE);
+	dprintk("RPC: %s: posting %d s/g entries\n",
+		__func__, send_wr.num_sge);
 
 	if (DECR_CQCOUNT(ep) > 0)
 		send_wr.send_flags = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 82190118b8d9..8422c09043b0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -88,7 +88,6 @@ struct rpcrdma_ep {
 	int			rep_connected;
 	struct ib_qp_init_attr	rep_attr;
 	wait_queue_head_t	rep_connect_wait;
-	struct rpcrdma_regbuf	*rep_padbuf;
 	struct rdma_conn_param	rep_remote_cma;
 	struct sockaddr_storage	rep_remote_addr;
 	struct delayed_work	rep_connect_worker;
@@ -255,16 +254,18 @@ struct rpcrdma_mr_seg {	/* chunk descriptors */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 
+#define RPCRDMA_MAX_IOVS	(2)
+
 struct rpcrdma_req {
-	unsigned int	rl_niovs;	/* 0, 2 or 4 */
-	unsigned int	rl_nchunks;	/* non-zero if chunks */
-	unsigned int	rl_connect_cookie;	/* retry detection */
-	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	unsigned int		rl_niovs;
+	unsigned int		rl_nchunks;
+	unsigned int		rl_connect_cookie;
+	struct rpcrdma_buffer	*rl_buffer;
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
-	struct ib_sge	rl_send_iov[4];	/* for active requests */
+	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
 	struct rpcrdma_regbuf	*rl_rdmabuf;
 	struct rpcrdma_regbuf	*rl_sendbuf;
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 
 static inline struct rpcrdma_req *