-rw-r--r--  fs/lockd/svc.c                              |   6
-rw-r--r--  fs/lockd/svclock.c                          |  18
-rw-r--r--  fs/nfs/callback.c                           |  26
-rw-r--r--  fs/nfsd/nfs3xdr.c                           |  23
-rw-r--r--  fs/nfsd/nfs4proc.c                          |   3
-rw-r--r--  fs/nfsd/nfs4state.c                         |  25
-rw-r--r--  fs/nfsd/nfs4xdr.c                           |  19
-rw-r--r--  fs/nfsd/nfsxdr.c                            |  13
-rw-r--r--  fs/nfsd/vfs.c                               |  24
-rw-r--r--  include/linux/sunrpc/rpc_rdma.h             |   3
-rw-r--r--  include/linux/sunrpc/svc.h                  |   4
-rw-r--r--  include/linux/sunrpc/svc_rdma.h             |  75
-rw-r--r--  include/uapi/linux/nfsd/cld.h               |  14
-rw-r--r--  net/sunrpc/Kconfig                          |   1
-rw-r--r--  net/sunrpc/svc.c                            | 134
-rw-r--r--  net/sunrpc/xprtrdma/Makefile                |   2
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma.c              |   8
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_backchannel.c  |  71
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_marshal.c      |  89
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c     |  79
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_rw.c           | 512
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c       | 978
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c    | 110
23 files changed, 1334 insertions(+), 903 deletions(-)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e7c8b9c76e48..5d481e8a1b5d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -132,6 +132,8 @@ lockd(void *vrqstp)
132{ 132{
133 int err = 0; 133 int err = 0;
134 struct svc_rqst *rqstp = vrqstp; 134 struct svc_rqst *rqstp = vrqstp;
135 struct net *net = &init_net;
136 struct lockd_net *ln = net_generic(net, lockd_net_id);
135 137
136 /* try_to_freeze() is called from svc_recv() */ 138 /* try_to_freeze() is called from svc_recv() */
137 set_freezable(); 139 set_freezable();
@@ -176,6 +178,8 @@ lockd(void *vrqstp)
176 if (nlmsvc_ops) 178 if (nlmsvc_ops)
177 nlmsvc_invalidate_all(); 179 nlmsvc_invalidate_all();
178 nlm_shutdown_hosts(); 180 nlm_shutdown_hosts();
181 cancel_delayed_work_sync(&ln->grace_period_end);
182 locks_end_grace(&ln->lockd_manager);
179 return 0; 183 return 0;
180} 184}
181 185
@@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
270 if (ln->nlmsvc_users) { 274 if (ln->nlmsvc_users) {
271 if (--ln->nlmsvc_users == 0) { 275 if (--ln->nlmsvc_users == 0) {
272 nlm_shutdown_hosts_net(net); 276 nlm_shutdown_hosts_net(net);
273 cancel_delayed_work_sync(&ln->grace_period_end);
274 locks_end_grace(&ln->lockd_manager);
275 svc_shutdown_net(serv, net); 277 svc_shutdown_net(serv, net);
276 dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); 278 dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
277 } 279 }
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 5581e020644b..3507c80d1d4b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -870,15 +870,15 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
870 if (!(block = nlmsvc_find_block(cookie))) 870 if (!(block = nlmsvc_find_block(cookie)))
871 return; 871 return;
872 872
873 if (block) { 873 if (status == nlm_lck_denied_grace_period) {
874 if (status == nlm_lck_denied_grace_period) { 874 /* Try again in a couple of seconds */
875 /* Try again in a couple of seconds */ 875 nlmsvc_insert_block(block, 10 * HZ);
876 nlmsvc_insert_block(block, 10 * HZ); 876 } else {
877 } else { 877 /*
878 /* Lock is now held by client, or has been rejected. 878 * Lock is now held by client, or has been rejected.
879 * In both cases, the block should be removed. */ 879 * In both cases, the block should be removed.
880 nlmsvc_unlink_block(block); 880 */
881 } 881 nlmsvc_unlink_block(block);
882 } 882 }
883 nlmsvc_release_block(block); 883 nlmsvc_release_block(block);
884} 884}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 773774531aff..73a1f928226c 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp)
76 76
77 set_freezable(); 77 set_freezable();
78 78
79 while (!kthread_should_stop()) { 79 while (!kthread_freezable_should_stop(NULL)) {
80
81 if (signal_pending(current))
82 flush_signals(current);
80 /* 83 /*
81 * Listen for a request on the socket 84 * Listen for a request on the socket
82 */ 85 */
@@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp)
85 continue; 88 continue;
86 svc_process(rqstp); 89 svc_process(rqstp);
87 } 90 }
91 svc_exit_thread(rqstp);
92 module_put_and_exit(0);
88 return 0; 93 return 0;
89} 94}
90 95
@@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp)
103 108
104 set_freezable(); 109 set_freezable();
105 110
106 while (!kthread_should_stop()) { 111 while (!kthread_freezable_should_stop(NULL)) {
107 if (try_to_freeze()) 112
108 continue; 113 if (signal_pending(current))
114 flush_signals(current);
109 115
110 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 116 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
111 spin_lock_bh(&serv->sv_cb_lock); 117 spin_lock_bh(&serv->sv_cb_lock);
@@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp)
121 error); 127 error);
122 } else { 128 } else {
123 spin_unlock_bh(&serv->sv_cb_lock); 129 spin_unlock_bh(&serv->sv_cb_lock);
124 schedule(); 130 if (!kthread_should_stop())
131 schedule();
125 finish_wait(&serv->sv_cb_waitq, &wq); 132 finish_wait(&serv->sv_cb_waitq, &wq);
126 } 133 }
127 flush_signals(current);
128 } 134 }
135 svc_exit_thread(rqstp);
136 module_put_and_exit(0);
129 return 0; 137 return 0;
130} 138}
131 139
@@ -221,14 +229,14 @@ err_bind:
221static struct svc_serv_ops nfs40_cb_sv_ops = { 229static struct svc_serv_ops nfs40_cb_sv_ops = {
222 .svo_function = nfs4_callback_svc, 230 .svo_function = nfs4_callback_svc,
223 .svo_enqueue_xprt = svc_xprt_do_enqueue, 231 .svo_enqueue_xprt = svc_xprt_do_enqueue,
224 .svo_setup = svc_set_num_threads, 232 .svo_setup = svc_set_num_threads_sync,
225 .svo_module = THIS_MODULE, 233 .svo_module = THIS_MODULE,
226}; 234};
227#if defined(CONFIG_NFS_V4_1) 235#if defined(CONFIG_NFS_V4_1)
228static struct svc_serv_ops nfs41_cb_sv_ops = { 236static struct svc_serv_ops nfs41_cb_sv_ops = {
229 .svo_function = nfs41_callback_svc, 237 .svo_function = nfs41_callback_svc,
230 .svo_enqueue_xprt = svc_xprt_do_enqueue, 238 .svo_enqueue_xprt = svc_xprt_do_enqueue,
231 .svo_setup = svc_set_num_threads, 239 .svo_setup = svc_set_num_threads_sync,
232 .svo_module = THIS_MODULE, 240 .svo_module = THIS_MODULE,
233}; 241};
234 242
@@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
280 printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", 288 printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
281 cb_info->users); 289 cb_info->users);
282 290
283 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); 291 serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
284 if (!serv) { 292 if (!serv) {
285 printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); 293 printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
286 return ERR_PTR(-ENOMEM); 294 return ERR_PTR(-ENOMEM);
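
Both callback loops above converge on the same shape: kthread_freezable_should_stop() replaces the kthread_should_stop()/try_to_freeze() pair, pending signals are simply flushed, and each thread now releases its own svc_rqst and module reference on the way out, which is what makes switching svo_setup to svc_set_num_threads_sync() safe. A minimal kernel-style sketch of that loop follows; the function name is made up, and the svc_recv()/MAX_SCHEDULE_TIMEOUT call is assumed from the surrounding callback code rather than shown in the hunks above.

/*
 * Illustrative only; condensed from the nfs4_callback_svc() shape above.
 */
static int example_callback_thread(void *vrqstp)
{
	struct svc_rqst *rqstp = vrqstp;
	int err;

	set_freezable();
	while (!kthread_freezable_should_stop(NULL)) {
		if (signal_pending(current))
			flush_signals(current);

		/* wait for and process one request */
		err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
		if (err == -EAGAIN || err == -EINTR)
			continue;
		svc_process(rqstp);
	}
	svc_exit_thread(rqstp);		/* thread frees its own rqstp ... */
	module_put_and_exit(0);		/* ... and drops the module ref */
	return 0;
}
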
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 452334694a5d..12feac6ee2fd 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -334,8 +334,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
334 if (!p) 334 if (!p)
335 return 0; 335 return 0;
336 p = xdr_decode_hyper(p, &args->offset); 336 p = xdr_decode_hyper(p, &args->offset);
337
338 args->count = ntohl(*p++); 337 args->count = ntohl(*p++);
338
339 if (!xdr_argsize_check(rqstp, p))
340 return 0;
341
339 len = min(args->count, max_blocksize); 342 len = min(args->count, max_blocksize);
340 343
341 /* set up the kvec */ 344 /* set up the kvec */
@@ -349,7 +352,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
349 v++; 352 v++;
350 } 353 }
351 args->vlen = v; 354 args->vlen = v;
352 return xdr_argsize_check(rqstp, p); 355 return 1;
353} 356}
354 357
355int 358int
@@ -541,9 +544,11 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
541 p = decode_fh(p, &args->fh); 544 p = decode_fh(p, &args->fh);
542 if (!p) 545 if (!p)
543 return 0; 546 return 0;
547 if (!xdr_argsize_check(rqstp, p))
548 return 0;
544 args->buffer = page_address(*(rqstp->rq_next_page++)); 549 args->buffer = page_address(*(rqstp->rq_next_page++));
545 550
546 return xdr_argsize_check(rqstp, p); 551 return 1;
547} 552}
548 553
549int 554int
@@ -569,10 +574,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
569 args->verf = p; p += 2; 574 args->verf = p; p += 2;
570 args->dircount = ~0; 575 args->dircount = ~0;
571 args->count = ntohl(*p++); 576 args->count = ntohl(*p++);
577
578 if (!xdr_argsize_check(rqstp, p))
579 return 0;
580
572 args->count = min_t(u32, args->count, PAGE_SIZE); 581 args->count = min_t(u32, args->count, PAGE_SIZE);
573 args->buffer = page_address(*(rqstp->rq_next_page++)); 582 args->buffer = page_address(*(rqstp->rq_next_page++));
574 583
575 return xdr_argsize_check(rqstp, p); 584 return 1;
576} 585}
577 586
578int 587int
@@ -590,6 +599,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
590 args->dircount = ntohl(*p++); 599 args->dircount = ntohl(*p++);
591 args->count = ntohl(*p++); 600 args->count = ntohl(*p++);
592 601
602 if (!xdr_argsize_check(rqstp, p))
603 return 0;
604
593 len = args->count = min(args->count, max_blocksize); 605 len = args->count = min(args->count, max_blocksize);
594 while (len > 0) { 606 while (len > 0) {
595 struct page *p = *(rqstp->rq_next_page++); 607 struct page *p = *(rqstp->rq_next_page++);
@@ -597,8 +609,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
597 args->buffer = page_address(p); 609 args->buffer = page_address(p);
598 len -= PAGE_SIZE; 610 len -= PAGE_SIZE;
599 } 611 }
600 612 return 1;
601 return xdr_argsize_check(rqstp, p);
602} 613}
603 614
604int 615int
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d86031b6ad79..c453a1998e00 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
1259 return NULL; 1259 return NULL;
1260 } 1260 }
1261 1261
1262 if (!(exp->ex_layout_types & (1 << layout_type))) { 1262 if (layout_type >= LAYOUT_TYPE_MAX ||
1263 !(exp->ex_layout_types & (1 << layout_type))) {
1263 dprintk("%s: layout type %d not supported\n", 1264 dprintk("%s: layout type %d not supported\n",
1264 __func__, layout_type); 1265 __func__, layout_type);
1265 return NULL; 1266 return NULL;
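
layout_type comes straight from the client's layout-operation arguments, so without the added bound, 1 << layout_type is undefined behaviour for values of 32 and up, and an unchecked value could later be used to index nfsd4_layout_ops[] (the nfs4xdr.c hunks below defer that table lookup until the value is actually needed, for the same reason). A small userspace model of the validate-before-use rule; the names and the LAYOUT_TYPE_MAX value here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define LAYOUT_TYPE_MAX 6	/* assumed size of the ops table, for illustration */

/* stand-in for the exp->ex_layout_types bitmap check in nfsd4_layout_verify() */
static bool layout_type_usable(unsigned int layout_type, unsigned int ex_layout_types)
{
	if (layout_type >= LAYOUT_TYPE_MAX)
		return false;	/* bounds both the shift and any table index */
	return ex_layout_types & (1u << layout_type);
}

int main(void)
{
	printf("%d\n", layout_type_usable(40, 0x2));	/* 0: rejected before 1 << 40 */
	printf("%d\n", layout_type_usable(1, 0x2));	/* 1: supported */
	return 0;
}
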
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e9ef50addddb..22002fb75a18 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
1912 target->cl_clientid.cl_id = source->cl_clientid.cl_id; 1912 target->cl_clientid.cl_id = source->cl_clientid.cl_id;
1913} 1913}
1914 1914
1915int strdup_if_nonnull(char **target, char *source)
1916{
1917 if (source) {
1918 *target = kstrdup(source, GFP_KERNEL);
1919 if (!*target)
1920 return -ENOMEM;
1921 } else
1922 *target = NULL;
1923 return 0;
1924}
1925
1926static int copy_cred(struct svc_cred *target, struct svc_cred *source) 1915static int copy_cred(struct svc_cred *target, struct svc_cred *source)
1927{ 1916{
1928 int ret; 1917 target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL);
1918 target->cr_raw_principal = kstrdup(source->cr_raw_principal,
1919 GFP_KERNEL);
1920 if ((source->cr_principal && ! target->cr_principal) ||
1921 (source->cr_raw_principal && ! target->cr_raw_principal))
1922 return -ENOMEM;
1929 1923
1930 ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
1931 if (ret)
1932 return ret;
1933 ret = strdup_if_nonnull(&target->cr_raw_principal,
1934 source->cr_raw_principal);
1935 if (ret)
1936 return ret;
1937 target->cr_flavor = source->cr_flavor; 1924 target->cr_flavor = source->cr_flavor;
1938 target->cr_uid = source->cr_uid; 1925 target->cr_uid = source->cr_uid;
1939 target->cr_gid = source->cr_gid; 1926 target->cr_gid = source->cr_gid;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 33017d652b1d..26780d53a6f9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2831,9 +2831,14 @@ out_acl:
2831 } 2831 }
2832#endif /* CONFIG_NFSD_PNFS */ 2832#endif /* CONFIG_NFSD_PNFS */
2833 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { 2833 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2834 status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, 2834 u32 supp[3];
2835 NFSD_SUPPATTR_EXCLCREAT_WORD1, 2835
2836 NFSD_SUPPATTR_EXCLCREAT_WORD2); 2836 memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
2837 supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
2838 supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
2839 supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
2840
2841 status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
2837 if (status) 2842 if (status)
2838 goto out; 2843 goto out;
2839 } 2844 }
@@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
4119 struct nfsd4_getdeviceinfo *gdev) 4124 struct nfsd4_getdeviceinfo *gdev)
4120{ 4125{
4121 struct xdr_stream *xdr = &resp->xdr; 4126 struct xdr_stream *xdr = &resp->xdr;
4122 const struct nfsd4_layout_ops *ops = 4127 const struct nfsd4_layout_ops *ops;
4123 nfsd4_layout_ops[gdev->gd_layout_type];
4124 u32 starting_len = xdr->buf->len, needed_len; 4128 u32 starting_len = xdr->buf->len, needed_len;
4125 __be32 *p; 4129 __be32 *p;
4126 4130
@@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
4137 4141
4138 /* If maxcount is 0 then just update notifications */ 4142 /* If maxcount is 0 then just update notifications */
4139 if (gdev->gd_maxcount != 0) { 4143 if (gdev->gd_maxcount != 0) {
4144 ops = nfsd4_layout_ops[gdev->gd_layout_type];
4140 nfserr = ops->encode_getdeviceinfo(xdr, gdev); 4145 nfserr = ops->encode_getdeviceinfo(xdr, gdev);
4141 if (nfserr) { 4146 if (nfserr) {
4142 /* 4147 /*
@@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
4189 struct nfsd4_layoutget *lgp) 4194 struct nfsd4_layoutget *lgp)
4190{ 4195{
4191 struct xdr_stream *xdr = &resp->xdr; 4196 struct xdr_stream *xdr = &resp->xdr;
4192 const struct nfsd4_layout_ops *ops = 4197 const struct nfsd4_layout_ops *ops;
4193 nfsd4_layout_ops[lgp->lg_layout_type];
4194 __be32 *p; 4198 __be32 *p;
4195 4199
4196 dprintk("%s: err %d\n", __func__, nfserr); 4200 dprintk("%s: err %d\n", __func__, nfserr);
@@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
4213 *p++ = cpu_to_be32(lgp->lg_seg.iomode); 4217 *p++ = cpu_to_be32(lgp->lg_seg.iomode);
4214 *p++ = cpu_to_be32(lgp->lg_layout_type); 4218 *p++ = cpu_to_be32(lgp->lg_layout_type);
4215 4219
4220 ops = nfsd4_layout_ops[lgp->lg_layout_type];
4216 nfserr = ops->encode_layoutget(xdr, lgp); 4221 nfserr = ops->encode_layoutget(xdr, lgp);
4217out: 4222out:
4218 kfree(lgp->lg_content); 4223 kfree(lgp->lg_content);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index de07ff625777..6a4947a3f4fa 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -257,6 +257,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
257 len = args->count = ntohl(*p++); 257 len = args->count = ntohl(*p++);
258 p++; /* totalcount - unused */ 258 p++; /* totalcount - unused */
259 259
260 if (!xdr_argsize_check(rqstp, p))
261 return 0;
262
260 len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); 263 len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
261 264
262 /* set up somewhere to store response. 265 /* set up somewhere to store response.
@@ -272,7 +275,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
272 v++; 275 v++;
273 } 276 }
274 args->vlen = v; 277 args->vlen = v;
275 return xdr_argsize_check(rqstp, p); 278 return 1;
276} 279}
277 280
278int 281int
@@ -362,9 +365,11 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
362 p = decode_fh(p, &args->fh); 365 p = decode_fh(p, &args->fh);
363 if (!p) 366 if (!p)
364 return 0; 367 return 0;
368 if (!xdr_argsize_check(rqstp, p))
369 return 0;
365 args->buffer = page_address(*(rqstp->rq_next_page++)); 370 args->buffer = page_address(*(rqstp->rq_next_page++));
366 371
367 return xdr_argsize_check(rqstp, p); 372 return 1;
368} 373}
369 374
370int 375int
@@ -402,9 +407,11 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
402 args->cookie = ntohl(*p++); 407 args->cookie = ntohl(*p++);
403 args->count = ntohl(*p++); 408 args->count = ntohl(*p++);
404 args->count = min_t(u32, args->count, PAGE_SIZE); 409 args->count = min_t(u32, args->count, PAGE_SIZE);
410 if (!xdr_argsize_check(rqstp, p))
411 return 0;
405 args->buffer = page_address(*(rqstp->rq_next_page++)); 412 args->buffer = page_address(*(rqstp->rq_next_page++));
406 413
407 return xdr_argsize_check(rqstp, p); 414 return 1;
408} 415}
409 416
410/* 417/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9aaf6ca77569..2be32955d7f2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
94 err = follow_down(&path); 94 err = follow_down(&path);
95 if (err < 0) 95 if (err < 0)
96 goto out; 96 goto out;
97 if (path.mnt == exp->ex_path.mnt && path.dentry == dentry &&
98 nfsd_mountpoint(dentry, exp) == 2) {
99 /* This is only a mountpoint in some other namespace */
100 path_put(&path);
101 goto out;
102 }
97 103
98 exp2 = rqst_exp_get_by_name(rqstp, &path); 104 exp2 = rqst_exp_get_by_name(rqstp, &path);
99 if (IS_ERR(exp2)) { 105 if (IS_ERR(exp2)) {
@@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st
167/* 173/*
168 * For nfsd purposes, we treat V4ROOT exports as though there was an 174 * For nfsd purposes, we treat V4ROOT exports as though there was an
169 * export at *every* directory. 175 * export at *every* directory.
176 * We return:
177 * '1' if this dentry *must* be an export point,
178 * '2' if it might be, if there is really a mount here, and
179 * '0' if there is no chance of an export point here.
170 */ 180 */
171int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) 181int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
172{ 182{
173 if (d_mountpoint(dentry)) 183 if (!d_inode(dentry))
184 return 0;
185 if (exp->ex_flags & NFSEXP_V4ROOT)
174 return 1; 186 return 1;
175 if (nfsd4_is_junction(dentry)) 187 if (nfsd4_is_junction(dentry))
176 return 1; 188 return 1;
177 if (!(exp->ex_flags & NFSEXP_V4ROOT)) 189 if (d_mountpoint(dentry))
178 return 0; 190 /*
179 return d_inode(dentry) != NULL; 191 * Might only be a mountpoint in a different namespace,
192 * but we need to check.
193 */
194 return 2;
195 return 0;
180} 196}
181 197
182__be32 198__be32
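
The nfsd_cross_mnt() hunk above is the consumer of the new tri-state: after follow_down(), if the path did not actually move and nfsd_mountpoint() only answered "maybe" (2), the directory is mounted on solely in some other mount namespace, so nfsd stays on the original dentry instead of crossing. A small userspace model of that decision, with made-up names:

#include <stdbool.h>
#include <stdio.h>

/* mirrors the 0/1/2 return values documented above */
enum mountpoint_hint { MP_NO = 0, MP_YES = 1, MP_MAYBE = 2 };

/*
 * Callers only reach this point when hint is MP_YES or MP_MAYBE;
 * follow_down_moved says whether follow_down() returned a different
 * (mnt, dentry) pair than the one it was given.
 */
static bool should_cross_mount(enum mountpoint_hint hint, bool follow_down_moved)
{
	if (hint == MP_MAYBE && !follow_down_moved)
		return false;	/* a mountpoint only in another namespace */
	return true;		/* a real export boundary in this namespace */
}

int main(void)
{
	printf("%d\n", should_cross_mount(MP_MAYBE, false));	/* 0: stay put */
	printf("%d\n", should_cross_mount(MP_MAYBE, true));	/* 1: cross */
	return 0;
}
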
diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
index 245fc59b7324..b7e85b341a54 100644
--- a/include/linux/sunrpc/rpc_rdma.h
+++ b/include/linux/sunrpc/rpc_rdma.h
@@ -143,6 +143,9 @@ enum rpcrdma_proc {
143#define rdma_done cpu_to_be32(RDMA_DONE) 143#define rdma_done cpu_to_be32(RDMA_DONE)
144#define rdma_error cpu_to_be32(RDMA_ERROR) 144#define rdma_error cpu_to_be32(RDMA_ERROR)
145 145
146#define err_vers cpu_to_be32(ERR_VERS)
147#define err_chunk cpu_to_be32(ERR_CHUNK)
148
146/* 149/*
147 * Private extension to RPC-over-RDMA Version One. 150 * Private extension to RPC-over-RDMA Version One.
148 * Message passed during RDMA-CM connection set-up. 151 * Message passed during RDMA-CM connection set-up.
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e770abeed32d..94631026f79c 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -336,8 +336,7 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p)
336{ 336{
337 char *cp = (char *)p; 337 char *cp = (char *)p;
338 struct kvec *vec = &rqstp->rq_arg.head[0]; 338 struct kvec *vec = &rqstp->rq_arg.head[0];
339 return cp >= (char*)vec->iov_base 339 return cp == (char *)vec->iov_base + vec->iov_len;
340 && cp <= (char*)vec->iov_base + vec->iov_len;
341} 340}
342 341
343static inline int 342static inline int
@@ -474,6 +473,7 @@ void svc_pool_map_put(void);
474struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, 473struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
475 struct svc_serv_ops *); 474 struct svc_serv_ops *);
476int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); 475int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
476int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int);
477int svc_pool_stats_open(struct svc_serv *serv, struct file *file); 477int svc_pool_stats_open(struct svc_serv *serv, struct file *file);
478void svc_destroy(struct svc_serv *); 478void svc_destroy(struct svc_serv *);
479void svc_shutdown_net(struct svc_serv *, struct net *); 479void svc_shutdown_net(struct svc_serv *, struct net *);
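
The tightened xdr_argsize_check() above is what the nfsxdr.c/nfs3xdr.c hunks earlier in this patch rely on: the old test accepted any pointer still inside the head iovec, so a decoder could trust a client-supplied count (and start walking rqstp->rq_next_page) before anything proved the call was even long enough; the new test insists the decoder consumed exactly what was received, and the READ/READLINK/READDIR decoders now perform it before touching rq_next_page. A kernel-style sketch of that decoder shape, condensed from the NFSv3 READ hunk above and illustrative only:

/*
 * Illustrative only: validate the size of the decoded arguments
 * before the count is used to size anything or to advance
 * rqstp->rq_next_page.
 */
int example_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
			    struct nfsd3_readargs *args)
{
	p = decode_fh(p, &args->fh);
	if (!p)
		return 0;
	p = xdr_decode_hyper(p, &args->offset);
	args->count = ntohl(*p++);

	if (!xdr_argsize_check(rqstp, p))
		return 0;	/* short or over-long call: reject it now */

	/* only now clamp args->count and set up the reply kvec/pages */
	return 1;
}
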
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index b105f73e3ca2..f3787d800ba4 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -48,6 +48,12 @@
48#include <rdma/rdma_cm.h> 48#include <rdma/rdma_cm.h>
49#define SVCRDMA_DEBUG 49#define SVCRDMA_DEBUG
50 50
51/* Default and maximum inline threshold sizes */
52enum {
53 RPCRDMA_DEF_INLINE_THRESH = 4096,
54 RPCRDMA_MAX_INLINE_THRESH = 65536
55};
56
51/* RPC/RDMA parameters and stats */ 57/* RPC/RDMA parameters and stats */
52extern unsigned int svcrdma_ord; 58extern unsigned int svcrdma_ord;
53extern unsigned int svcrdma_max_requests; 59extern unsigned int svcrdma_max_requests;
@@ -85,27 +91,11 @@ struct svc_rdma_op_ctxt {
85 enum dma_data_direction direction; 91 enum dma_data_direction direction;
86 int count; 92 int count;
87 unsigned int mapped_sges; 93 unsigned int mapped_sges;
88 struct ib_sge sge[RPCSVC_MAXPAGES]; 94 struct ib_send_wr send_wr;
95 struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE];
89 struct page *pages[RPCSVC_MAXPAGES]; 96 struct page *pages[RPCSVC_MAXPAGES];
90}; 97};
91 98
92/*
93 * NFS_ requests are mapped on the client side by the chunk lists in
94 * the RPCRDMA header. During the fetching of the RPC from the client
95 * and the writing of the reply to the client, the memory in the
96 * client and the memory in the server must be mapped as contiguous
97 * vaddr/len for access by the hardware. These data strucures keep
98 * these mappings.
99 *
100 * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the
101 * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the
102 * 'ch' field maps the read-list of the RPCRDMA header to the 'sge'
103 * mapping of the reply.
104 */
105struct svc_rdma_chunk_sge {
106 int start; /* sge no for this chunk */
107 int count; /* sge count for this chunk */
108};
109struct svc_rdma_fastreg_mr { 99struct svc_rdma_fastreg_mr {
110 struct ib_mr *mr; 100 struct ib_mr *mr;
111 struct scatterlist *sg; 101 struct scatterlist *sg;
@@ -114,15 +104,7 @@ struct svc_rdma_fastreg_mr {
114 enum dma_data_direction direction; 104 enum dma_data_direction direction;
115 struct list_head frmr_list; 105 struct list_head frmr_list;
116}; 106};
117struct svc_rdma_req_map { 107
118 struct list_head free;
119 unsigned long count;
120 union {
121 struct kvec sge[RPCSVC_MAXPAGES];
122 struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES];
123 unsigned long lkey[RPCSVC_MAXPAGES];
124 };
125};
126#define RDMACTXT_F_LAST_CTXT 2 108#define RDMACTXT_F_LAST_CTXT 2
127 109
128#define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ 110#define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */
@@ -144,14 +126,15 @@ struct svcxprt_rdma {
144 u32 sc_max_requests; /* Max requests */ 126 u32 sc_max_requests; /* Max requests */
145 u32 sc_max_bc_requests;/* Backward credits */ 127 u32 sc_max_bc_requests;/* Backward credits */
146 int sc_max_req_size; /* Size of each RQ WR buf */ 128 int sc_max_req_size; /* Size of each RQ WR buf */
129 u8 sc_port_num;
147 130
148 struct ib_pd *sc_pd; 131 struct ib_pd *sc_pd;
149 132
150 spinlock_t sc_ctxt_lock; 133 spinlock_t sc_ctxt_lock;
151 struct list_head sc_ctxts; 134 struct list_head sc_ctxts;
152 int sc_ctxt_used; 135 int sc_ctxt_used;
153 spinlock_t sc_map_lock; 136 spinlock_t sc_rw_ctxt_lock;
154 struct list_head sc_maps; 137 struct list_head sc_rw_ctxts;
155 138
156 struct list_head sc_rq_dto_q; 139 struct list_head sc_rq_dto_q;
157 spinlock_t sc_rq_dto_lock; 140 spinlock_t sc_rq_dto_lock;
@@ -181,9 +164,7 @@ struct svcxprt_rdma {
181/* The default ORD value is based on two outstanding full-size writes with a 164/* The default ORD value is based on two outstanding full-size writes with a
182 * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ 165 * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */
183#define RPCRDMA_ORD (64/4) 166#define RPCRDMA_ORD (64/4)
184#define RPCRDMA_SQ_DEPTH_MULT 8
185#define RPCRDMA_MAX_REQUESTS 32 167#define RPCRDMA_MAX_REQUESTS 32
186#define RPCRDMA_MAX_REQ_SIZE 4096
187 168
188/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our 169/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our
189 * current NFSv4.1 implementation supports one backchannel slot. 170 * current NFSv4.1 implementation supports one backchannel slot.
@@ -201,19 +182,11 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
201 182
202/* svc_rdma_backchannel.c */ 183/* svc_rdma_backchannel.c */
203extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, 184extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
204 struct rpcrdma_msg *rmsgp, 185 __be32 *rdma_resp,
205 struct xdr_buf *rcvbuf); 186 struct xdr_buf *rcvbuf);
206 187
207/* svc_rdma_marshal.c */ 188/* svc_rdma_marshal.c */
208extern int svc_rdma_xdr_decode_req(struct xdr_buf *); 189extern int svc_rdma_xdr_decode_req(struct xdr_buf *);
209extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
210 struct rpcrdma_msg *,
211 enum rpcrdma_errcode, __be32 *);
212extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
213extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
214extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
215 __be32, __be64, u32);
216extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
217 190
218/* svc_rdma_recvfrom.c */ 191/* svc_rdma_recvfrom.c */
219extern int svc_rdma_recvfrom(struct svc_rqst *); 192extern int svc_rdma_recvfrom(struct svc_rqst *);
@@ -224,16 +197,25 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
224 struct svc_rdma_op_ctxt *, int *, u32 *, 197 struct svc_rdma_op_ctxt *, int *, u32 *,
225 u32, u32, u64, bool); 198 u32, u32, u64, bool);
226 199
200/* svc_rdma_rw.c */
201extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
202extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
203 __be32 *wr_ch, struct xdr_buf *xdr);
204extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
205 __be32 *rp_ch, bool writelist,
206 struct xdr_buf *xdr);
207
227/* svc_rdma_sendto.c */ 208/* svc_rdma_sendto.c */
228extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, 209extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
229 struct svc_rdma_req_map *, bool); 210 struct svc_rdma_op_ctxt *ctxt,
211 __be32 *rdma_resp, unsigned int len);
212extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
213 struct svc_rdma_op_ctxt *ctxt,
214 int num_sge, u32 inv_rkey);
230extern int svc_rdma_sendto(struct svc_rqst *); 215extern int svc_rdma_sendto(struct svc_rqst *);
231extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
232 int);
233 216
234/* svc_rdma_transport.c */ 217/* svc_rdma_transport.c */
235extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); 218extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *);
236extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *);
237extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); 219extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *);
238extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); 220extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *);
239extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); 221extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *);
@@ -244,9 +226,6 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
244extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); 226extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
245extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); 227extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
246extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); 228extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
247extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *);
248extern void svc_rdma_put_req_map(struct svcxprt_rdma *,
249 struct svc_rdma_req_map *);
250extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); 229extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
251extern void svc_rdma_put_frmr(struct svcxprt_rdma *, 230extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
252 struct svc_rdma_fastreg_mr *); 231 struct svc_rdma_fastreg_mr *);
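
The new sge[] bound in svc_rdma_op_ctxt is worth spelling out. Assuming a 4 KB PAGE_SIZE, 1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE = 1 + 65536 / 4096 = 17 entries, presumably one SGE covering the mapped transport header plus up to sixteen pages of inline RPC message, instead of the RPCSVC_MAXPAGES-sized array it replaces. On a 64 KB-page system the same formula collapses to 1 + 1 = 2.
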
diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
index f14a9ab06f1f..ec260274be0c 100644
--- a/include/uapi/linux/nfsd/cld.h
+++ b/include/uapi/linux/nfsd/cld.h
@@ -22,6 +22,8 @@
22#ifndef _NFSD_CLD_H 22#ifndef _NFSD_CLD_H
23#define _NFSD_CLD_H 23#define _NFSD_CLD_H
24 24
25#include <linux/types.h>
26
25/* latest upcall version available */ 27/* latest upcall version available */
26#define CLD_UPCALL_VERSION 1 28#define CLD_UPCALL_VERSION 1
27 29
@@ -37,18 +39,18 @@ enum cld_command {
37 39
38/* representation of long-form NFSv4 client ID */ 40/* representation of long-form NFSv4 client ID */
39struct cld_name { 41struct cld_name {
40 uint16_t cn_len; /* length of cm_id */ 42 __u16 cn_len; /* length of cm_id */
41 unsigned char cn_id[NFS4_OPAQUE_LIMIT]; /* client-provided */ 43 unsigned char cn_id[NFS4_OPAQUE_LIMIT]; /* client-provided */
42} __attribute__((packed)); 44} __attribute__((packed));
43 45
44/* message struct for communication with userspace */ 46/* message struct for communication with userspace */
45struct cld_msg { 47struct cld_msg {
46 uint8_t cm_vers; /* upcall version */ 48 __u8 cm_vers; /* upcall version */
47 uint8_t cm_cmd; /* upcall command */ 49 __u8 cm_cmd; /* upcall command */
48 int16_t cm_status; /* return code */ 50 __s16 cm_status; /* return code */
49 uint32_t cm_xid; /* transaction id */ 51 __u32 cm_xid; /* transaction id */
50 union { 52 union {
51 int64_t cm_gracetime; /* grace period start time */ 53 __s64 cm_gracetime; /* grace period start time */
52 struct cld_name cm_name; 54 struct cld_name cm_name;
53 } __attribute__((packed)) cm_u; 55 } __attribute__((packed)) cm_u;
54} __attribute__((packed)); 56} __attribute__((packed));
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 04ce2c0b660e..ac09ca803296 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA
52 tristate "RPC-over-RDMA transport" 52 tristate "RPC-over-RDMA transport"
53 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS 53 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
54 default SUNRPC && INFINIBAND 54 default SUNRPC && INFINIBAND
55 select SG_POOL
55 help 56 help
56 This option allows the NFS client and server to use RDMA 57 This option allows the NFS client and server to use RDMA
57 transports (InfiniBand, iWARP, or RoCE). 58 transports (InfiniBand, iWARP, or RoCE).
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a08aeb56b8e4..bc0f5a0ecbdc 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -702,59 +702,32 @@ found_pool:
702 return task; 702 return task;
703} 703}
704 704
705/* 705/* create new threads */
706 * Create or destroy enough new threads to make the number 706static int
707 * of threads the given number. If `pool' is non-NULL, applies 707svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
708 * only to threads in that pool, otherwise round-robins between
709 * all pools. Caller must ensure that mutual exclusion between this and
710 * server startup or shutdown.
711 *
712 * Destroying threads relies on the service threads filling in
713 * rqstp->rq_task, which only the nfs ones do. Assumes the serv
714 * has been created using svc_create_pooled().
715 *
716 * Based on code that used to be in nfsd_svc() but tweaked
717 * to be pool-aware.
718 */
719int
720svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
721{ 708{
722 struct svc_rqst *rqstp; 709 struct svc_rqst *rqstp;
723 struct task_struct *task; 710 struct task_struct *task;
724 struct svc_pool *chosen_pool; 711 struct svc_pool *chosen_pool;
725 int error = 0;
726 unsigned int state = serv->sv_nrthreads-1; 712 unsigned int state = serv->sv_nrthreads-1;
727 int node; 713 int node;
728 714
729 if (pool == NULL) { 715 do {
730 /* The -1 assumes caller has done a svc_get() */
731 nrservs -= (serv->sv_nrthreads-1);
732 } else {
733 spin_lock_bh(&pool->sp_lock);
734 nrservs -= pool->sp_nrthreads;
735 spin_unlock_bh(&pool->sp_lock);
736 }
737
738 /* create new threads */
739 while (nrservs > 0) {
740 nrservs--; 716 nrservs--;
741 chosen_pool = choose_pool(serv, pool, &state); 717 chosen_pool = choose_pool(serv, pool, &state);
742 718
743 node = svc_pool_map_get_node(chosen_pool->sp_id); 719 node = svc_pool_map_get_node(chosen_pool->sp_id);
744 rqstp = svc_prepare_thread(serv, chosen_pool, node); 720 rqstp = svc_prepare_thread(serv, chosen_pool, node);
745 if (IS_ERR(rqstp)) { 721 if (IS_ERR(rqstp))
746 error = PTR_ERR(rqstp); 722 return PTR_ERR(rqstp);
747 break;
748 }
749 723
750 __module_get(serv->sv_ops->svo_module); 724 __module_get(serv->sv_ops->svo_module);
751 task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, 725 task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
752 node, "%s", serv->sv_name); 726 node, "%s", serv->sv_name);
753 if (IS_ERR(task)) { 727 if (IS_ERR(task)) {
754 error = PTR_ERR(task);
755 module_put(serv->sv_ops->svo_module); 728 module_put(serv->sv_ops->svo_module);
756 svc_exit_thread(rqstp); 729 svc_exit_thread(rqstp);
757 break; 730 return PTR_ERR(task);
758 } 731 }
759 732
760 rqstp->rq_task = task; 733 rqstp->rq_task = task;
@@ -763,18 +736,103 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
763 736
764 svc_sock_update_bufs(serv); 737 svc_sock_update_bufs(serv);
765 wake_up_process(task); 738 wake_up_process(task);
766 } 739 } while (nrservs > 0);
740
741 return 0;
742}
743
744
745/* destroy old threads */
746static int
747svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
748{
749 struct task_struct *task;
750 unsigned int state = serv->sv_nrthreads-1;
751
767 /* destroy old threads */ 752 /* destroy old threads */
768 while (nrservs < 0 && 753 do {
769 (task = choose_victim(serv, pool, &state)) != NULL) { 754 task = choose_victim(serv, pool, &state);
755 if (task == NULL)
756 break;
770 send_sig(SIGINT, task, 1); 757 send_sig(SIGINT, task, 1);
771 nrservs++; 758 nrservs++;
759 } while (nrservs < 0);
760
761 return 0;
762}
763
764/*
765 * Create or destroy enough new threads to make the number
766 * of threads the given number. If `pool' is non-NULL, applies
767 * only to threads in that pool, otherwise round-robins between
768 * all pools. Caller must ensure that mutual exclusion between this and
769 * server startup or shutdown.
770 *
771 * Destroying threads relies on the service threads filling in
772 * rqstp->rq_task, which only the nfs ones do. Assumes the serv
773 * has been created using svc_create_pooled().
774 *
775 * Based on code that used to be in nfsd_svc() but tweaked
776 * to be pool-aware.
777 */
778int
779svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
780{
781 if (pool == NULL) {
782 /* The -1 assumes caller has done a svc_get() */
783 nrservs -= (serv->sv_nrthreads-1);
784 } else {
785 spin_lock_bh(&pool->sp_lock);
786 nrservs -= pool->sp_nrthreads;
787 spin_unlock_bh(&pool->sp_lock);
772 } 788 }
773 789
774 return error; 790 if (nrservs > 0)
791 return svc_start_kthreads(serv, pool, nrservs);
792 if (nrservs < 0)
793 return svc_signal_kthreads(serv, pool, nrservs);
794 return 0;
775} 795}
776EXPORT_SYMBOL_GPL(svc_set_num_threads); 796EXPORT_SYMBOL_GPL(svc_set_num_threads);
777 797
798/* destroy old threads */
799static int
800svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
801{
802 struct task_struct *task;
803 unsigned int state = serv->sv_nrthreads-1;
804
805 /* destroy old threads */
806 do {
807 task = choose_victim(serv, pool, &state);
808 if (task == NULL)
809 break;
810 kthread_stop(task);
811 nrservs++;
812 } while (nrservs < 0);
813 return 0;
814}
815
816int
817svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
818{
819 if (pool == NULL) {
820 /* The -1 assumes caller has done a svc_get() */
821 nrservs -= (serv->sv_nrthreads-1);
822 } else {
823 spin_lock_bh(&pool->sp_lock);
824 nrservs -= pool->sp_nrthreads;
825 spin_unlock_bh(&pool->sp_lock);
826 }
827
828 if (nrservs > 0)
829 return svc_start_kthreads(serv, pool, nrservs);
830 if (nrservs < 0)
831 return svc_stop_kthreads(serv, pool, nrservs);
832 return 0;
833}
834EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
835
778/* 836/*
779 * Called from a server thread as it's exiting. Caller must hold the "service 837 * Called from a server thread as it's exiting. Caller must hold the "service
780 * mutex" for the service. 838 * mutex" for the service.
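
The difference between the two entry points is the shutdown contract: svc_set_num_threads() still sends SIGINT and returns without waiting, while svc_set_num_threads_sync() uses kthread_stop(), so every victim has finished svc_exit_thread() by the time it returns, which is what the NFSv4 callback code earlier in this patch needs before it destroys the serv. A hedged sketch of a caller wiring this up through svc_serv_ops; example_callback_thread is the hypothetical thread function sketched after the fs/nfs/callback.c hunks, and example_shutdown is likewise made up:

/* Illustrative only; field names match struct svc_serv_ops as used above. */
static struct svc_serv_ops example_sv_ops = {
	.svo_function		= example_callback_thread,
	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
	.svo_setup		= svc_set_num_threads_sync,
	.svo_module		= THIS_MODULE,
};

static void example_shutdown(struct svc_serv *serv)
{
	/*
	 * Shrinking to zero via the sync variant returns only after
	 * every thread has run svc_exit_thread(), so svc_destroy()
	 * cannot race with a thread that is still exiting.
	 */
	serv->sv_ops->svo_setup(serv, NULL, 0);
	svc_destroy(serv);
}
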
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index ef19fa42c50f..c1ae8142ab73 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
4 fmr_ops.o frwr_ops.o \ 4 fmr_ops.o frwr_ops.o \
5 svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ 5 svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
6 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ 6 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
7 module.o 7 svc_rdma_rw.o module.o
8rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o 8rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index c846ca9f1eba..a4a8f6989ee7 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
58unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; 58unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
59static unsigned int min_max_requests = 4; 59static unsigned int min_max_requests = 4;
60static unsigned int max_max_requests = 16384; 60static unsigned int max_max_requests = 16384;
61unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; 61unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
62static unsigned int min_max_inline = 4096; 62static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
63static unsigned int max_max_inline = 65536; 63static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
64 64
65atomic_t rdma_stat_recv; 65atomic_t rdma_stat_recv;
66atomic_t rdma_stat_read; 66atomic_t rdma_stat_read;
@@ -247,8 +247,6 @@ int svc_rdma_init(void)
247 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); 247 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
248 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); 248 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
249 dprintk("\tmax_requests : %u\n", svcrdma_max_requests); 249 dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
250 dprintk("\tsq_depth : %u\n",
251 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
252 dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); 250 dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
253 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); 251 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
254 252
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index ff1df40f0d26..c676ed0efb5a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -12,7 +12,17 @@
12 12
13#undef SVCRDMA_BACKCHANNEL_DEBUG 13#undef SVCRDMA_BACKCHANNEL_DEBUG
14 14
15int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, 15/**
16 * svc_rdma_handle_bc_reply - Process incoming backchannel reply
17 * @xprt: controlling backchannel transport
18 * @rdma_resp: pointer to incoming transport header
19 * @rcvbuf: XDR buffer into which to decode the reply
20 *
21 * Returns:
22 * %0 if @rcvbuf is filled in, xprt_complete_rqst called,
23 * %-EAGAIN if server should call ->recvfrom again.
24 */
25int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
16 struct xdr_buf *rcvbuf) 26 struct xdr_buf *rcvbuf)
17{ 27{
18 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 28 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
27 37
28 p = (__be32 *)src->iov_base; 38 p = (__be32 *)src->iov_base;
29 len = src->iov_len; 39 len = src->iov_len;
30 xid = rmsgp->rm_xid; 40 xid = *rdma_resp;
31 41
32#ifdef SVCRDMA_BACKCHANNEL_DEBUG 42#ifdef SVCRDMA_BACKCHANNEL_DEBUG
33 pr_info("%s: xid=%08x, length=%zu\n", 43 pr_info("%s: xid=%08x, length=%zu\n",
34 __func__, be32_to_cpu(xid), len); 44 __func__, be32_to_cpu(xid), len);
35 pr_info("%s: RPC/RDMA: %*ph\n", 45 pr_info("%s: RPC/RDMA: %*ph\n",
36 __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); 46 __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
37 pr_info("%s: RPC: %*ph\n", 47 pr_info("%s: RPC: %*ph\n",
38 __func__, (int)len, p); 48 __func__, (int)len, p);
39#endif 49#endif
@@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
53 goto out_unlock; 63 goto out_unlock;
54 memcpy(dst->iov_base, p, len); 64 memcpy(dst->iov_base, p, len);
55 65
56 credits = be32_to_cpu(rmsgp->rm_credit); 66 credits = be32_to_cpup(rdma_resp + 2);
57 if (credits == 0) 67 if (credits == 0)
58 credits = 1; /* don't deadlock */ 68 credits = 1; /* don't deadlock */
59 else if (credits > r_xprt->rx_buf.rb_bc_max_requests) 69 else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
@@ -90,9 +100,9 @@ out_notfound:
90 * Caller holds the connection's mutex and has already marshaled 100 * Caller holds the connection's mutex and has already marshaled
91 * the RPC/RDMA request. 101 * the RPC/RDMA request.
92 * 102 *
93 * This is similar to svc_rdma_reply, but takes an rpc_rqst 103 * This is similar to svc_rdma_send_reply_msg, but takes a struct
94 * instead, does not support chunks, and avoids blocking memory 104 * rpc_rqst instead, does not support chunks, and avoids blocking
95 * allocation. 105 * memory allocation.
96 * 106 *
97 * XXX: There is still an opportunity to block in svc_rdma_send() 107 * XXX: There is still an opportunity to block in svc_rdma_send()
98 * if there are no SQ entries to post the Send. This may occur if 108 * if there are no SQ entries to post the Send. This may occur if
@@ -101,59 +111,36 @@ out_notfound:
101static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, 111static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
102 struct rpc_rqst *rqst) 112 struct rpc_rqst *rqst)
103{ 113{
104 struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
105 struct svc_rdma_op_ctxt *ctxt; 114 struct svc_rdma_op_ctxt *ctxt;
106 struct svc_rdma_req_map *vec;
107 struct ib_send_wr send_wr;
108 int ret; 115 int ret;
109 116
110 vec = svc_rdma_get_req_map(rdma); 117 ctxt = svc_rdma_get_context(rdma);
111 ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false); 118
112 if (ret) 119 /* rpcrdma_bc_send_request builds the transport header and
120 * the backchannel RPC message in the same buffer. Thus only
121 * one SGE is needed to send both.
122 */
123 ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer,
124 rqst->rq_snd_buf.len);
125 if (ret < 0)
113 goto out_err; 126 goto out_err;
114 127
115 ret = svc_rdma_repost_recv(rdma, GFP_NOIO); 128 ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
116 if (ret) 129 if (ret)
117 goto out_err; 130 goto out_err;
118 131
119 ctxt = svc_rdma_get_context(rdma); 132 ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0);
120 ctxt->pages[0] = virt_to_page(rqst->rq_buffer); 133 if (ret)
121 ctxt->count = 1;
122
123 ctxt->direction = DMA_TO_DEVICE;
124 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
125 ctxt->sge[0].length = sndbuf->len;
126 ctxt->sge[0].addr =
127 ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
128 sndbuf->len, DMA_TO_DEVICE);
129 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
130 ret = -EIO;
131 goto out_unmap;
132 }
133 svc_rdma_count_mappings(rdma, ctxt);
134
135 memset(&send_wr, 0, sizeof(send_wr));
136 ctxt->cqe.done = svc_rdma_wc_send;
137 send_wr.wr_cqe = &ctxt->cqe;
138 send_wr.sg_list = ctxt->sge;
139 send_wr.num_sge = 1;
140 send_wr.opcode = IB_WR_SEND;
141 send_wr.send_flags = IB_SEND_SIGNALED;
142
143 ret = svc_rdma_send(rdma, &send_wr);
144 if (ret) {
145 ret = -EIO;
146 goto out_unmap; 134 goto out_unmap;
147 }
148 135
149out_err: 136out_err:
150 svc_rdma_put_req_map(rdma, vec);
151 dprintk("svcrdma: %s returns %d\n", __func__, ret); 137 dprintk("svcrdma: %s returns %d\n", __func__, ret);
152 return ret; 138 return ret;
153 139
154out_unmap: 140out_unmap:
155 svc_rdma_unmap_dma(ctxt); 141 svc_rdma_unmap_dma(ctxt);
156 svc_rdma_put_context(ctxt, 1); 142 svc_rdma_put_context(ctxt, 1);
143 ret = -EIO;
157 goto out_err; 144 goto out_err;
158} 145}
159 146
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 1c4aabf0f657..bdcf7d85a3dc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -166,92 +166,3 @@ out_inval:
166 dprintk("svcrdma: failed to parse transport header\n"); 166 dprintk("svcrdma: failed to parse transport header\n");
167 return -EINVAL; 167 return -EINVAL;
168} 168}
169
170int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
171 struct rpcrdma_msg *rmsgp,
172 enum rpcrdma_errcode err, __be32 *va)
173{
174 __be32 *startp = va;
175
176 *va++ = rmsgp->rm_xid;
177 *va++ = rmsgp->rm_vers;
178 *va++ = xprt->sc_fc_credits;
179 *va++ = rdma_error;
180 *va++ = cpu_to_be32(err);
181 if (err == ERR_VERS) {
182 *va++ = rpcrdma_version;
183 *va++ = rpcrdma_version;
184 }
185
186 return (int)((unsigned long)va - (unsigned long)startp);
187}
188
189/**
190 * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
191 * @rdma_resp: buffer containing Reply transport header
192 *
193 * Returns length of transport header, in bytes.
194 */
195unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
196{
197 unsigned int nsegs;
198 __be32 *p;
199
200 p = rdma_resp;
201
202 /* RPC-over-RDMA V1 replies never have a Read list. */
203 p += rpcrdma_fixed_maxsz + 1;
204
205 /* Skip Write list. */
206 while (*p++ != xdr_zero) {
207 nsegs = be32_to_cpup(p++);
208 p += nsegs * rpcrdma_segment_maxsz;
209 }
210
211 /* Skip Reply chunk. */
212 if (*p++ != xdr_zero) {
213 nsegs = be32_to_cpup(p++);
214 p += nsegs * rpcrdma_segment_maxsz;
215 }
216
217 return (unsigned long)p - (unsigned long)rdma_resp;
218}
219
220void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
221{
222 struct rpcrdma_write_array *ary;
223
224 /* no read-list */
225 rmsgp->rm_body.rm_chunks[0] = xdr_zero;
226
227 /* write-array discrim */
228 ary = (struct rpcrdma_write_array *)
229 &rmsgp->rm_body.rm_chunks[1];
230 ary->wc_discrim = xdr_one;
231 ary->wc_nchunks = cpu_to_be32(chunks);
232
233 /* write-list terminator */
234 ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
235
236 /* reply-array discriminator */
237 ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
238}
239
240void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
241 int chunks)
242{
243 ary->wc_discrim = xdr_one;
244 ary->wc_nchunks = cpu_to_be32(chunks);
245}
246
247void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
248 int chunk_no,
249 __be32 rs_handle,
250 __be64 rs_offset,
251 u32 write_len)
252{
253 struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
254 seg->rs_handle = rs_handle;
255 seg->rs_offset = rs_offset;
256 seg->rs_length = cpu_to_be32(write_len);
257}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f7b2daf72a86..27a99bf5b1a6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp,
558 rqstp->rq_arg.buflen = head->arg.buflen; 558 rqstp->rq_arg.buflen = head->arg.buflen;
559} 559}
560 560
561static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
562 __be32 *rdma_argp, int status)
563{
564 struct svc_rdma_op_ctxt *ctxt;
565 __be32 *p, *err_msgp;
566 unsigned int length;
567 struct page *page;
568 int ret;
569
570 ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
571 if (ret)
572 return;
573
574 page = alloc_page(GFP_KERNEL);
575 if (!page)
576 return;
577 err_msgp = page_address(page);
578
579 p = err_msgp;
580 *p++ = *rdma_argp;
581 *p++ = *(rdma_argp + 1);
582 *p++ = xprt->sc_fc_credits;
583 *p++ = rdma_error;
584 if (status == -EPROTONOSUPPORT) {
585 *p++ = err_vers;
586 *p++ = rpcrdma_version;
587 *p++ = rpcrdma_version;
588 } else {
589 *p++ = err_chunk;
590 }
591 length = (unsigned long)p - (unsigned long)err_msgp;
592
593 /* Map transport header; no RPC message payload */
594 ctxt = svc_rdma_get_context(xprt);
595 ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length);
596 if (ret) {
597 dprintk("svcrdma: Error %d mapping send for protocol error\n",
598 ret);
599 return;
600 }
601
602 ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0);
603 if (ret) {
604 dprintk("svcrdma: Error %d posting send for protocol error\n",
605 ret);
606 svc_rdma_unmap_dma(ctxt);
607 svc_rdma_put_context(ctxt, 1);
608 }
609}
610
561/* By convention, backchannel calls arrive via rdma_msg type 611/* By convention, backchannel calls arrive via rdma_msg type
562 * messages, and never populate the chunk lists. This makes 612 * messages, and never populate the chunk lists. This makes
563 * the RPC/RDMA header small and fixed in size, so it is 613 * the RPC/RDMA header small and fixed in size, so it is
564 * straightforward to check the RPC header's direction field. 614 * straightforward to check the RPC header's direction field.
565 */ 615 */
566static bool 616static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
567svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) 617 __be32 *rdma_resp)
568{ 618{
569 __be32 *p = (__be32 *)rmsgp; 619 __be32 *p;
570 620
571 if (!xprt->xpt_bc_xprt) 621 if (!xprt->xpt_bc_xprt)
572 return false; 622 return false;
573 623
574 if (rmsgp->rm_type != rdma_msg) 624 p = rdma_resp + 3;
625 if (*p++ != rdma_msg)
575 return false; 626 return false;
576 if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) 627
628 if (*p++ != xdr_zero)
577 return false; 629 return false;
578 if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) 630 if (*p++ != xdr_zero)
579 return false; 631 return false;
580 if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) 632 if (*p++ != xdr_zero)
581 return false; 633 return false;
582 634
583 /* sanity */ 635 /* XID sanity */
584 if (p[7] != rmsgp->rm_xid) 636 if (*p++ != *rdma_resp)
585 return false; 637 return false;
586 /* call direction */ 638 /* call direction */
587 if (p[8] == cpu_to_be32(RPC_CALL)) 639 if (*p == cpu_to_be32(RPC_CALL))
588 return false; 640 return false;
589 641
590 return true; 642 return true;
@@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
650 goto out_drop; 702 goto out_drop;
651 rqstp->rq_xprt_hlen = ret; 703 rqstp->rq_xprt_hlen = ret;
652 704
653 if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { 705 if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) {
654 ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, 706 ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt,
707 &rmsgp->rm_xid,
655 &rqstp->rq_arg); 708 &rqstp->rq_arg);
656 svc_rdma_put_context(ctxt, 0); 709 svc_rdma_put_context(ctxt, 0);
657 if (ret) 710 if (ret)
@@ -686,7 +739,7 @@ complete:
686 return ret; 739 return ret;
687 740
688out_err: 741out_err:
689 svc_rdma_send_error(rdma_xprt, rmsgp, ret); 742 svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret);
690 svc_rdma_put_context(ctxt, 0); 743 svc_rdma_put_context(ctxt, 0);
691 return 0; 744 return 0;
692 745
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
new file mode 100644
index 000000000000..0cf620277693
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -0,0 +1,512 @@
1/*
2 * Copyright (c) 2016 Oracle. All rights reserved.
3 *
4 * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
5 */
6
7#include <linux/sunrpc/rpc_rdma.h>
8#include <linux/sunrpc/svc_rdma.h>
9#include <linux/sunrpc/debug.h>
10
11#include <rdma/rw.h>
12
13#define RPCDBG_FACILITY RPCDBG_SVCXPRT
14
15/* Each R/W context contains state for one chain of RDMA Read or
16 * Write Work Requests.
17 *
18 * Each WR chain handles a single contiguous server-side buffer,
19 * because scatterlist entries after the first have to start on
20 * page alignment. xdr_buf iovecs cannot guarantee alignment.
21 *
22 * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
23 * from a client may contain a unique R_key, so each WR chain moves
24 * up to one segment at a time.
25 *
26 * The scatterlist makes this data structure over 4KB in size. To
27 * make it less likely to fail, and to handle the allocation for
28 * smaller I/O requests without disabling bottom-halves, these
29 * contexts are created on demand, but cached and reused until the
30 * controlling svcxprt_rdma is destroyed.
31 */
32struct svc_rdma_rw_ctxt {
33 struct list_head rw_list;
34 struct rdma_rw_ctx rw_ctx;
35 int rw_nents;
36 struct sg_table rw_sg_table;
37 struct scatterlist rw_first_sgl[0];
38};
39
40static inline struct svc_rdma_rw_ctxt *
41svc_rdma_next_ctxt(struct list_head *list)
42{
43 return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
44 rw_list);
45}
46
47static struct svc_rdma_rw_ctxt *
48svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
49{
50 struct svc_rdma_rw_ctxt *ctxt;
51
52 spin_lock(&rdma->sc_rw_ctxt_lock);
53
54 ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
55 if (ctxt) {
56 list_del(&ctxt->rw_list);
57 spin_unlock(&rdma->sc_rw_ctxt_lock);
58 } else {
59 spin_unlock(&rdma->sc_rw_ctxt_lock);
60 ctxt = kmalloc(sizeof(*ctxt) +
61 SG_CHUNK_SIZE * sizeof(struct scatterlist),
62 GFP_KERNEL);
63 if (!ctxt)
64 goto out;
65 INIT_LIST_HEAD(&ctxt->rw_list);
66 }
67
68 ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
69 if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
70 ctxt->rw_sg_table.sgl)) {
71 kfree(ctxt);
72 ctxt = NULL;
73 }
74out:
75 return ctxt;
76}
77
78static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
79 struct svc_rdma_rw_ctxt *ctxt)
80{
81 sg_free_table_chained(&ctxt->rw_sg_table, true);
82
83 spin_lock(&rdma->sc_rw_ctxt_lock);
84 list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
85 spin_unlock(&rdma->sc_rw_ctxt_lock);
86}
87
88/**
89 * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
90 * @rdma: transport about to be destroyed
91 *
92 */
93void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
94{
95 struct svc_rdma_rw_ctxt *ctxt;
96
97 while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
98 list_del(&ctxt->rw_list);
99 kfree(ctxt);
100 }
101}
102
103/* A chunk context tracks all I/O for moving one Read or Write
104 * chunk. This is a a set of rdma_rw's that handle data movement
105 * for all segments of one chunk.
106 *
107 * These are small, acquired with a single allocator call, and
108 * no more than one is needed per chunk. They are allocated on
109 * demand, and not cached.
110 */
111struct svc_rdma_chunk_ctxt {
112 struct ib_cqe cc_cqe;
113 struct svcxprt_rdma *cc_rdma;
114 struct list_head cc_rwctxts;
115 int cc_sqecount;
116 enum dma_data_direction cc_dir;
117};
118
119static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
120 struct svc_rdma_chunk_ctxt *cc,
121 enum dma_data_direction dir)
122{
123 cc->cc_rdma = rdma;
124 svc_xprt_get(&rdma->sc_xprt);
125
126 INIT_LIST_HEAD(&cc->cc_rwctxts);
127 cc->cc_sqecount = 0;
128 cc->cc_dir = dir;
129}
130
131static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc)
132{
133 struct svcxprt_rdma *rdma = cc->cc_rdma;
134 struct svc_rdma_rw_ctxt *ctxt;
135
136 while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
137 list_del(&ctxt->rw_list);
138
139 rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
140 rdma->sc_port_num, ctxt->rw_sg_table.sgl,
141 ctxt->rw_nents, cc->cc_dir);
142 svc_rdma_put_rw_ctxt(rdma, ctxt);
143 }
144 svc_xprt_put(&rdma->sc_xprt);
145}
146
147/* State for sending a Write or Reply chunk.
148 * - Tracks progress of writing one chunk over all its segments
149 * - Stores arguments for the SGL constructor functions
150 */
151struct svc_rdma_write_info {
152 /* write state of this chunk */
153 unsigned int wi_seg_off;
154 unsigned int wi_seg_no;
155 unsigned int wi_nsegs;
156 __be32 *wi_segs;
157
158 /* SGL constructor arguments */
159 struct xdr_buf *wi_xdr;
160 unsigned char *wi_base;
161 unsigned int wi_next_off;
162
163 struct svc_rdma_chunk_ctxt wi_cc;
164};
165
166static struct svc_rdma_write_info *
167svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
168{
169 struct svc_rdma_write_info *info;
170
171 info = kmalloc(sizeof(*info), GFP_KERNEL);
172 if (!info)
173 return info;
174
175 info->wi_seg_off = 0;
176 info->wi_seg_no = 0;
177 info->wi_nsegs = be32_to_cpup(++chunk);
178 info->wi_segs = ++chunk;
179 svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE);
180 return info;
181}
182
183static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
184{
185 svc_rdma_cc_release(&info->wi_cc);
186 kfree(info);
187}
188
189/**
190 * svc_rdma_write_done - Write chunk completion
191 * @cq: controlling Completion Queue
192 * @wc: Work Completion
193 *
194 * Pages under I/O are freed by a subsequent Send completion.
195 */
196static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
197{
198 struct ib_cqe *cqe = wc->wr_cqe;
199 struct svc_rdma_chunk_ctxt *cc =
200 container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
201 struct svcxprt_rdma *rdma = cc->cc_rdma;
202 struct svc_rdma_write_info *info =
203 container_of(cc, struct svc_rdma_write_info, wi_cc);
204
205 atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
206 wake_up(&rdma->sc_send_wait);
207
208 if (unlikely(wc->status != IB_WC_SUCCESS)) {
209 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
210 if (wc->status != IB_WC_WR_FLUSH_ERR)
211 pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
212 ib_wc_status_msg(wc->status),
213 wc->status, wc->vendor_err);
214 }
215
216 svc_rdma_write_info_free(info);
217}
218
219/* This function sleeps when the transport's Send Queue is congested.
220 *
221 * Assumptions:
222 * - If ib_post_send() succeeds, only one completion is expected,
223 * even if one or more WRs are flushed. This is true when posting
224 * an rdma_rw_ctx or when posting a single signaled WR.
225 */
226static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
227{
228 struct svcxprt_rdma *rdma = cc->cc_rdma;
229 struct svc_xprt *xprt = &rdma->sc_xprt;
230 struct ib_send_wr *first_wr, *bad_wr;
231 struct list_head *tmp;
232 struct ib_cqe *cqe;
233 int ret;
234
235 first_wr = NULL;
236 cqe = &cc->cc_cqe;
237 list_for_each(tmp, &cc->cc_rwctxts) {
238 struct svc_rdma_rw_ctxt *ctxt;
239
240 ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
241 first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
242 rdma->sc_port_num, cqe, first_wr);
243 cqe = NULL;
244 }
245
246 do {
247 if (atomic_sub_return(cc->cc_sqecount,
248 &rdma->sc_sq_avail) > 0) {
249 ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
250 if (ret)
251 break;
252 return 0;
253 }
254
255 atomic_inc(&rdma_stat_sq_starve);
256 atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
257 wait_event(rdma->sc_send_wait,
258 atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
259 } while (1);
260
261 pr_err("svcrdma: ib_post_send failed (%d)\n", ret);
262 set_bit(XPT_CLOSE, &xprt->xpt_flags);
263
264 /* If even one was posted, there will be a completion. */
265 if (bad_wr != first_wr)
266 return 0;
267
268 atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
269 wake_up(&rdma->sc_send_wait);
270 return -ENOTCONN;
271}
272
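The Send Queue accounting in svc_rdma_post_chunk_ctxt() can be looked at in isolation. The userspace sketch below models only the reserve/release arithmetic with C11 atomics; the kernel version posts the WR chain on success and sleeps on sc_send_wait instead of reporting failure, and the queue depth used here is an arbitrary example value.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Counter of available Send Queue entries (sc_sq_avail in the patch). */
static atomic_int sq_avail;

/* Try to reserve 'needed' SQ entries, mirroring the accounting in
 * svc_rdma_post_chunk_ctxt(): subtract first, and if the result goes
 * non-positive, give the credits back.
 */
static bool sq_try_reserve(int needed)
{
	/* fetch_sub returns the old value; old - needed is what
	 * atomic_sub_return() reports in the kernel code.
	 */
	if (atomic_fetch_sub(&sq_avail, needed) - needed > 0)
		return true;

	/* Overcommitted: hand the credits back. The kernel then sleeps
	 * on sc_send_wait until completions return enough credits.
	 */
	atomic_fetch_add(&sq_avail, needed);
	return false;
}

/* Completion path: return the credits (and, in the kernel, wake waiters). */
static void sq_release(int count)
{
	atomic_fetch_add(&sq_avail, count);
}

int main(void)
{
	atomic_init(&sq_avail, 4);			/* pretend sc_sq_depth == 4 */

	printf("reserve 3: %d\n", sq_try_reserve(3));	/* 1: 4 - 3 = 1 > 0 */
	printf("reserve 3: %d\n", sq_try_reserve(3));	/* 0: would overcommit */
	sq_release(3);					/* a chunk completed */
	printf("reserve 3: %d\n", sq_try_reserve(3));	/* 1 again */
	return 0;
}
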
273/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
274 */
275static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
276 unsigned int len,
277 struct svc_rdma_rw_ctxt *ctxt)
278{
279 struct scatterlist *sg = ctxt->rw_sg_table.sgl;
280
281 sg_set_buf(&sg[0], info->wi_base, len);
282 info->wi_base += len;
283
284 ctxt->rw_nents = 1;
285}
286
287/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
288 */
289static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
290 unsigned int remaining,
291 struct svc_rdma_rw_ctxt *ctxt)
292{
293 unsigned int sge_no, sge_bytes, page_off, page_no;
294 struct xdr_buf *xdr = info->wi_xdr;
295 struct scatterlist *sg;
296 struct page **page;
297
298 page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK;
299 page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT;
300 page = xdr->pages + page_no;
301 info->wi_next_off += remaining;
302 sg = ctxt->rw_sg_table.sgl;
303 sge_no = 0;
304 do {
305 sge_bytes = min_t(unsigned int, remaining,
306 PAGE_SIZE - page_off);
307 sg_set_page(sg, *page, sge_bytes, page_off);
308
309 remaining -= sge_bytes;
310 sg = sg_next(sg);
311 page_off = 0;
312 sge_no++;
313 page++;
314 } while (remaining);
315
316 ctxt->rw_nents = sge_no;
317}
318
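The page arithmetic above is easier to follow with concrete numbers. This standalone sketch (page size, offsets and lengths are illustrative only) prints the per-page pieces that would each become one scatterlist entry.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1u << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned int page_base = 100;	/* xdr->page_base */
	unsigned int next_off = 6000;	/* info->wi_next_off */
	unsigned int remaining = 9000;	/* bytes to describe */
	unsigned int page_off, page_no, sge_bytes, sge_no = 0;

	page_off = (next_off + page_base) & ~PAGE_MASK;
	page_no = (next_off + page_base) >> PAGE_SHIFT;

	while (remaining) {
		sge_bytes = PAGE_SIZE - page_off;
		if (sge_bytes > remaining)
			sge_bytes = remaining;
		printf("sge %u: page %u, offset %u, %u bytes\n",
		       sge_no, page_no, page_off, sge_bytes);
		remaining -= sge_bytes;
		page_off = 0;	/* later pieces start at a page boundary */
		page_no++;
		sge_no++;
	}
	return 0;
}
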
319/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
320 * an RPC Reply.
321 */
322static int
323svc_rdma_build_writes(struct svc_rdma_write_info *info,
324 void (*constructor)(struct svc_rdma_write_info *info,
325 unsigned int len,
326 struct svc_rdma_rw_ctxt *ctxt),
327 unsigned int remaining)
328{
329 struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
330 struct svcxprt_rdma *rdma = cc->cc_rdma;
331 struct svc_rdma_rw_ctxt *ctxt;
332 __be32 *seg;
333 int ret;
334
335 cc->cc_cqe.done = svc_rdma_write_done;
336 seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
337 do {
338 unsigned int write_len;
339 u32 seg_length, seg_handle;
340 u64 seg_offset;
341
342 if (info->wi_seg_no >= info->wi_nsegs)
343 goto out_overflow;
344
345 seg_handle = be32_to_cpup(seg);
346 seg_length = be32_to_cpup(seg + 1);
347 xdr_decode_hyper(seg + 2, &seg_offset);
348 seg_offset += info->wi_seg_off;
349
350 write_len = min(remaining, seg_length - info->wi_seg_off);
351 ctxt = svc_rdma_get_rw_ctxt(rdma,
352 (write_len >> PAGE_SHIFT) + 2);
353 if (!ctxt)
354 goto out_noctx;
355
356 constructor(info, write_len, ctxt);
357 ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
358 rdma->sc_port_num, ctxt->rw_sg_table.sgl,
359 ctxt->rw_nents, 0, seg_offset,
360 seg_handle, DMA_TO_DEVICE);
361 if (ret < 0)
362 goto out_initerr;
363
364 list_add(&ctxt->rw_list, &cc->cc_rwctxts);
365 cc->cc_sqecount += ret;
366 if (write_len == seg_length - info->wi_seg_off) {
367 seg += 4;
368 info->wi_seg_no++;
369 info->wi_seg_off = 0;
370 } else {
371 info->wi_seg_off += write_len;
372 }
373 remaining -= write_len;
374 } while (remaining);
375
376 return 0;
377
378out_overflow:
379 dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
380 info->wi_nsegs);
381 return -E2BIG;
382
383out_noctx:
384 dprintk("svcrdma: no R/W ctxs available\n");
385 return -ENOMEM;
386
387out_initerr:
388 svc_rdma_put_rw_ctxt(rdma, ctxt);
389 pr_err("svcrdma: failed to map pagelist (%d)\n", ret);
390 return -EIO;
391}
392
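Here is a worked example of the segment walk in svc_rdma_build_writes(), reduced to plain integers: each iteration consumes min(remaining, space left in the current segment) and advances to the next segment only once the current one fills. The segment sizes and payload length below are invented for illustration.

#include <stdio.h>

int main(void)
{
	unsigned int seg_length[] = { 8192, 4096, 65536 };	/* from the Write chunk */
	unsigned int nsegs = 3;
	unsigned int remaining = 20480;		/* payload bytes to Write */
	unsigned int seg_no = 0, seg_off = 0;

	while (remaining) {
		unsigned int write_len;

		if (seg_no >= nsegs) {
			printf("-E2BIG: payload larger than the chunk\n");
			return 1;
		}

		write_len = seg_length[seg_no] - seg_off;
		if (write_len > remaining)
			write_len = remaining;
		printf("segment %u: write %u bytes at segment offset %u\n",
		       seg_no, write_len, seg_off);

		if (write_len == seg_length[seg_no] - seg_off) {
			seg_no++;		/* segment filled, move on */
			seg_off = 0;
		} else {
			seg_off += write_len;	/* rest of this segment is used later */
		}
		remaining -= write_len;
	}
	return 0;
}
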
393/* Send one of an xdr_buf's kvecs by itself. To send a Reply
394 * chunk, the whole RPC Reply is written back to the client.
395 * This function writes either the head or tail of the xdr_buf
396 * containing the Reply.
397 */
398static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
399 struct kvec *vec)
400{
401 info->wi_base = vec->iov_base;
402 return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
403 vec->iov_len);
404}
405
406/* Send an xdr_buf's page list by itself. A Write chunk is
407 * just the page list. A Reply chunk is the head, page list,
408 * and tail. This function is shared between the two types
409 * of chunk.
410 */
411static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
412 struct xdr_buf *xdr)
413{
414 info->wi_xdr = xdr;
415 info->wi_next_off = 0;
416 return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
417 xdr->page_len);
418}
419
420/**
421 * svc_rdma_send_write_chunk - Write all segments in a Write chunk
422 * @rdma: controlling RDMA transport
423 * @wr_ch: Write chunk provided by client
424 * @xdr: xdr_buf containing the data payload
425 *
426 * Returns a non-negative number of bytes the chunk consumed, or
427 * %-E2BIG if the payload was larger than the Write chunk,
428 * %-ENOMEM if rdma_rw context pool was exhausted,
429 * %-ENOTCONN if posting failed (connection is lost),
430 * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
431 */
432int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
433 struct xdr_buf *xdr)
434{
435 struct svc_rdma_write_info *info;
436 int ret;
437
438 if (!xdr->page_len)
439 return 0;
440
441 info = svc_rdma_write_info_alloc(rdma, wr_ch);
442 if (!info)
443 return -ENOMEM;
444
445 ret = svc_rdma_send_xdr_pagelist(info, xdr);
446 if (ret < 0)
447 goto out_err;
448
449 ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
450 if (ret < 0)
451 goto out_err;
452 return xdr->page_len;
453
454out_err:
455 svc_rdma_write_info_free(info);
456 return ret;
457}
458
459/**
460 * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
461 * @rdma: controlling RDMA transport
462 * @rp_ch: Reply chunk provided by client
463 * @writelist: true if client provided a Write list
464 * @xdr: xdr_buf containing an RPC Reply
465 *
466 * Returns a non-negative number of bytes the chunk consumed, or
467 * %-E2BIG if the payload was larger than the Reply chunk,
468 * %-ENOMEM if rdma_rw context pool was exhausted,
469 * %-ENOTCONN if posting failed (connection is lost),
470 * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
471 */
472int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
473 bool writelist, struct xdr_buf *xdr)
474{
475 struct svc_rdma_write_info *info;
476 int consumed, ret;
477
478 info = svc_rdma_write_info_alloc(rdma, rp_ch);
479 if (!info)
480 return -ENOMEM;
481
482 ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
483 if (ret < 0)
484 goto out_err;
485 consumed = xdr->head[0].iov_len;
486
487 /* Send the page list in the Reply chunk only if the
488 * client did not provide Write chunks.
489 */
490 if (!writelist && xdr->page_len) {
491 ret = svc_rdma_send_xdr_pagelist(info, xdr);
492 if (ret < 0)
493 goto out_err;
494 consumed += xdr->page_len;
495 }
496
497 if (xdr->tail[0].iov_len) {
498 ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
499 if (ret < 0)
500 goto out_err;
501 consumed += xdr->tail[0].iov_len;
502 }
503
504 ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
505 if (ret < 0)
506 goto out_err;
507 return consumed;
508
509out_err:
510 svc_rdma_write_info_free(info);
511 return ret;
512}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 515221b16d09..1736337f3a55 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -1,4 +1,5 @@
1/* 1/*
2 * Copyright (c) 2016 Oracle. All rights reserved.
2 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. 3 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
3 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 4 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
4 * 5 *
@@ -40,6 +41,63 @@
40 * Author: Tom Tucker <tom@opengridcomputing.com> 41 * Author: Tom Tucker <tom@opengridcomputing.com>
41 */ 42 */
42 43
44/* Operation
45 *
46 * The main entry point is svc_rdma_sendto. This is called by the
47 * RPC server when an RPC Reply is ready to be transmitted to a client.
48 *
49 * The passed-in svc_rqst contains a struct xdr_buf which holds an
50 * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
51 * transport header, post all Write WRs needed for this Reply, then post
52 * a Send WR conveying the transport header and the RPC message itself to
53 * the client.
54 *
55 * svc_rdma_sendto must fully transmit the Reply before returning, as
56 * the svc_rqst will be recycled as soon as sendto returns. Remaining
57 * resources referred to by the svc_rqst are also recycled at that time.
58 * Therefore any resources that must remain longer must be detached
59 * from the svc_rqst and released later.
60 *
61 * Page Management
62 *
63 * The I/O that performs Reply transmission is asynchronous, and may
64 * complete well after sendto returns. Thus pages under I/O must be
65 * removed from the svc_rqst before sendto returns.
66 *
67 * The logic here depends on Send Queue and completion ordering. Since
68 * the Send WR is always posted last, it will always complete last. Thus
69 * when it completes, it is guaranteed that all previous Write WRs have
70 * also completed.
71 *
72 * Write WRs are constructed and posted. Each Write segment gets its own
73 * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
74 * DMA-unmap the pages under I/O for that Write segment. The Write
75 * completion handler does not release any pages.
76 *
77 * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
 78 * The ownership of all of the Reply's pages is transferred into that
79 * ctxt, the Send WR is posted, and sendto returns.
80 *
81 * The svc_rdma_op_ctxt is presented when the Send WR completes. The
82 * Send completion handler finally releases the Reply's pages.
83 *
84 * This mechanism also assumes that completions on the transport's Send
85 * Completion Queue do not run in parallel. Otherwise a Write completion
86 * and Send completion running at the same time could release pages that
87 * are still DMA-mapped.
88 *
89 * Error Handling
90 *
91 * - If the Send WR is posted successfully, it will either complete
92 * successfully, or get flushed. Either way, the Send completion
93 * handler releases the Reply's pages.
 94 * - If the Send WR cannot be posted, the forward path releases
95 * the Reply's pages.
96 *
97 * This handles the case, without the use of page reference counting,
98 * where two different Write segments send portions of the same page.
99 */
100
43#include <linux/sunrpc/debug.h> 101#include <linux/sunrpc/debug.h>
44#include <linux/sunrpc/rpc_rdma.h> 102#include <linux/sunrpc/rpc_rdma.h>
45#include <linux/spinlock.h> 103#include <linux/spinlock.h>
@@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len)
55 return (len & 3) ? (4 - (len & 3)) : 0; 113 return (len & 3) ? (4 - (len & 3)) : 0;
56} 114}
57 115
58int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, 116/* Returns length of transport header, in bytes.
59 struct xdr_buf *xdr, 117 */
60 struct svc_rdma_req_map *vec, 118static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
61 bool write_chunk_present)
62{ 119{
63 int sge_no; 120 unsigned int nsegs;
64 u32 sge_bytes; 121 __be32 *p;
65 u32 page_bytes;
66 u32 page_off;
67 int page_no;
68
69 if (xdr->len !=
70 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
71 pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
72 return -EIO;
73 }
74 122
75 /* Skip the first sge, this is for the RPCRDMA header */ 123 p = rdma_resp;
76 sge_no = 1; 124
125 /* RPC-over-RDMA V1 replies never have a Read list. */
126 p += rpcrdma_fixed_maxsz + 1;
77 127
78 /* Head SGE */ 128 /* Skip Write list. */
79 vec->sge[sge_no].iov_base = xdr->head[0].iov_base; 129 while (*p++ != xdr_zero) {
80 vec->sge[sge_no].iov_len = xdr->head[0].iov_len; 130 nsegs = be32_to_cpup(p++);
81 sge_no++; 131 p += nsegs * rpcrdma_segment_maxsz;
82
83 /* pages SGE */
84 page_no = 0;
85 page_bytes = xdr->page_len;
86 page_off = xdr->page_base;
87 while (page_bytes) {
88 vec->sge[sge_no].iov_base =
89 page_address(xdr->pages[page_no]) + page_off;
90 sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
91 page_bytes -= sge_bytes;
92 vec->sge[sge_no].iov_len = sge_bytes;
93
94 sge_no++;
95 page_no++;
96 page_off = 0; /* reset for next time through loop */
97 } 132 }
98 133
99 /* Tail SGE */ 134 /* Skip Reply chunk. */
100 if (xdr->tail[0].iov_len) { 135 if (*p++ != xdr_zero) {
101 unsigned char *base = xdr->tail[0].iov_base; 136 nsegs = be32_to_cpup(p++);
102 size_t len = xdr->tail[0].iov_len; 137 p += nsegs * rpcrdma_segment_maxsz;
103 u32 xdr_pad = xdr_padsize(xdr->page_len); 138 }
104 139
105 if (write_chunk_present && xdr_pad) { 140 return (unsigned long)p - (unsigned long)rdma_resp;
106 base += xdr_pad; 141}
107 len -= xdr_pad;
108 }
109 142
110 if (len) { 143/* One Write chunk is copied from Call transport header to Reply
111 vec->sge[sge_no].iov_base = base; 144 * transport header. Each segment's length field is updated to
112 vec->sge[sge_no].iov_len = len; 145 * reflect number of bytes consumed in the segment.
113 sge_no++; 146 *
147 * Returns number of segments in this chunk.
148 */
149static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
150 unsigned int remaining)
151{
152 unsigned int i, nsegs;
153 u32 seg_len;
154
155 /* Write list discriminator */
156 *dst++ = *src++;
157
158 /* number of segments in this chunk */
159 nsegs = be32_to_cpup(src);
160 *dst++ = *src++;
161
162 for (i = nsegs; i; i--) {
163 /* segment's RDMA handle */
164 *dst++ = *src++;
165
166 /* bytes returned in this segment */
167 seg_len = be32_to_cpu(*src);
168 if (remaining >= seg_len) {
169 /* entire segment was consumed */
170 *dst = *src;
171 remaining -= seg_len;
172 } else {
173 /* segment only partly filled */
174 *dst = cpu_to_be32(remaining);
175 remaining = 0;
114 } 176 }
115 } 177 dst++; src++;
116 178
117 dprintk("svcrdma: %s: sge_no %d page_no %d " 179 /* segment's RDMA offset */
118 "page_base %u page_len %u head_len %zu tail_len %zu\n", 180 *dst++ = *src++;
119 __func__, sge_no, page_no, xdr->page_base, xdr->page_len, 181 *dst++ = *src++;
120 xdr->head[0].iov_len, xdr->tail[0].iov_len); 182 }
121 183
122 vec->count = sge_no; 184 return nsegs;
123 return 0;
124} 185}
125 186
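The length rewrite performed by xdr_encode_write_chunk() boils down to clamping each segment, in order, to the bytes actually written, with any leftover segments reduced to zero. A small standalone sketch with made-up sizes (the XDR encoding and handle/offset copying are omitted):

#include <stdio.h>

int main(void)
{
	unsigned int seg_len[] = { 4096, 4096, 4096 };	/* sizes from the Call */
	unsigned int nsegs = 3;
	unsigned int consumed = 6000;	/* bytes the server actually wrote */
	unsigned int i;

	for (i = 0; i < nsegs; i++) {
		if (consumed >= seg_len[i]) {
			consumed -= seg_len[i];	/* segment fully used */
		} else {
			seg_len[i] = consumed;	/* partly filled, or zero */
			consumed = 0;
		}
		printf("segment %u: %u bytes returned\n", i, seg_len[i]);
	}
	return 0;
}
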
126static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, 187/* The client provided a Write list in the Call message. Fill in
127 struct xdr_buf *xdr, 188 * the segments in the first Write chunk in the Reply's transport
128 u32 xdr_off, size_t len, int dir) 189 * header with the number of bytes consumed in each segment.
190 * Remaining chunks are returned unused.
191 *
192 * Assumptions:
193 * - Client has provided only one Write chunk
194 */
195static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
196 unsigned int consumed)
129{ 197{
130 struct page *page; 198 unsigned int nsegs;
131 dma_addr_t dma_addr; 199 __be32 *p, *q;
132 if (xdr_off < xdr->head[0].iov_len) { 200
133 /* This offset is in the head */ 201 /* RPC-over-RDMA V1 replies never have a Read list. */
134 xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; 202 p = rdma_resp + rpcrdma_fixed_maxsz + 1;
135 page = virt_to_page(xdr->head[0].iov_base); 203
136 } else { 204 q = wr_ch;
137 xdr_off -= xdr->head[0].iov_len; 205 while (*q != xdr_zero) {
138 if (xdr_off < xdr->page_len) { 206 nsegs = xdr_encode_write_chunk(p, q, consumed);
139 /* This offset is in the page list */ 207 q += 2 + nsegs * rpcrdma_segment_maxsz;
140 xdr_off += xdr->page_base; 208 p += 2 + nsegs * rpcrdma_segment_maxsz;
141 page = xdr->pages[xdr_off >> PAGE_SHIFT]; 209 consumed = 0;
142 xdr_off &= ~PAGE_MASK;
143 } else {
144 /* This offset is in the tail */
145 xdr_off -= xdr->page_len;
146 xdr_off += (unsigned long)
147 xdr->tail[0].iov_base & ~PAGE_MASK;
148 page = virt_to_page(xdr->tail[0].iov_base);
149 }
150 } 210 }
151 dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, 211
152 min_t(size_t, PAGE_SIZE, len), dir); 212 /* Terminate Write list */
153 return dma_addr; 213 *p++ = xdr_zero;
214
215 /* Reply chunk discriminator; may be replaced later */
216 *p = xdr_zero;
217}
218
219/* The client provided a Reply chunk in the Call message. Fill in
220 * the segments in the Reply chunk in the Reply message with the
221 * number of bytes consumed in each segment.
222 *
223 * Assumptions:
224 * - Reply can always fit in the provided Reply chunk
225 */
226static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
227 unsigned int consumed)
228{
229 __be32 *p;
230
231 /* Find the Reply chunk in the Reply's xprt header.
232 * RPC-over-RDMA V1 replies never have a Read list.
233 */
234 p = rdma_resp + rpcrdma_fixed_maxsz + 1;
235
236 /* Skip past Write list */
237 while (*p++ != xdr_zero)
238 p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
239
240 xdr_encode_write_chunk(p, rp_ch, consumed);
154} 241}
155 242
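The pointer walk used here (and in svc_rdma_reply_hdr_len() above) can be exercised outside the kernel. The sketch below builds a toy header in host-endian words (the real header is big-endian XDR) and skips the fixed words, the empty Read list and the Write list to land on the Reply chunk; the handles, lengths and counts are arbitrary example values.

#include <stdio.h>
#include <stdint.h>

#define RPCRDMA_FIXED_MAXSZ	4	/* xid, vers, credits, proc */
#define RPCRDMA_SEGMENT_MAXSZ	4	/* handle, length, 64-bit offset */

int main(void)
{
	uint32_t hdr[] = {
		1, 1, 32, 2,			/* xid, vers, credits, proc */
		0,				/* Read list: empty */
		1, 2,				/* Write chunk: present, 2 segments */
		7, 4096, 0, 0,			/* segment 1 */
		8, 4096, 0, 0,			/* segment 2 */
		0,				/* Write list terminator */
		1, 1,				/* Reply chunk: present, 1 segment */
		9, 8192, 0, 0,			/* its only segment */
	};
	uint32_t *p = hdr + RPCRDMA_FIXED_MAXSZ + 1;	/* past the Read list */

	/* Skip each Write chunk: discriminator, segment count, then segments. */
	while (*p++ != 0)
		p += 1 + p[0] * RPCRDMA_SEGMENT_MAXSZ;

	printf("Reply chunk at word %ld, %u segment(s)\n",
	       (long)(p - hdr), (unsigned int)p[1]);
	return 0;
}
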
156/* Parse the RPC Call's transport header. 243/* Parse the RPC Call's transport header.
157 */ 244 */
158static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, 245static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
159 struct rpcrdma_write_array **write, 246 __be32 **write, __be32 **reply)
160 struct rpcrdma_write_array **reply)
161{ 247{
162 __be32 *p; 248 __be32 *p;
163 249
164 p = (__be32 *)&rmsgp->rm_body.rm_chunks[0]; 250 p = rdma_argp + rpcrdma_fixed_maxsz;
165 251
166 /* Read list */ 252 /* Read list */
167 while (*p++ != xdr_zero) 253 while (*p++ != xdr_zero)
@@ -169,7 +255,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
169 255
170 /* Write list */ 256 /* Write list */
171 if (*p != xdr_zero) { 257 if (*p != xdr_zero) {
172 *write = (struct rpcrdma_write_array *)p; 258 *write = p;
173 while (*p++ != xdr_zero) 259 while (*p++ != xdr_zero)
174 p += 1 + be32_to_cpu(*p) * 4; 260 p += 1 + be32_to_cpu(*p) * 4;
175 } else { 261 } else {
@@ -179,7 +265,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
179 265
180 /* Reply chunk */ 266 /* Reply chunk */
181 if (*p != xdr_zero) 267 if (*p != xdr_zero)
182 *reply = (struct rpcrdma_write_array *)p; 268 *reply = p;
183 else 269 else
184 *reply = NULL; 270 *reply = NULL;
185} 271}
@@ -189,360 +275,321 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
189 * Invalidate, and responder chooses one rkey to invalidate. 275 * Invalidate, and responder chooses one rkey to invalidate.
190 * 276 *
191 * Find a candidate rkey to invalidate when sending a reply. Picks the 277 * Find a candidate rkey to invalidate when sending a reply. Picks the
192 * first rkey it finds in the chunks lists. 278 * first R_key it finds in the chunk lists.
193 * 279 *
194 * Returns zero if RPC's chunk lists are empty. 280 * Returns zero if RPC's chunk lists are empty.
195 */ 281 */
196static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, 282static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
197 struct rpcrdma_write_array *wr_ary, 283 __be32 *wr_lst, __be32 *rp_ch)
198 struct rpcrdma_write_array *rp_ary)
199{ 284{
200 struct rpcrdma_read_chunk *rd_ary; 285 __be32 *p;
201 struct rpcrdma_segment *arg_ch;
202 286
203 rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0]; 287 p = rdma_argp + rpcrdma_fixed_maxsz;
204 if (rd_ary->rc_discrim != xdr_zero) 288 if (*p != xdr_zero)
205 return be32_to_cpu(rd_ary->rc_target.rs_handle); 289 p += 2;
290 else if (wr_lst && be32_to_cpup(wr_lst + 1))
291 p = wr_lst + 2;
292 else if (rp_ch && be32_to_cpup(rp_ch + 1))
293 p = rp_ch + 2;
294 else
295 return 0;
296 return be32_to_cpup(p);
297}
206 298
207 if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { 299/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
208 arg_ch = &wr_ary->wc_array[0].wc_target; 300 * is used during completion to DMA-unmap this memory, and
209 return be32_to_cpu(arg_ch->rs_handle); 301 * it uses ib_dma_unmap_page() exclusively.
210 } 302 */
303static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
304 struct svc_rdma_op_ctxt *ctxt,
305 unsigned int sge_no,
306 unsigned char *base,
307 unsigned int len)
308{
309 unsigned long offset = (unsigned long)base & ~PAGE_MASK;
310 struct ib_device *dev = rdma->sc_cm_id->device;
311 dma_addr_t dma_addr;
211 312
212 if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { 313 dma_addr = ib_dma_map_page(dev, virt_to_page(base),
213 arg_ch = &rp_ary->wc_array[0].wc_target; 314 offset, len, DMA_TO_DEVICE);
214 return be32_to_cpu(arg_ch->rs_handle); 315 if (ib_dma_mapping_error(dev, dma_addr))
215 } 316 return -EIO;
216 317
318 ctxt->sge[sge_no].addr = dma_addr;
319 ctxt->sge[sge_no].length = len;
320 ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
321 svc_rdma_count_mappings(rdma, ctxt);
217 return 0; 322 return 0;
218} 323}
219 324
220/* Assumptions: 325static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
221 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 326 struct svc_rdma_op_ctxt *ctxt,
222 */ 327 unsigned int sge_no,
223static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, 328 struct page *page,
224 u32 rmr, u64 to, 329 unsigned int offset,
225 u32 xdr_off, int write_len, 330 unsigned int len)
226 struct svc_rdma_req_map *vec)
227{ 331{
228 struct ib_rdma_wr write_wr; 332 struct ib_device *dev = rdma->sc_cm_id->device;
229 struct ib_sge *sge; 333 dma_addr_t dma_addr;
230 int xdr_sge_no;
231 int sge_no;
232 int sge_bytes;
233 int sge_off;
234 int bc;
235 struct svc_rdma_op_ctxt *ctxt;
236 334
237 if (vec->count > RPCSVC_MAXPAGES) { 335 dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
238 pr_err("svcrdma: Too many pages (%lu)\n", vec->count); 336 if (ib_dma_mapping_error(dev, dma_addr))
239 return -EIO; 337 return -EIO;
240 }
241 338
242 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " 339 ctxt->sge[sge_no].addr = dma_addr;
243 "write_len=%d, vec->sge=%p, vec->count=%lu\n", 340 ctxt->sge[sge_no].length = len;
244 rmr, (unsigned long long)to, xdr_off, 341 ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
245 write_len, vec->sge, vec->count); 342 svc_rdma_count_mappings(rdma, ctxt);
343 return 0;
344}
246 345
247 ctxt = svc_rdma_get_context(xprt); 346/**
347 * svc_rdma_map_reply_hdr - DMA map the transport header buffer
348 * @rdma: controlling transport
349 * @ctxt: op_ctxt for the Send WR
350 * @rdma_resp: buffer containing transport header
351 * @len: length of transport header
352 *
353 * Returns:
354 * %0 if the header is DMA mapped,
355 * %-EIO if DMA mapping failed.
356 */
357int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
358 struct svc_rdma_op_ctxt *ctxt,
359 __be32 *rdma_resp,
360 unsigned int len)
361{
248 ctxt->direction = DMA_TO_DEVICE; 362 ctxt->direction = DMA_TO_DEVICE;
249 sge = ctxt->sge; 363 ctxt->pages[0] = virt_to_page(rdma_resp);
250 364 ctxt->count = 1;
251 /* Find the SGE associated with xdr_off */ 365 return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len);
252 for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
253 xdr_sge_no++) {
254 if (vec->sge[xdr_sge_no].iov_len > bc)
255 break;
256 bc -= vec->sge[xdr_sge_no].iov_len;
257 }
258
259 sge_off = bc;
260 bc = write_len;
261 sge_no = 0;
262
263 /* Copy the remaining SGE */
264 while (bc != 0) {
265 sge_bytes = min_t(size_t,
266 bc, vec->sge[xdr_sge_no].iov_len-sge_off);
267 sge[sge_no].length = sge_bytes;
268 sge[sge_no].addr =
269 dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
270 sge_bytes, DMA_TO_DEVICE);
271 xdr_off += sge_bytes;
272 if (ib_dma_mapping_error(xprt->sc_cm_id->device,
273 sge[sge_no].addr))
274 goto err;
275 svc_rdma_count_mappings(xprt, ctxt);
276 sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
277 ctxt->count++;
278 sge_off = 0;
279 sge_no++;
280 xdr_sge_no++;
281 if (xdr_sge_no > vec->count) {
282 pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
283 goto err;
284 }
285 bc -= sge_bytes;
286 if (sge_no == xprt->sc_max_sge)
287 break;
288 }
289
290 /* Prepare WRITE WR */
291 memset(&write_wr, 0, sizeof write_wr);
292 ctxt->cqe.done = svc_rdma_wc_write;
293 write_wr.wr.wr_cqe = &ctxt->cqe;
294 write_wr.wr.sg_list = &sge[0];
295 write_wr.wr.num_sge = sge_no;
296 write_wr.wr.opcode = IB_WR_RDMA_WRITE;
297 write_wr.wr.send_flags = IB_SEND_SIGNALED;
298 write_wr.rkey = rmr;
299 write_wr.remote_addr = to;
300
301 /* Post It */
302 atomic_inc(&rdma_stat_write);
303 if (svc_rdma_send(xprt, &write_wr.wr))
304 goto err;
305 return write_len - bc;
306 err:
307 svc_rdma_unmap_dma(ctxt);
308 svc_rdma_put_context(ctxt, 0);
309 return -EIO;
310} 366}
311 367
312noinline 368/* Load the xdr_buf into the ctxt's sge array, and DMA map each
313static int send_write_chunks(struct svcxprt_rdma *xprt, 369 * element as it is added.
314 struct rpcrdma_write_array *wr_ary, 370 *
315 struct rpcrdma_msg *rdma_resp, 371 * Returns the number of sge elements loaded on success, or
316 struct svc_rqst *rqstp, 372 * a negative errno on failure.
317 struct svc_rdma_req_map *vec) 373 */
374static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
375 struct svc_rdma_op_ctxt *ctxt,
376 struct xdr_buf *xdr, __be32 *wr_lst)
318{ 377{
319 u32 xfer_len = rqstp->rq_res.page_len; 378 unsigned int len, sge_no, remaining, page_off;
320 int write_len; 379 struct page **ppages;
321 u32 xdr_off; 380 unsigned char *base;
322 int chunk_off; 381 u32 xdr_pad;
323 int chunk_no;
324 int nchunks;
325 struct rpcrdma_write_array *res_ary;
326 int ret; 382 int ret;
327 383
328 res_ary = (struct rpcrdma_write_array *) 384 sge_no = 1;
329 &rdma_resp->rm_body.rm_chunks[1]; 385
330 386 ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++,
331 /* Write chunks start at the pagelist */ 387 xdr->head[0].iov_base,
332 nchunks = be32_to_cpu(wr_ary->wc_nchunks); 388 xdr->head[0].iov_len);
333 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; 389 if (ret < 0)
334 xfer_len && chunk_no < nchunks; 390 return ret;
335 chunk_no++) { 391
336 struct rpcrdma_segment *arg_ch; 392 /* If a Write chunk is present, the xdr_buf's page list
337 u64 rs_offset; 393 * is not included inline. However the Upper Layer may
338 394 * have added XDR padding in the tail buffer, and that
339 arg_ch = &wr_ary->wc_array[chunk_no].wc_target; 395 * should not be included inline.
340 write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); 396 */
341 397 if (wr_lst) {
342 /* Prepare the response chunk given the length actually 398 base = xdr->tail[0].iov_base;
343 * written */ 399 len = xdr->tail[0].iov_len;
344 xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset); 400 xdr_pad = xdr_padsize(xdr->page_len);
345 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, 401
346 arg_ch->rs_handle, 402 if (len && xdr_pad) {
347 arg_ch->rs_offset, 403 base += xdr_pad;
348 write_len); 404 len -= xdr_pad;
349 chunk_off = 0;
350 while (write_len) {
351 ret = send_write(xprt, rqstp,
352 be32_to_cpu(arg_ch->rs_handle),
353 rs_offset + chunk_off,
354 xdr_off,
355 write_len,
356 vec);
357 if (ret <= 0)
358 goto out_err;
359 chunk_off += ret;
360 xdr_off += ret;
361 xfer_len -= ret;
362 write_len -= ret;
363 } 405 }
406
407 goto tail;
408 }
409
410 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
411 page_off = xdr->page_base & ~PAGE_MASK;
412 remaining = xdr->page_len;
413 while (remaining) {
414 len = min_t(u32, PAGE_SIZE - page_off, remaining);
415
416 ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++,
417 *ppages++, page_off, len);
418 if (ret < 0)
419 return ret;
420
421 remaining -= len;
422 page_off = 0;
364 } 423 }
365 /* Update the req with the number of chunks actually used */
366 svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
367 424
368 return rqstp->rq_res.page_len; 425 base = xdr->tail[0].iov_base;
426 len = xdr->tail[0].iov_len;
427tail:
428 if (len) {
429 ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len);
430 if (ret < 0)
431 return ret;
432 }
369 433
370out_err: 434 return sge_no - 1;
371 pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret);
372 return -EIO;
373} 435}
374 436
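The tail-trimming logic in svc_rdma_map_reply_msg() depends only on xdr_padsize() and page_len, so it can be shown with plain numbers. The lengths below are illustrative; the point is that when a Write chunk carries the page data, its XDR pad at the front of the tail kvec is skipped rather than sent inline.

#include <stdio.h>

/* XDR pads the page data out to a 4-byte boundary. */
static unsigned int xdr_padsize(unsigned int len)
{
	return (len & 3) ? (4 - (len & 3)) : 0;
}

int main(void)
{
	unsigned int page_len = 1027;			/* body sent via RDMA Write */
	unsigned int tail_off = 0, tail_len = 14;	/* tail kvec */
	unsigned int pad = xdr_padsize(page_len);	/* 1 byte here */

	if (tail_len && pad) {
		tail_off += pad;			/* skip the pad bytes */
		tail_len -= pad;
	}
	printf("pad=%u, send tail bytes [%u, %u)\n",
	       pad, tail_off, tail_off + tail_len);
	return 0;
}
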
375noinline 437/* The svc_rqst and all resources it owns are released as soon as
376static int send_reply_chunks(struct svcxprt_rdma *xprt, 438 * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
377 struct rpcrdma_write_array *rp_ary, 439 * so they are released by the Send completion handler.
378 struct rpcrdma_msg *rdma_resp, 440 */
379 struct svc_rqst *rqstp, 441static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
380 struct svc_rdma_req_map *vec) 442 struct svc_rdma_op_ctxt *ctxt)
381{ 443{
382 u32 xfer_len = rqstp->rq_res.len; 444 int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
383 int write_len;
384 u32 xdr_off;
385 int chunk_no;
386 int chunk_off;
387 int nchunks;
388 struct rpcrdma_segment *ch;
389 struct rpcrdma_write_array *res_ary;
390 int ret;
391 445
392 /* XXX: need to fix when reply lists occur with read-list and or 446 ctxt->count += pages;
393 * write-list */ 447 for (i = 0; i < pages; i++) {
394 res_ary = (struct rpcrdma_write_array *) 448 ctxt->pages[i + 1] = rqstp->rq_respages[i];
395 &rdma_resp->rm_body.rm_chunks[2]; 449 rqstp->rq_respages[i] = NULL;
396
397 /* xdr offset starts at RPC message */
398 nchunks = be32_to_cpu(rp_ary->wc_nchunks);
399 for (xdr_off = 0, chunk_no = 0;
400 xfer_len && chunk_no < nchunks;
401 chunk_no++) {
402 u64 rs_offset;
403 ch = &rp_ary->wc_array[chunk_no].wc_target;
404 write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
405
406 /* Prepare the reply chunk given the length actually
407 * written */
408 xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
409 svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
410 ch->rs_handle, ch->rs_offset,
411 write_len);
412 chunk_off = 0;
413 while (write_len) {
414 ret = send_write(xprt, rqstp,
415 be32_to_cpu(ch->rs_handle),
416 rs_offset + chunk_off,
417 xdr_off,
418 write_len,
419 vec);
420 if (ret <= 0)
421 goto out_err;
422 chunk_off += ret;
423 xdr_off += ret;
424 xfer_len -= ret;
425 write_len -= ret;
426 }
427 } 450 }
428 /* Update the req with the number of chunks actually used */ 451 rqstp->rq_next_page = rqstp->rq_respages + 1;
429 svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); 452}
430 453
431 return rqstp->rq_res.len; 454/**
455 * svc_rdma_post_send_wr - Set up and post one Send Work Request
456 * @rdma: controlling transport
457 * @ctxt: op_ctxt for transmitting the Send WR
458 * @num_sge: number of SGEs to send
459 * @inv_rkey: R_key argument to Send With Invalidate, or zero
460 *
461 * Returns:
462 * %0 if the Send* was posted successfully,
463 * %-ENOTCONN if the connection was lost or dropped,
464 * %-EINVAL if there was a problem with the Send we built,
465 * %-ENOMEM if ib_post_send failed.
466 */
467int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
468 struct svc_rdma_op_ctxt *ctxt, int num_sge,
469 u32 inv_rkey)
470{
471 struct ib_send_wr *send_wr = &ctxt->send_wr;
432 472
433out_err: 473 dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge);
434 pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret); 474
435 return -EIO; 475 send_wr->next = NULL;
476 ctxt->cqe.done = svc_rdma_wc_send;
477 send_wr->wr_cqe = &ctxt->cqe;
478 send_wr->sg_list = ctxt->sge;
479 send_wr->num_sge = num_sge;
480 send_wr->send_flags = IB_SEND_SIGNALED;
481 if (inv_rkey) {
482 send_wr->opcode = IB_WR_SEND_WITH_INV;
483 send_wr->ex.invalidate_rkey = inv_rkey;
484 } else {
485 send_wr->opcode = IB_WR_SEND;
486 }
487
488 return svc_rdma_send(rdma, send_wr);
436} 489}
437 490
438/* This function prepares the portion of the RPCRDMA message to be 491/* Prepare the portion of the RPC Reply that will be transmitted
439 * sent in the RDMA_SEND. This function is called after data sent via 492 * via RDMA Send. The RPC-over-RDMA transport header is prepared
440 * RDMA has already been transmitted. There are three cases: 493 * in sge[0], and the RPC xdr_buf is prepared in following sges.
441 * - The RPCRDMA header, RPC header, and payload are all sent in a 494 *
442 * single RDMA_SEND. This is the "inline" case. 495 * Depending on whether a Write list or Reply chunk is present,
443 * - The RPCRDMA header and some portion of the RPC header and data 496 * the server may send all, a portion of, or none of the xdr_buf.
444 * are sent via this RDMA_SEND and another portion of the data is 497 * In the latter case, only the transport header (sge[0]) is
445 * sent via RDMA. 498 * transmitted.
446 * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC 499 *
447 * header and data are all transmitted via RDMA. 500 * RDMA Send is the last step of transmitting an RPC reply. Pages
448 * In all three cases, this function prepares the RPCRDMA header in 501 * involved in the earlier RDMA Writes are here transferred out
449 * sge[0], the 'type' parameter indicates the type to place in the 502 * of the rqstp and into the ctxt's page array. These pages are
450 * RPCRDMA header, and the 'byte_count' field indicates how much of 503 * DMA unmapped by each Write completion, but the subsequent Send
451 * the XDR to include in this RDMA_SEND. NB: The offset of the payload 504 * completion finally releases these pages.
452 * to send is zero in the XDR. 505 *
506 * Assumptions:
507 * - The Reply's transport header will never be larger than a page.
453 */ 508 */
454static int send_reply(struct svcxprt_rdma *rdma, 509static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
455 struct svc_rqst *rqstp, 510 __be32 *rdma_argp, __be32 *rdma_resp,
456 struct page *page, 511 struct svc_rqst *rqstp,
457 struct rpcrdma_msg *rdma_resp, 512 __be32 *wr_lst, __be32 *rp_ch)
458 struct svc_rdma_req_map *vec,
459 int byte_count,
460 u32 inv_rkey)
461{ 513{
462 struct svc_rdma_op_ctxt *ctxt; 514 struct svc_rdma_op_ctxt *ctxt;
463 struct ib_send_wr send_wr; 515 u32 inv_rkey;
464 u32 xdr_off; 516 int ret;
465 int sge_no; 517
466 int sge_bytes; 518 dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n",
467 int page_no; 519 (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"),
468 int pages; 520 rqstp->rq_res.head[0].iov_len,
469 int ret = -EIO; 521 rqstp->rq_res.page_len,
470 522 rqstp->rq_res.tail[0].iov_len);
471 /* Prepare the context */ 523
472 ctxt = svc_rdma_get_context(rdma); 524 ctxt = svc_rdma_get_context(rdma);
473 ctxt->direction = DMA_TO_DEVICE;
474 ctxt->pages[0] = page;
475 ctxt->count = 1;
476 525
477 /* Prepare the SGE for the RPCRDMA Header */ 526 ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
478 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; 527 svc_rdma_reply_hdr_len(rdma_resp));
479 ctxt->sge[0].length = 528 if (ret < 0)
480 svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
481 ctxt->sge[0].addr =
482 ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
483 ctxt->sge[0].length, DMA_TO_DEVICE);
484 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
485 goto err; 529 goto err;
486 svc_rdma_count_mappings(rdma, ctxt);
487
488 ctxt->direction = DMA_TO_DEVICE;
489 530
490 /* Map the payload indicated by 'byte_count' */ 531 if (!rp_ch) {
491 xdr_off = 0; 532 ret = svc_rdma_map_reply_msg(rdma, ctxt,
492 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { 533 &rqstp->rq_res, wr_lst);
493 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); 534 if (ret < 0)
494 byte_count -= sge_bytes;
495 ctxt->sge[sge_no].addr =
496 dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
497 sge_bytes, DMA_TO_DEVICE);
498 xdr_off += sge_bytes;
499 if (ib_dma_mapping_error(rdma->sc_cm_id->device,
500 ctxt->sge[sge_no].addr))
501 goto err; 535 goto err;
502 svc_rdma_count_mappings(rdma, ctxt);
503 ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
504 ctxt->sge[sge_no].length = sge_bytes;
505 } 536 }
506 if (byte_count != 0) { 537
507 pr_err("svcrdma: Could not map %d bytes\n", byte_count); 538 svc_rdma_save_io_pages(rqstp, ctxt);
539
540 inv_rkey = 0;
541 if (rdma->sc_snd_w_inv)
542 inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
543 ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey);
544 if (ret)
508 goto err; 545 goto err;
509 }
510 546
511 /* Save all respages in the ctxt and remove them from the 547 return 0;
512 * respages array. They are our pages until the I/O 548
513 * completes. 549err:
550 pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
551 svc_rdma_unmap_dma(ctxt);
552 svc_rdma_put_context(ctxt, 1);
553 return ret;
554}
555
556/* Given the client-provided Write and Reply chunks, the server was not
557 * able to form a complete reply. Return an RDMA_ERROR message so the
558 * client can retire this RPC transaction. As above, the Send completion
559 * routine releases payload pages that were part of a previous RDMA Write.
560 *
561 * Remote Invalidation is skipped for simplicity.
562 */
563static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
564 __be32 *rdma_resp, struct svc_rqst *rqstp)
565{
566 struct svc_rdma_op_ctxt *ctxt;
567 __be32 *p;
568 int ret;
569
570 ctxt = svc_rdma_get_context(rdma);
571
572 /* Replace the original transport header with an
573 * RDMA_ERROR response. XID etc are preserved.
514 */ 574 */
515 pages = rqstp->rq_next_page - rqstp->rq_respages; 575 p = rdma_resp + 3;
516 for (page_no = 0; page_no < pages; page_no++) { 576 *p++ = rdma_error;
517 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; 577 *p = err_chunk;
518 ctxt->count++;
519 rqstp->rq_respages[page_no] = NULL;
520 }
521 rqstp->rq_next_page = rqstp->rq_respages + 1;
522 578
523 if (sge_no > rdma->sc_max_sge) { 579 ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
524 pr_err("svcrdma: Too many sges (%d)\n", sge_no); 580 if (ret < 0)
525 goto err; 581 goto err;
526 }
527 memset(&send_wr, 0, sizeof send_wr);
528 ctxt->cqe.done = svc_rdma_wc_send;
529 send_wr.wr_cqe = &ctxt->cqe;
530 send_wr.sg_list = ctxt->sge;
531 send_wr.num_sge = sge_no;
532 if (inv_rkey) {
533 send_wr.opcode = IB_WR_SEND_WITH_INV;
534 send_wr.ex.invalidate_rkey = inv_rkey;
535 } else
536 send_wr.opcode = IB_WR_SEND;
537 send_wr.send_flags = IB_SEND_SIGNALED;
538 582
539 ret = svc_rdma_send(rdma, &send_wr); 583 svc_rdma_save_io_pages(rqstp, ctxt);
584
585 ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0);
540 if (ret) 586 if (ret)
541 goto err; 587 goto err;
542 588
543 return 0; 589 return 0;
544 590
545 err: 591err:
592 pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
546 svc_rdma_unmap_dma(ctxt); 593 svc_rdma_unmap_dma(ctxt);
547 svc_rdma_put_context(ctxt, 1); 594 svc_rdma_put_context(ctxt, 1);
548 return ret; 595 return ret;
@@ -552,39 +599,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
552{ 599{
553} 600}
554 601
602/**
603 * svc_rdma_sendto - Transmit an RPC reply
604 * @rqstp: processed RPC request, reply XDR already in ::rq_res
605 *
606 * Any resources still associated with @rqstp are released upon return.
607 * If no reply message was possible, the connection is closed.
608 *
609 * Returns:
610 * %0 if an RPC reply has been successfully posted,
611 * %-ENOMEM if a resource shortage occurred (connection is lost),
612 * %-ENOTCONN if posting failed (connection is lost).
613 */
555int svc_rdma_sendto(struct svc_rqst *rqstp) 614int svc_rdma_sendto(struct svc_rqst *rqstp)
556{ 615{
557 struct svc_xprt *xprt = rqstp->rq_xprt; 616 struct svc_xprt *xprt = rqstp->rq_xprt;
558 struct svcxprt_rdma *rdma = 617 struct svcxprt_rdma *rdma =
559 container_of(xprt, struct svcxprt_rdma, sc_xprt); 618 container_of(xprt, struct svcxprt_rdma, sc_xprt);
560 struct rpcrdma_msg *rdma_argp; 619 __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
561 struct rpcrdma_msg *rdma_resp; 620 struct xdr_buf *xdr = &rqstp->rq_res;
562 struct rpcrdma_write_array *wr_ary, *rp_ary;
563 int ret;
564 int inline_bytes;
565 struct page *res_page; 621 struct page *res_page;
566 struct svc_rdma_req_map *vec; 622 int ret;
567 u32 inv_rkey;
568 __be32 *p;
569
570 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
571 623
572 /* Get the RDMA request header. The receive logic always 624 /* Find the call's chunk lists to decide how to send the reply.
573 * places this at the start of page 0. 625 * Receive places the Call's xprt header at the start of page 0.
574 */ 626 */
575 rdma_argp = page_address(rqstp->rq_pages[0]); 627 rdma_argp = page_address(rqstp->rq_pages[0]);
576 svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary); 628 svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
577
578 inv_rkey = 0;
579 if (rdma->sc_snd_w_inv)
580 inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
581 629
582 /* Build an req vec for the XDR */ 630 dprintk("svcrdma: preparing response for XID 0x%08x\n",
583 vec = svc_rdma_get_req_map(rdma); 631 be32_to_cpup(rdma_argp));
584 ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
585 if (ret)
586 goto err0;
587 inline_bytes = rqstp->rq_res.len;
588 632
589 /* Create the RDMA response header. xprt->xpt_mutex, 633 /* Create the RDMA response header. xprt->xpt_mutex,
590 * acquired in svc_send(), serializes RPC replies. The 634 * acquired in svc_send(), serializes RPC replies. The
@@ -598,115 +642,57 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
598 goto err0; 642 goto err0;
599 rdma_resp = page_address(res_page); 643 rdma_resp = page_address(res_page);
600 644
601 p = &rdma_resp->rm_xid; 645 p = rdma_resp;
602 *p++ = rdma_argp->rm_xid; 646 *p++ = *rdma_argp;
603 *p++ = rdma_argp->rm_vers; 647 *p++ = *(rdma_argp + 1);
604 *p++ = rdma->sc_fc_credits; 648 *p++ = rdma->sc_fc_credits;
605 *p++ = rp_ary ? rdma_nomsg : rdma_msg; 649 *p++ = rp_ch ? rdma_nomsg : rdma_msg;
606 650
607 /* Start with empty chunks */ 651 /* Start with empty chunks */
608 *p++ = xdr_zero; 652 *p++ = xdr_zero;
609 *p++ = xdr_zero; 653 *p++ = xdr_zero;
610 *p = xdr_zero; 654 *p = xdr_zero;
611 655
612 /* Send any write-chunk data and build resp write-list */ 656 if (wr_lst) {
613 if (wr_ary) { 657 /* XXX: Presume the client sent only one Write chunk */
614 ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec); 658 ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
615 if (ret < 0) 659 if (ret < 0)
616 goto err1; 660 goto err2;
617 inline_bytes -= ret + xdr_padsize(ret); 661 svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
618 } 662 }
619 663 if (rp_ch) {
620 /* Send any reply-list data and update resp reply-list */ 664 ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
621 if (rp_ary) {
622 ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec);
623 if (ret < 0) 665 if (ret < 0)
624 goto err1; 666 goto err2;
625 inline_bytes -= ret; 667 svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
626 } 668 }
627 669
628 /* Post a fresh Receive buffer _before_ sending the reply */
629 ret = svc_rdma_post_recv(rdma, GFP_KERNEL); 670 ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
630 if (ret) 671 if (ret)
631 goto err1; 672 goto err1;
632 673 ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
633 ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, 674 wr_lst, rp_ch);
634 inline_bytes, inv_rkey);
635 if (ret < 0) 675 if (ret < 0)
636 goto err0; 676 goto err0;
677 return 0;
637 678
638 svc_rdma_put_req_map(rdma, vec); 679 err2:
639 dprintk("svcrdma: send_reply returns %d\n", ret); 680 if (ret != -E2BIG)
640 return ret; 681 goto err1;
682
683 ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
684 if (ret)
685 goto err1;
686 ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
687 if (ret < 0)
688 goto err0;
689 return 0;
641 690
642 err1: 691 err1:
643 put_page(res_page); 692 put_page(res_page);
644 err0: 693 err0:
645 svc_rdma_put_req_map(rdma, vec);
646 pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n", 694 pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
647 ret); 695 ret);
648 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); 696 set_bit(XPT_CLOSE, &xprt->xpt_flags);
649 return -ENOTCONN; 697 return -ENOTCONN;
650} 698}
651
652void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
653 int status)
654{
655 struct ib_send_wr err_wr;
656 struct page *p;
657 struct svc_rdma_op_ctxt *ctxt;
658 enum rpcrdma_errcode err;
659 __be32 *va;
660 int length;
661 int ret;
662
663 ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
664 if (ret)
665 return;
666
667 p = alloc_page(GFP_KERNEL);
668 if (!p)
669 return;
670 va = page_address(p);
671
672 /* XDR encode an error reply */
673 err = ERR_CHUNK;
674 if (status == -EPROTONOSUPPORT)
675 err = ERR_VERS;
676 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
677
678 ctxt = svc_rdma_get_context(xprt);
679 ctxt->direction = DMA_TO_DEVICE;
680 ctxt->count = 1;
681 ctxt->pages[0] = p;
682
683 /* Prepare SGE for local address */
684 ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
685 ctxt->sge[0].length = length;
686 ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
687 p, 0, length, DMA_TO_DEVICE);
688 if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
689 dprintk("svcrdma: Error mapping buffer for protocol error\n");
690 svc_rdma_put_context(ctxt, 1);
691 return;
692 }
693 svc_rdma_count_mappings(xprt, ctxt);
694
695 /* Prepare SEND WR */
696 memset(&err_wr, 0, sizeof(err_wr));
697 ctxt->cqe.done = svc_rdma_wc_send;
698 err_wr.wr_cqe = &ctxt->cqe;
699 err_wr.sg_list = ctxt->sge;
700 err_wr.num_sge = 1;
701 err_wr.opcode = IB_WR_SEND;
702 err_wr.send_flags = IB_SEND_SIGNALED;
703
704 /* Post It */
705 ret = svc_rdma_send(xprt, &err_wr);
706 if (ret) {
707 dprintk("svcrdma: Error %d posting send for protocol error\n",
708 ret);
709 svc_rdma_unmap_dma(ctxt);
710 svc_rdma_put_context(ctxt, 1);
711 }
712}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fc8f14c7bfec..a9d9cb1ba4c6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
272 } 272 }
273} 273}
274 274
275static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
276{
277 struct svc_rdma_req_map *map;
278
279 map = kmalloc(sizeof(*map), flags);
280 if (map)
281 INIT_LIST_HEAD(&map->free);
282 return map;
283}
284
285static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
286{
287 unsigned int i;
288
289 /* One for each receive buffer on this connection. */
290 i = xprt->sc_max_requests;
291
292 while (i--) {
293 struct svc_rdma_req_map *map;
294
295 map = alloc_req_map(GFP_KERNEL);
296 if (!map) {
297 dprintk("svcrdma: No memory for request map\n");
298 return false;
299 }
300 list_add(&map->free, &xprt->sc_maps);
301 }
302 return true;
303}
304
305struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
306{
307 struct svc_rdma_req_map *map = NULL;
308
309 spin_lock(&xprt->sc_map_lock);
310 if (list_empty(&xprt->sc_maps))
311 goto out_empty;
312
313 map = list_first_entry(&xprt->sc_maps,
314 struct svc_rdma_req_map, free);
315 list_del_init(&map->free);
316 spin_unlock(&xprt->sc_map_lock);
317
318out:
319 map->count = 0;
320 return map;
321
322out_empty:
323 spin_unlock(&xprt->sc_map_lock);
324
325 /* Pre-allocation amount was incorrect */
326 map = alloc_req_map(GFP_NOIO);
327 if (map)
328 goto out;
329
330 WARN_ONCE(1, "svcrdma: empty request map list?\n");
331 return NULL;
332}
333
334void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
335 struct svc_rdma_req_map *map)
336{
337 spin_lock(&xprt->sc_map_lock);
338 list_add(&map->free, &xprt->sc_maps);
339 spin_unlock(&xprt->sc_map_lock);
340}
341
342static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
343{
344 while (!list_empty(&xprt->sc_maps)) {
345 struct svc_rdma_req_map *map;
346
347 map = list_first_entry(&xprt->sc_maps,
348 struct svc_rdma_req_map, free);
349 list_del(&map->free);
350 kfree(map);
351 }
352}
353
354/* QP event handler */ 275/* QP event handler */
355static void qp_event_handler(struct ib_event *event, void *context) 276static void qp_event_handler(struct ib_event *event, void *context)
356{ 277{
@@ -474,24 +395,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
474} 395}
475 396
476/** 397/**
477 * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC
478 * @cq: completion queue
479 * @wc: completed WR
480 *
481 */
482void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc)
483{
484 struct ib_cqe *cqe = wc->wr_cqe;
485 struct svc_rdma_op_ctxt *ctxt;
486
487 svc_rdma_send_wc_common_put(cq, wc, "write");
488
489 ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
490 svc_rdma_unmap_dma(ctxt);
491 svc_rdma_put_context(ctxt, 0);
492}
493
494/**
495 * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC 398 * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC
496 * @cq: completion queue 399 * @cq: completion queue
497 * @wc: completed WR 400 * @wc: completed WR
@@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
561 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); 464 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
562 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); 465 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
563 INIT_LIST_HEAD(&cma_xprt->sc_ctxts); 466 INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
564 INIT_LIST_HEAD(&cma_xprt->sc_maps); 467 INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
565 init_waitqueue_head(&cma_xprt->sc_send_wait); 468 init_waitqueue_head(&cma_xprt->sc_send_wait);
566 469
567 spin_lock_init(&cma_xprt->sc_lock); 470 spin_lock_init(&cma_xprt->sc_lock);
568 spin_lock_init(&cma_xprt->sc_rq_dto_lock); 471 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
569 spin_lock_init(&cma_xprt->sc_frmr_q_lock); 472 spin_lock_init(&cma_xprt->sc_frmr_q_lock);
570 spin_lock_init(&cma_xprt->sc_ctxt_lock); 473 spin_lock_init(&cma_xprt->sc_ctxt_lock);
571 spin_lock_init(&cma_xprt->sc_map_lock); 474 spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
572 475
573 /* 476 /*
574 * Note that this implies that the underlying transport support 477 * Note that this implies that the underlying transport support
@@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
999 newxprt, newxprt->sc_cm_id); 902 newxprt, newxprt->sc_cm_id);
1000 903
1001 dev = newxprt->sc_cm_id->device; 904 dev = newxprt->sc_cm_id->device;
905 newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
1002 906
1003 /* Qualify the transport resource defaults with the 907 /* Qualify the transport resource defaults with the
1004 * capabilities of this particular device */ 908 * capabilities of this particular device */
@@ -1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1014 svcrdma_max_bc_requests); 918 svcrdma_max_bc_requests);
1015 newxprt->sc_rq_depth = newxprt->sc_max_requests + 919 newxprt->sc_rq_depth = newxprt->sc_max_requests +
1016 newxprt->sc_max_bc_requests; 920 newxprt->sc_max_bc_requests;
1017 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; 921 newxprt->sc_sq_depth = newxprt->sc_rq_depth;
1018 atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); 922 atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
1019 923
1020 if (!svc_rdma_prealloc_ctxts(newxprt)) 924 if (!svc_rdma_prealloc_ctxts(newxprt))
1021 goto errout; 925 goto errout;
1022 if (!svc_rdma_prealloc_maps(newxprt))
1023 goto errout;
1024 926
1025 /* 927 /*
1026 * Limit ORD based on client limit, local device limit, and 928 * Limit ORD based on client limit, local device limit, and
@@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1050 memset(&qp_attr, 0, sizeof qp_attr); 952 memset(&qp_attr, 0, sizeof qp_attr);
1051 qp_attr.event_handler = qp_event_handler; 953 qp_attr.event_handler = qp_event_handler;
1052 qp_attr.qp_context = &newxprt->sc_xprt; 954 qp_attr.qp_context = &newxprt->sc_xprt;
955 qp_attr.port_num = newxprt->sc_cm_id->port_num;
956 qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
1053 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; 957 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
1054 qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; 958 qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
1055 qp_attr.cap.max_send_sge = newxprt->sc_max_sge; 959 qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
@@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work)
1248 } 1152 }
1249 1153
1250 rdma_dealloc_frmr_q(rdma); 1154 rdma_dealloc_frmr_q(rdma);
1155 svc_rdma_destroy_rw_ctxts(rdma);
1251 svc_rdma_destroy_ctxts(rdma); 1156 svc_rdma_destroy_ctxts(rdma);
1252 svc_rdma_destroy_maps(rdma);
1253 1157
1254 /* Destroy the QP if present (not a listener) */ 1158 /* Destroy the QP if present (not a listener) */
1255 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) 1159 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))