-rw-r--r--  fs/nfs/callback_proc.c                     |    9
-rw-r--r--  fs/nfs/callback_xdr.c                      |   17
-rw-r--r--  fs/nfs/delegation.c                        |    9
-rw-r--r--  fs/nfs/delegation.h                        |    2
-rw-r--r--  fs/nfs/direct.c                            |   17
-rw-r--r--  fs/nfs/filelayout/filelayout.c             |    6
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c     |  200
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h     |   17
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c  |  119
-rw-r--r--  fs/nfs/internal.h                          |    1
-rw-r--r--  fs/nfs/nfs42.h                             |    1
-rw-r--r--  fs/nfs/nfs42proc.c                         |  107
-rw-r--r--  fs/nfs/nfs42xdr.c                          |  146
-rw-r--r--  fs/nfs/nfs4_fs.h                           |   12
-rw-r--r--  fs/nfs/nfs4file.c                          |   23
-rw-r--r--  fs/nfs/nfs4proc.c                          |  185
-rw-r--r--  fs/nfs/nfs4state.c                         |   18
-rw-r--r--  fs/nfs/nfs4trace.h                         |   10
-rw-r--r--  fs/nfs/nfs4xdr.c                           |   43
-rw-r--r--  fs/nfs/pagelist.c                          |    6
-rw-r--r--  fs/nfs/pnfs.c                              |  349
-rw-r--r--  fs/nfs/pnfs.h                              |   17
-rw-r--r--  fs/nfs/pnfs_nfs.c                          |   60
-rw-r--r--  fs/nfs/super.c                             |    9
-rw-r--r--  fs/nfs/write.c                             |   64
-rw-r--r--  include/linux/errno.h                      |    1
-rw-r--r--  include/linux/nfs4.h                       |   28
-rw-r--r--  include/linux/nfs_fs_sb.h                  |    1
-rw-r--r--  include/linux/nfs_xdr.h                    |   30
-rw-r--r--  include/linux/sunrpc/auth.h                |   26
-rw-r--r--  include/linux/sunrpc/clnt.h                |    1
-rw-r--r--  include/linux/sunrpc/msg_prot.h            |    4
-rw-r--r--  include/linux/sunrpc/xprt.h                |    1
-rw-r--r--  include/linux/sunrpc/xprtrdma.h            |    4
-rw-r--r--  net/sunrpc/auth.c                          |    9
-rw-r--r--  net/sunrpc/auth_generic.c                  |   13
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c             |    6
-rw-r--r--  net/sunrpc/auth_unix.c                     |    6
-rw-r--r--  net/sunrpc/clnt.c                          |   17
-rw-r--r--  net/sunrpc/xdr.c                           |    2
-rw-r--r--  net/sunrpc/xprtrdma/backchannel.c          |   16
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c              |  134
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c             |  214
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c         |   39
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c             |  517
-rw-r--r--  net/sunrpc/xprtrdma/transport.c            |   16
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c                |   78
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h            |   47
-rw-r--r--  net/sunrpc/xprtsock.c                      |    6
49 files changed, 1764 insertions(+), 899 deletions(-)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 618ced381a14..aaa2e8d3df6f 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -217,7 +217,8 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	}
 
 	if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
-				&args->cbl_range)) {
+				&args->cbl_range,
+				be32_to_cpu(args->cbl_stateid.seqid))) {
 		rv = NFS4_OK;
 		goto unlock;
 	}
@@ -500,8 +501,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	cps->slot = slot;
 
 	/* The ca_maxresponsesize_cached is 0 with no DRC */
-	if (args->csa_cachethis != 0)
-		return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+	if (args->csa_cachethis != 0) {
+		status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+		goto out_unlock;
+	}
 
 	/*
 	 * Check for pending referring calls. If a match is found, a
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 976c90608e56..d81f96aacd51 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 	p = read_buf(xdr, NFS4_STATEID_SIZE);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
-	memcpy(stateid, p, NFS4_STATEID_SIZE);
+	memcpy(stateid->data, p, NFS4_STATEID_SIZE);
 	return 0;
 }
 
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
+
 static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
 {
 	__be32 *p;
@@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	__be32 *p;
 	__be32 status;
 
-	status = decode_stateid(xdr, &args->stateid);
+	status = decode_delegation_stateid(xdr, &args->stateid);
 	if (unlikely(status != 0))
 		goto out;
 	p = read_buf(xdr, 4);
@@ -227,6 +233,11 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+	return decode_stateid(xdr, stateid);
+}
 
 static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
 				       struct xdr_stream *xdr,
@@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
 		}
 		p = xdr_decode_hyper(p, &args->cbl_range.offset);
 		p = xdr_decode_hyper(p, &args->cbl_range.length);
-		status = decode_stateid(xdr, &args->cbl_stateid);
+		status = decode_layout_stateid(xdr, &args->cbl_stateid);
 		if (unlikely(status != 0))
 			goto out;
 	} else if (args->cbl_recall_type == RETURN_FSID) {
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5166adcfc0fb..322c2585bc34 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp)
 
 /**
  * nfs4_copy_delegation_stateid - Copy inode's state ID information
- * @dst: stateid data structure to fill in
  * @inode: inode to check
  * @flags: delegation type requirement
+ * @dst: stateid data structure to fill in
+ * @cred: optional argument to retrieve credential
  *
  * Returns "true" and fills in "dst->data" * if inode had a delegation,
  * otherwise "false" is returned.
  */
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
-		fmode_t flags)
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
+		nfs4_stateid *dst, struct rpc_cred **cred)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
@@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
 	if (ret) {
 		nfs4_stateid_copy(dst, &delegation->stateid);
 		nfs_mark_delegation_referenced(delegation);
+		if (cred)
+			*cred = get_rpccred(delegation->cred);
 	}
 	rcu_read_unlock();
 	return ret;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 333063e032f0..64724d252a79 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
 int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 741a92c470bb..979b3c4dee6a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -87,6 +87,7 @@ struct nfs_direct_req {
 	int			mirror_count;
 
 	ssize_t			count,		/* bytes actually processed */
+				max_count,	/* max expected count */
 				bytes_left,	/* bytes left to be sent */
 				io_start,	/* start of IO */
 				error;		/* any reported error */
@@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
 	int i;
 	ssize_t count;
 
+	WARN_ON_ONCE(dreq->count >= dreq->max_count);
+
 	if (dreq->mirror_count == 1) {
 		dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
 		dreq->count += hdr->good_bytes;
@@ -275,7 +278,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 			      struct nfs_direct_req *dreq)
 {
-	cinfo->lock = &dreq->inode->i_lock;
+	cinfo->inode = dreq->inode;
 	cinfo->mds = &dreq->mds_cinfo;
 	cinfo->ds = &dreq->ds_cinfo;
 	cinfo->dreq = dreq;
@@ -591,7 +594,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 		goto out_unlock;
 
 	dreq->inode = inode;
-	dreq->bytes_left = count;
+	dreq->bytes_left = dreq->max_count = count;
 	dreq->io_start = iocb->ki_pos;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -630,13 +633,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
 				  struct list_head *list,
 				  struct nfs_commit_info *cinfo)
 {
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 #ifdef CONFIG_NFS_V4_1
 	if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
 		NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 #endif
 	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 }
 
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
@@ -671,13 +674,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 		if (!nfs_pageio_add_request(&desc, req)) {
 			nfs_list_remove_request(req);
 			nfs_list_add_request(req, &failed);
-			spin_lock(cinfo.lock);
+			spin_lock(&cinfo.inode->i_lock);
 			dreq->flags = 0;
 			if (desc.pg_error < 0)
 				dreq->error = desc.pg_error;
 			else
 				dreq->error = -EIO;
-			spin_unlock(cinfo.lock);
+			spin_unlock(&cinfo.inode->i_lock);
 		}
 		nfs_release_request(req);
 	}
@@ -1023,7 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 		goto out_unlock;
 
 	dreq->inode = inode;
-	dreq->bytes_left = iov_iter_count(iter);
+	dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
 	dreq->io_start = pos;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	l_ctx = nfs_get_lock_context(dreq->ctx);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3384dc8e6683..aa59757389dc 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 		buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
 	}
 
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	if (cinfo->ds->nbuckets >= size)
 		goto out;
 	for (i = 0; i < cinfo->ds->nbuckets; i++) {
@@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 	swap(cinfo->ds->buckets, buckets);
 	cinfo->ds->nbuckets = size;
 out:
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 	kfree(buckets);
 	return 0;
 }
@@ -890,6 +890,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 					   0,
 					   NFS4_MAX_UINT64,
 					   IOMODE_READ,
+					   false,
 					   GFP_KERNEL);
 	if (IS_ERR(pgio->pg_lseg)) {
 		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -915,6 +916,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 					   0,
 					   NFS4_MAX_UINT64,
 					   IOMODE_RW,
+					   false,
 					   GFP_NOFS);
 	if (IS_ERR(pgio->pg_lseg)) {
 		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0cb1abd535e3..0e8018bc9880 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -26,6 +26,8 @@
 
 #define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
 
+static struct group_info	*ff_zero_group;
+
 static struct pnfs_layout_hdr *
 ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 {
@@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 	kfree(FF_LAYOUT_FROM_HDR(lo));
 }
 
-static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 {
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
 	if (unlikely(p == NULL))
 		return -ENOBUFS;
-	memcpy(stateid, p, NFS4_STATEID_SIZE);
+	stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+	memcpy(stateid->data, p, NFS4_STATEID_SIZE);
 	dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
 		p[0], p[1], p[2], p[3]);
 	return 0;
@@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 
 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
+	struct rpc_cred	*cred;
+
 	ff_layout_remove_mirror(mirror);
 	kfree(mirror->fh_versions);
-	if (mirror->cred)
-		put_rpccred(mirror->cred);
+	cred = rcu_access_pointer(mirror->ro_cred);
+	if (cred)
+		put_rpccred(cred);
+	cred = rcu_access_pointer(mirror->rw_cred);
+	if (cred)
+		put_rpccred(cred);
 	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
 	kfree(mirror);
 }
@@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
 {
 	u64 new_end, old_end;
 
+	if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+		return false;
 	if (new->pls_range.iomode != old->pls_range.iomode)
 		return false;
 	old_end = pnfs_calc_offset_end(old->pls_range.offset,
@@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
 			new_end);
 	if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
 		set_bit(NFS_LSEG_ROC, &new->pls_flags);
-	if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
-		set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
 	return true;
 }
 
@@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_deviceid devid;
 	struct nfs4_deviceid_node *idnode;
-	u32 ds_count;
-	u32 fh_count;
+	struct auth_cred acred = { .group_info = ff_zero_group };
+	struct rpc_cred	__rcu *cred;
+	u32 ds_count, fh_count, id;
 	int j;
 
 	rc = -EIO;
@@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		fls->mirror_array[i]->efficiency = be32_to_cpup(p);
 
 		/* stateid */
-		rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+		rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
 		if (rc)
 			goto out_err_free;
 
@@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		fls->mirror_array[i]->fh_versions_cnt = fh_count;
 
 		/* user */
-		rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+		rc = decode_name(&stream, &id);
 		if (rc)
 			goto out_err_free;
 
+		acred.uid = make_kuid(&init_user_ns, id);
+
 		/* group */
-		rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+		rc = decode_name(&stream, &id);
 		if (rc)
 			goto out_err_free;
 
+		acred.gid = make_kgid(&init_user_ns, id);
+
+		/* find the cred for it */
+		rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags));
+		if (IS_ERR(cred)) {
+			rc = PTR_ERR(cred);
+			goto out_err_free;
+		}
+
+		if (lgr->range.iomode == IOMODE_READ)
+			rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+		else
+			rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+
 		mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
 		if (mirror != fls->mirror_array[i]) {
+			/* swap cred ptrs so free_mirror will clean up old */
+			if (lgr->range.iomode == IOMODE_READ) {
+				cred = xchg(&mirror->ro_cred, cred);
+				rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+			} else {
+				cred = xchg(&mirror->rw_cred, cred);
+				rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+			}
 			ff_layout_free_mirror(fls->mirror_array[i]);
 			fls->mirror_array[i] = mirror;
 		}
 
-		dprintk("%s: uid %d gid %d\n", __func__,
-			fls->mirror_array[i]->uid,
-			fls->mirror_array[i]->gid);
+		dprintk("%s: iomode %s uid %u gid %u\n", __func__,
+			lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
+			from_kuid(&init_user_ns, acred.uid),
+			from_kgid(&init_user_ns, acred.gid));
 	}
 
 	p = xdr_inline_decode(&stream, 4);
@@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 	else {
 		int i;
 
-		spin_lock(cinfo->lock);
+		spin_lock(&cinfo->inode->i_lock);
 		if (cinfo->ds->nbuckets != 0)
 			kfree(buckets);
 		else {
@@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 					NFS_INVALID_STABLE_HOW;
 			}
 		}
-		spin_unlock(cinfo->lock);
+		spin_unlock(&cinfo->inode->i_lock);
 		return 0;
 	}
 }
@@ -786,6 +821,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
 }
 
 static void
+ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
+		      struct nfs_page *req,
+		      bool strict_iomode)
+{
+retry_strict:
+	pnfs_put_lseg(pgio->pg_lseg);
+	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+					   req->wb_context,
+					   0,
+					   NFS4_MAX_UINT64,
+					   IOMODE_READ,
+					   strict_iomode,
+					   GFP_KERNEL);
+	if (IS_ERR(pgio->pg_lseg)) {
+		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+		pgio->pg_lseg = NULL;
+	}
+
+	/* If we don't have checking, do get a IOMODE_RW
+	 * segment, and the server wants to avoid READs
+	 * there, then retry!
+	 */
+	if (pgio->pg_lseg && !strict_iomode &&
+	    ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
+		strict_iomode = true;
+		goto retry_strict;
+	}
+}
+
+static void
 ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *req)
 {
@@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 	int ds_idx;
 
 	/* Use full layout for now */
-	if (!pgio->pg_lseg) {
-		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   req->wb_context,
-						   0,
-						   NFS4_MAX_UINT64,
-						   IOMODE_READ,
-						   GFP_KERNEL);
-		if (IS_ERR(pgio->pg_lseg)) {
-			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
-			pgio->pg_lseg = NULL;
-			return;
-		}
-	}
+	if (!pgio->pg_lseg)
+		ff_layout_pg_get_read(pgio, req, false);
+	else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg))
+		ff_layout_pg_get_read(pgio, req, true);
+
 	/* If no lseg, fall back to read through mds */
 	if (pgio->pg_lseg == NULL)
 		goto out_mds;
 
 	ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
-	if (!ds)
-		goto out_mds;
+	if (!ds) {
+		if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+			goto out_pnfs;
+		else
+			goto out_mds;
+	}
+
 	mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
 
 	pgio->pg_mirror_idx = ds_idx;
@@ -828,6 +890,12 @@ out_mds:
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_read_mds(pgio);
+	return;
+
+out_pnfs:
+	pnfs_set_lo_fail(pgio->pg_lseg);
+	pnfs_put_lseg(pgio->pg_lseg);
+	pgio->pg_lseg = NULL;
 }
 
 static void
@@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 						   0,
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
+						   false,
 						   GFP_NOFS);
 	if (IS_ERR(pgio->pg_lseg)) {
 		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 
 	for (i = 0; i < pgio->pg_mirror_count; i++) {
 		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
-		if (!ds)
-			goto out_mds;
+		if (!ds) {
+			if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+				goto out_pnfs;
+			else
+				goto out_mds;
+		}
 		pgm = &pgio->pg_mirrors[i];
 		mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
 		pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
@@ -883,6 +956,12 @@ out_mds:
 	pnfs_put_lseg(pgio->pg_lseg);
 	pgio->pg_lseg = NULL;
 	nfs_pageio_reset_write_mds(pgio);
+	return;
+
+out_pnfs:
+	pnfs_set_lo_fail(pgio->pg_lseg);
+	pnfs_put_lseg(pgio->pg_lseg);
+	pgio->pg_lseg = NULL;
 }
 
 static unsigned int
@@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 						   0,
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
+						   false,
 						   GFP_NOFS);
 	if (IS_ERR(pgio->pg_lseg)) {
 		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 		rpc_wake_up(&tbl->slot_tbl_waitq);
 		/* fall through */
 	default:
-		if (ff_layout_no_fallback_to_mds(lseg) ||
-		    ff_layout_has_available_ds(lseg))
+		if (ff_layout_avoid_mds_available_ds(lseg))
 			return -NFS4ERR_RESET_TO_PNFS;
 reset:
 		dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 					hdr->pgio_mirror_idx + 1,
 					&hdr->pgio_mirror_idx))
 			goto out_eagain;
-		set_bit(NFS_LAYOUT_RETURN_REQUESTED,
-			&hdr->lseg->pls_layout->plh_flags);
 		pnfs_read_resend_pnfs(hdr);
 		return task->tk_status;
 	case -NFS4ERR_RESET_TO_MDS:
@@ -1260,7 +1337,7 @@ ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 }
 
 static bool
-ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
 {
 	/* No mirroring for now */
 	struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
 		rpc_exit(task, -EIO);
 		return -EIO;
 	}
-	if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
-		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-		if (ff_layout_has_available_ds(hdr->lseg))
-			pnfs_read_resend_pnfs(hdr);
-		else
-			ff_layout_reset_read(hdr);
-		rpc_exit(task, 0);
+	if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+		rpc_exit(task, -EHOSTDOWN);
 		return -EAGAIN;
 	}
-	hdr->pgio_done_cb = ff_layout_read_done_cb;
 
 	ff_layout_read_record_layoutstats_start(task, hdr);
 	return 0;
@@ -1496,14 +1567,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
 		return -EIO;
 	}
 
-	if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
-		bool retry_pnfs;
-
-		retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
-		dprintk("%s task %u reset io to %s\n", __func__,
-			task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
-		ff_layout_reset_write(hdr, retry_pnfs);
-		rpc_exit(task, 0);
+	if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+		rpc_exit(task, -EHOSTDOWN);
 		return -EAGAIN;
 	}
 
@@ -1712,7 +1777,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 		goto out_failed;
 
 	ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
-	if (IS_ERR(ds_cred))
+	if (!ds_cred)
 		goto out_failed;
 
 	vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1720,6 +1785,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 	dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
 		ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
 
+	hdr->pgio_done_cb = ff_layout_read_done_cb;
 	atomic_inc(&ds->ds_clp->cl_count);
 	hdr->ds_clp = ds->ds_clp;
 	fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
@@ -1737,11 +1803,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 			  vers == 3 ? &ff_layout_read_call_ops_v3 :
 				      &ff_layout_read_call_ops_v4,
 			  0, RPC_TASK_SOFTCONN);
-
+	put_rpccred(ds_cred);
 	return PNFS_ATTEMPTED;
 
 out_failed:
-	if (ff_layout_has_available_ds(lseg))
+	if (ff_layout_avoid_mds_available_ds(lseg))
 		return PNFS_TRY_AGAIN;
 	return PNFS_NOT_ATTEMPTED;
 }
@@ -1769,7 +1835,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 		return PNFS_NOT_ATTEMPTED;
 
 	ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
-	if (IS_ERR(ds_cred))
+	if (!ds_cred)
 		return PNFS_NOT_ATTEMPTED;
 
 	vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1798,6 +1864,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 			  vers == 3 ? &ff_layout_write_call_ops_v3 :
 				      &ff_layout_write_call_ops_v4,
 			  sync, RPC_TASK_SOFTCONN);
+	put_rpccred(ds_cred);
 	return PNFS_ATTEMPTED;
 }
 
@@ -1824,7 +1891,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
 	struct rpc_clnt *ds_clnt;
 	struct rpc_cred *ds_cred;
 	u32 idx;
-	int vers;
+	int vers, ret;
 	struct nfs_fh *fh;
 
 	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
@@ -1838,7 +1905,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
 		goto out_err;
 
 	ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
-	if (IS_ERR(ds_cred))
+	if (!ds_cred)
 		goto out_err;
 
 	vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1854,10 +1921,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
 	if (fh)
 		data->args.fh = fh;
 
-	return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+	ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
 				   vers == 3 ? &ff_layout_commit_call_ops_v3 :
 					       &ff_layout_commit_call_ops_v4,
 				   how, RPC_TASK_SOFTCONN);
+	put_rpccred(ds_cred);
+	return ret;
 out_err:
 	pnfs_generic_prepare_to_resend_writes(data);
 	pnfs_generic_commit_release(data);
@@ -2223,6 +2292,11 @@ static int __init nfs4flexfilelayout_init(void)
 {
 	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
 	       __func__);
+	if (!ff_zero_group) {
+		ff_zero_group = groups_alloc(0);
+		if (!ff_zero_group)
+			return -ENOMEM;
+	}
 	return pnfs_register_layoutdriver(&flexfilelayout_type);
 }
 
@@ -2231,6 +2305,10 @@ static void __exit nfs4flexfilelayout_exit(void)
 	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
 	       __func__);
 	pnfs_unregister_layoutdriver(&flexfilelayout_type);
+	if (ff_zero_group) {
+		put_group_info(ff_zero_group);
+		ff_zero_group = NULL;
+	}
 }
 
 MODULE_ALIAS("nfs-layouttype4-4");
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index dd353bb7dc0a..1bcdb15d0c41 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,7 +10,8 @@
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
 #define FF_FLAGS_NO_LAYOUTCOMMIT 1
 #define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
 
 #include "../pnfs.h"
 
@@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror {
 	u32				fh_versions_cnt;
 	struct nfs_fh			*fh_versions;
 	nfs4_stateid			stateid;
-	u32				uid;
-	u32				gid;
-	struct rpc_cred			*cred;
+	struct rpc_cred	__rcu		*ro_cred;
+	struct rpc_cred	__rcu		*rw_cred;
 	atomic_t			ref;
 	spinlock_t			lock;
 	struct nfs4_ff_layoutstat	read_stat;
@@ -154,6 +154,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
 }
 
 static inline bool
+ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+	return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
+}
+
+static inline bool
 ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
 {
 	return nfs4_test_deviceid_unavailable(node);
@@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
 struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
 				       u32 ds_idx, struct rpc_cred *mdscred);
 bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
+
 #endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index add0e5a70bd6..0aa36be71fce 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
 		return e1->opnum < e2->opnum ? -1 : 1;
 	if (e1->status != e2->status)
 		return e1->status < e2->status ? -1 : 1;
-	ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+	ret = memcmp(e1->stateid.data, e2->stateid.data,
+		     sizeof(e1->stateid.data));
 	if (ret != 0)
 		return ret;
 	ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
@@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
 	return 0;
 }
 
-/* currently we only support AUTH_NONE and AUTH_SYS */
-static rpc_authflavor_t
-nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
+static struct rpc_cred *
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
 {
-	if (mirror->uid == (u32)-1)
-		return RPC_AUTH_NULL;
-	return RPC_AUTH_UNIX;
-}
+	struct rpc_cred *cred, __rcu **pcred;
 
-/* fetch cred for NFSv3 DS */
-static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
-					struct nfs4_pnfs_ds *ds)
-{
-	if (ds->ds_clp && !mirror->cred &&
-	    mirror->mirror_ds->ds_versions[0].version == 3) {
-		struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
-		struct rpc_cred *cred;
-		struct auth_cred acred = {
-			.uid = make_kuid(&init_user_ns, mirror->uid),
-			.gid = make_kgid(&init_user_ns, mirror->gid),
-		};
-
-		/* AUTH_NULL ignores acred */
-		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
-		if (IS_ERR(cred)) {
-			dprintk("%s: lookup_cred failed with %ld\n",
-				__func__, PTR_ERR(cred));
-			return PTR_ERR(cred);
-		} else {
-			if (cmpxchg(&mirror->cred, NULL, cred))
-				put_rpccred(cred);
-		}
-	}
-	return 0;
+	if (iomode == IOMODE_READ)
+		pcred = &mirror->ro_cred;
+	else
+		pcred = &mirror->rw_cred;
+
+	rcu_read_lock();
+	do {
+		cred = rcu_dereference(*pcred);
+		if (!cred)
+			break;
+
+		cred = get_rpccred_rcu(cred);
+	} while(!cred);
+	rcu_read_unlock();
+	return cred;
 }
 
 struct nfs_fh *
@@ -356,7 +343,23 @@ out:
 	return fh;
 }
 
-/* Upon return, either ds is connected, or ds is NULL */
+/**
+ * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
+ * @lseg: the layout segment we're operating on
+ * @ds_idx: index of the DS to use
+ * @fail_return: return layout on connect failure?
+ *
+ * Try to prepare a DS connection to accept an RPC call. This involves
+ * selecting a mirror to use and connecting the client to it if it's not
+ * already connected.
+ *
+ * Since we only need a single functioning mirror to satisfy a read, we don't
+ * want to return the layout if there is one. For writes though, any down
+ * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
+ * between the two cases.
+ *
+ * Returns a pointer to a connected DS object on success or NULL on failure.
+ */
 struct nfs4_pnfs_ds *
 nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 			  bool fail_return)
@@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 	struct inode *ino = lseg->pls_layout->plh_inode;
 	struct nfs_server *s = NFS_SERVER(ino);
 	unsigned int max_payload;
-	rpc_authflavor_t flavor;
 
 	if (!ff_layout_mirror_valid(lseg, mirror)) {
 		pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
@@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
 	smp_rmb();
 	if (ds->ds_clp)
-		goto out_update_creds;
-
-	flavor = nfs4_ff_layout_choose_authflavor(mirror);
+		goto out;
 
 	/* FIXME: For now we assume the server sent only one version of NFS
 	 * to use for the DS.
@@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 			     dataserver_retrans,
 			     mirror->mirror_ds->ds_versions[0].version,
 			     mirror->mirror_ds->ds_versions[0].minor_version,
-			     flavor);
+			     RPC_AUTH_UNIX);
 
 	/* connect success, check rsize/wsize limit */
 	if (ds->ds_clp) {
@@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 				 mirror, lseg->pls_range.offset,
 				 lseg->pls_range.length, NFS4ERR_NXIO,
 				 OP_ILLEGAL, GFP_NOIO);
-		if (!fail_return) {
-			if (ff_layout_has_available_ds(lseg))
-				set_bit(NFS_LAYOUT_RETURN_REQUESTED,
-					&lseg->pls_layout->plh_flags);
-			else
-				pnfs_error_mark_layout_for_return(ino, lseg);
-		} else
+		if (fail_return || !ff_layout_has_available_ds(lseg))
 			pnfs_error_mark_layout_for_return(ino, lseg);
 		ds = NULL;
-		goto out;
 	}
-out_update_creds:
-	if (ff_layout_update_mirror_cred(mirror, ds))
-		ds = NULL;
 out:
 	return ds;
 }
@@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
 		      struct rpc_cred *mdscred)
 {
 	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-	struct rpc_cred *cred = ERR_PTR(-EINVAL);
-
-	if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
-		goto out;
+	struct rpc_cred *cred;
 
-	if (mirror && mirror->cred)
-		cred = mirror->cred;
-	else
-		cred = mdscred;
-out:
+	if (mirror) {
+		cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+		if (!cred)
+			cred = get_rpccred(mdscred);
+	} else {
+		cred = get_rpccred(mdscred);
+	}
 	return cred;
 }
 
@@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
 	return ff_rw_layout_has_available_ds(lseg);
 }
 
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
+{
+	return ff_layout_no_fallback_to_mds(lseg) ||
+	       ff_layout_has_available_ds(lseg);
+}
+
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+	return lseg->pls_range.iomode == IOMODE_RW &&
+	       ff_layout_no_read_on_rw(lseg);
+}
+
 module_param(dataserver_retrans, uint, 0644);
 MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
 			"retries a request before it attempts further "
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f1d1d2c472e9..5154fa65a2f2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
 			    u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
 void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b587ccd31083..b6cd15314bab 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -13,6 +13,7 @@
 
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index dff83460e5a6..aa03ed09ba06 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	return err;
 }
 
+static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
+				struct nfs_lock_context *src_lock,
+				struct file *dst, loff_t pos_dst,
+				struct nfs_lock_context *dst_lock,
+				size_t count)
+{
+	struct nfs42_copy_args args = {
+		.src_fh		= NFS_FH(file_inode(src)),
+		.src_pos	= pos_src,
+		.dst_fh		= NFS_FH(file_inode(dst)),
+		.dst_pos	= pos_dst,
+		.count		= count,
+	};
+	struct nfs42_copy_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct inode *dst_inode = file_inode(dst);
+	struct nfs_server *server = NFS_SERVER(dst_inode);
+	int status;
+
+	status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+				     src_lock, FMODE_READ);
+	if (status)
+		return status;
+
+	status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+				     dst_lock, FMODE_WRITE);
+	if (status)
+		return status;
+
+	status = nfs4_call_sync(server->client, server, &msg,
+				&args.seq_args, &res.seq_res, 0);
+	if (status == -ENOTSUPP)
+		server->caps &= ~NFS_CAP_COPY;
+	if (status)
+		return status;
+
+	if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
+		status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
+		if (status)
+			return status;
+	}
+
+	truncate_pagecache_range(dst_inode, pos_dst,
+				 pos_dst + res.write_res.count);
+
+	return res.write_res.count;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+			struct file *dst, loff_t pos_dst,
+			size_t count)
+{
+	struct nfs_server *server = NFS_SERVER(file_inode(dst));
+	struct nfs_lock_context *src_lock;
+	struct nfs_lock_context *dst_lock;
+	struct nfs4_exception src_exception = { };
+	struct nfs4_exception dst_exception = { };
+	ssize_t err, err2;
+
+	if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
+		return -EOPNOTSUPP;
+
+	src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+	if (IS_ERR(src_lock))
+		return PTR_ERR(src_lock);
+
+	src_exception.inode = file_inode(src);
+	src_exception.state = src_lock->open_context->state;
+
+	dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+	if (IS_ERR(dst_lock)) {
+		err = PTR_ERR(dst_lock);
+		goto out_put_src_lock;
+	}
+
+	dst_exception.inode = file_inode(dst);
+	dst_exception.state = dst_lock->open_context->state;
+
+	do {
+		inode_lock(file_inode(dst));
+		err = _nfs42_proc_copy(src, pos_src, src_lock,
+				       dst, pos_dst, dst_lock, count);
+		inode_unlock(file_inode(dst));
+
+		if (err == -ENOTSUPP) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		err2 = nfs4_handle_exception(server, err, &src_exception);
+		err = nfs4_handle_exception(server, err, &dst_exception);
+		if (!err)
+			err = err2;
+	} while (src_exception.retry || dst_exception.retry);
+
+	nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+	nfs_put_lock_context(src_lock);
+	return err;
+}
+
 static loff_t _nfs42_proc_llseek(struct file *filep,
 		struct nfs_lock_context *lock, loff_t offset, int whence)
 {
@@ -232,7 +337,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 		 * with the current stateid.
		 */
		set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-		pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+		pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg_list(&head);
	} else
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0ca482a51e53..6dc6f2aea0d6 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -9,9 +9,22 @@
 #define encode_fallocate_maxsz		(encode_stateid_maxsz + \
					 2 /* offset */ + \
					 2 /* length */)
+#define NFS42_WRITE_RES_SIZE		(1 /* wr_callback_id size */ +\
+					 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+					 2 /* wr_count */ + \
+					 1 /* wr_committed */ + \
+					 XDR_QUADLEN(NFS4_VERIFIER_SIZE))
 #define encode_allocate_maxsz		(op_encode_hdr_maxsz + \
					 encode_fallocate_maxsz)
 #define decode_allocate_maxsz		(op_decode_hdr_maxsz)
+#define encode_copy_maxsz		(op_encode_hdr_maxsz + \
+					 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+					 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+					 2 + 2 + 2 + 1 + 1 + 1)
+#define decode_copy_maxsz		(op_decode_hdr_maxsz + \
+					 NFS42_WRITE_RES_SIZE + \
+					 1 /* cr_consecutive */ + \
+					 1 /* cr_synchronous */)
 #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
					 encode_fallocate_maxsz)
 #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
@@ -49,6 +62,16 @@
					 decode_putfh_maxsz + \
					 decode_allocate_maxsz + \
					 decode_getattr_maxsz)
+#define NFS4_enc_copy_sz		(compound_encode_hdr_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_savefh_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_copy_maxsz)
+#define NFS4_dec_copy_sz		(compound_decode_hdr_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_savefh_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_copy_maxsz)
 #define NFS4_enc_deallocate_sz		(compound_encode_hdr_maxsz + \
					 encode_putfh_maxsz + \
					 encode_deallocate_maxsz + \
@@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr,
 	encode_fallocate(xdr, args);
 }
 
+static void encode_copy(struct xdr_stream *xdr,
+			struct nfs42_copy_args *args,
+			struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->src_stateid);
+	encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+	encode_uint64(xdr, args->src_pos);
+	encode_uint64(xdr, args->dst_pos);
+	encode_uint64(xdr, args->count);
+
+	encode_uint32(xdr, 1); /* consecutive = true */
+	encode_uint32(xdr, 1); /* synchronous = true */
+	encode_uint32(xdr, 0); /* src server list */
+}
+
 static void encode_deallocate(struct xdr_stream *xdr,
			      struct nfs42_falloc_args *args,
			      struct compound_hdr *hdr)
@@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
182} 222}
183 223
184/* 224/*
225 * Encode COPY request
226 */
227static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
228 struct xdr_stream *xdr,
229 struct nfs42_copy_args *args)
230{
231 struct compound_hdr hdr = {
232 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
233 };
234
235 encode_compound_hdr(xdr, req, &hdr);
236 encode_sequence(xdr, &args->seq_args, &hdr);
237 encode_putfh(xdr, args->src_fh, &hdr);
238 encode_savefh(xdr, &hdr);
239 encode_putfh(xdr, args->dst_fh, &hdr);
240 encode_copy(xdr, args, &hdr);
241 encode_nops(&hdr);
242}
243
244/*
185 * Encode DEALLOCATE request 245 * Encode DEALLOCATE request
186 */ 246 */
187static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, 247static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
@@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
266 return decode_op_hdr(xdr, OP_ALLOCATE); 326 return decode_op_hdr(xdr, OP_ALLOCATE);
267} 327}
268 328
329static int decode_write_response(struct xdr_stream *xdr,
330 struct nfs42_write_res *res)
331{
332 __be32 *p;
333 int stateids;
334
335 p = xdr_inline_decode(xdr, 4 + 8 + 4);
336 if (unlikely(!p))
337 goto out_overflow;
338
339 stateids = be32_to_cpup(p++);
340 p = xdr_decode_hyper(p, &res->count);
341 res->verifier.committed = be32_to_cpup(p);
342 return decode_verifier(xdr, &res->verifier.verifier);
343
344out_overflow:
345 print_overflow_msg(__func__, xdr);
346 return -EIO;
347}
348
349static int decode_copy_requirements(struct xdr_stream *xdr,
350 struct nfs42_copy_res *res) {
351 __be32 *p;
352
353 p = xdr_inline_decode(xdr, 4 + 4);
354 if (unlikely(!p))
355 goto out_overflow;
356
357 res->consecutive = be32_to_cpup(p++);
358 res->synchronous = be32_to_cpup(p++);
359 return 0;
360out_overflow:
361 print_overflow_msg(__func__, xdr);
362 return -EIO;
363}
364
365static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
366{
367 int status;
368
369 status = decode_op_hdr(xdr, OP_COPY);
370 if (status == NFS4ERR_OFFLOAD_NO_REQS) {
371 status = decode_copy_requirements(xdr, res);
372 if (status)
373 return status;
374 return NFS4ERR_OFFLOAD_NO_REQS;
375 } else if (status)
376 return status;
377
378 status = decode_write_response(xdr, &res->write_res);
379 if (status)
380 return status;
381
382 return decode_copy_requirements(xdr, res);
383}
384
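The NFS4ERR_OFFLOAD_NO_REQS branch covers the case where the server cannot honour the requested consecutive/synchronous combination: per RFC 7862 it replies with the settings it could have accepted instead of a write response, so decode_copy() fills in res->consecutive and res->synchronous and propagates the status for the proc layer to act on. A hedged caller-side sketch (the reaction shown is illustrative, not necessarily what nfs42proc.c does):

    /* illustrative shape of a caller reacting to the decoded result */
    status = decode_copy(xdr, res);
    if (status == NFS4ERR_OFFLOAD_NO_REQS) {
            /* res->consecutive / res->synchronous now describe what the
             * server is willing to do; retry with relaxed requirements
             * or fall back to an ordinary read/write copy */
    }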
269static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) 385static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
270{ 386{
271 return decode_op_hdr(xdr, OP_DEALLOCATE); 387 return decode_op_hdr(xdr, OP_DEALLOCATE);
@@ -331,6 +447,36 @@ out:
331} 447}
332 448
333/* 449/*
450 * Decode COPY response
451 */
452static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
453 struct xdr_stream *xdr,
454 struct nfs42_copy_res *res)
455{
456 struct compound_hdr hdr;
457 int status;
458
459 status = decode_compound_hdr(xdr, &hdr);
460 if (status)
461 goto out;
462 status = decode_sequence(xdr, &res->seq_res, rqstp);
463 if (status)
464 goto out;
465 status = decode_putfh(xdr);
466 if (status)
467 goto out;
468 status = decode_savefh(xdr);
469 if (status)
470 goto out;
471 status = decode_putfh(xdr);
472 if (status)
473 goto out;
474 status = decode_copy(xdr, res);
475out:
476 return status;
477}
478
479/*
334 * Decode DEALLOCATE request 480 * Decode DEALLOCATE request
335 */ 481 */
336static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, 482static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4afdee420d25..768456fa1b17 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -438,8 +438,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
438 struct nfs41_server_scope **); 438 struct nfs41_server_scope **);
439extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 439extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
440extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 440extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
441extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, 441extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
442 fmode_t, const struct nfs_lockowner *); 442 const struct nfs_lockowner *, nfs4_stateid *,
443 struct rpc_cred **);
443 444
444extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 445extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
445extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 446extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -496,12 +497,15 @@ extern struct svc_version nfs4_callback_version4;
496 497
497static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) 498static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
498{ 499{
499 memcpy(dst, src, sizeof(*dst)); 500 memcpy(dst->data, src->data, sizeof(dst->data));
501 dst->type = src->type;
500} 502}
501 503
502static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) 504static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
503{ 505{
504 return memcmp(dst, src, sizeof(*dst)) == 0; 506 if (dst->type != src->type)
507 return false;
508 return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
505} 509}
506 510
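The ->data and ->type accesses above rely on the reworked nfs4_stateid from the include/linux/nfs4.h part of this series (not shown in this hunk): the stateid keeps its 16 opaque wire bytes and gains an in-memory type tag that never goes on the wire, which is why copy and match now handle data and type separately instead of a whole-struct memcpy()/memcmp(). A sketch of the assumed layout:

    /* assumed shape, per the include/linux/nfs4.h change in this series */
    enum nfs4_stateid_type {
            NFS4_INVALID_STATEID_TYPE = 0,
            NFS4_SPECIAL_STATEID_TYPE,
            NFS4_OPEN_STATEID_TYPE,
            NFS4_LOCK_STATEID_TYPE,
            NFS4_DELEGATION_STATEID_TYPE,
            NFS4_LAYOUT_STATEID_TYPE,
    };

    typedef struct {
            union {
                    char data[NFS4_STATEID_SIZE];   /* 16 bytes on the wire */
                    struct {
                            __be32 seqid;
                            char other[NFS4_STATEID_OTHER_SIZE];
                    } __attribute__((packed));
            };
            enum nfs4_stateid_type type;            /* in-memory only */
    } nfs4_stateid;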
507static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) 511static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index d0390516467c..014b0e41ace5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
129} 129}
130 130
131#ifdef CONFIG_NFS_V4_2 131#ifdef CONFIG_NFS_V4_2
132static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
133 struct file *file_out, loff_t pos_out,
134 size_t count, unsigned int flags)
135{
136 struct inode *in_inode = file_inode(file_in);
137 struct inode *out_inode = file_inode(file_out);
138 int ret;
139
140 if (in_inode == out_inode)
141 return -EINVAL;
142
143 /* flush any pending writes */
144 ret = nfs_sync_inode(in_inode);
145 if (ret)
146 return ret;
147 ret = nfs_sync_inode(out_inode);
148 if (ret)
149 return ret;
150
151 return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
152}
153
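For context, nfs4_copy_file_range() is what a plain copy_file_range(2) call lands in when both descriptors sit on the same NFSv4.2 mount. A minimal userspace sketch (illustrative; it assumes kernel headers that define __NR_copy_file_range, and goes through syscall() because older glibc lacks the wrapper):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            loff_t off_in = 0, off_out = 0;
            ssize_t copied;
            int in, out;

            if (argc < 3)
                    return 1;
            in = open(argv[1], O_RDONLY);
            out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
            if (in < 0 || out < 0)
                    return 1;

            /* ask for a 1 MiB server-side copy; the data never has to
             * round-trip through the client */
            copied = syscall(__NR_copy_file_range, in, &off_in,
                             out, &off_out, 1024 * 1024, 0);
            printf("copied %zd bytes\n", copied);
            return copied < 0;
    }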
132static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) 154static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
133{ 155{
134 loff_t ret; 156 loff_t ret;
@@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = {
243 .check_flags = nfs_check_flags, 265 .check_flags = nfs_check_flags,
244 .setlease = simple_nosetlease, 266 .setlease = simple_nosetlease,
245#ifdef CONFIG_NFS_V4_2 267#ifdef CONFIG_NFS_V4_2
268 .copy_file_range = nfs4_copy_file_range,
246 .llseek = nfs4_file_llseek, 269 .llseek = nfs4_file_llseek,
247 .fallocate = nfs42_fallocate, 270 .fallocate = nfs42_fallocate,
248 .clone_file_range = nfs42_clone_file_range, 271 .clone_file_range = nfs42_clone_file_range,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 084e8570da18..223982eb38c9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -74,6 +74,17 @@
74#define NFS4_POLL_RETRY_MIN (HZ/10) 74#define NFS4_POLL_RETRY_MIN (HZ/10)
75#define NFS4_POLL_RETRY_MAX (15*HZ) 75#define NFS4_POLL_RETRY_MAX (15*HZ)
76 76
77/* file attributes which can be mapped to nfs attributes */
78#define NFS4_VALID_ATTRS (ATTR_MODE \
79 | ATTR_UID \
80 | ATTR_GID \
81 | ATTR_SIZE \
82 | ATTR_ATIME \
83 | ATTR_MTIME \
84 | ATTR_CTIME \
85 | ATTR_ATIME_SET \
86 | ATTR_MTIME_SET)
87
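This mask exists so that the exclusive-create path further down only issues its follow-up SETATTR when the iattr carries something NFSv4 can actually express; VFS-internal bits such as ATTR_FORCE, ATTR_OPEN or ATTR_KILL_SUID deliberately fall outside it. Illustratively:

    /* illustrative ia_valid combinations */
    sattr->ia_valid = ATTR_FORCE | ATTR_KILL_SUID;  /* mask test false: no extra SETATTR */
    sattr->ia_valid = ATTR_MODE | ATTR_FORCE;       /* mask test true:  SETATTR is sent  */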
77struct nfs4_opendata; 88struct nfs4_opendata;
78static int _nfs4_proc_open(struct nfs4_opendata *data); 89static int _nfs4_proc_open(struct nfs4_opendata *data);
79static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 90static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -416,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
416 case -NFS4ERR_DELAY: 427 case -NFS4ERR_DELAY:
417 nfs_inc_server_stats(server, NFSIOS_DELAY); 428 nfs_inc_server_stats(server, NFSIOS_DELAY);
418 case -NFS4ERR_GRACE: 429 case -NFS4ERR_GRACE:
430 case -NFS4ERR_RECALLCONFLICT:
419 exception->delay = 1; 431 exception->delay = 1;
420 return 0; 432 return 0;
421 433
@@ -2558,15 +2570,20 @@ static int _nfs4_do_open(struct inode *dir,
2558 if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && 2570 if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
2559 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { 2571 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
2560 nfs4_exclusive_attrset(opendata, sattr, &label); 2572 nfs4_exclusive_attrset(opendata, sattr, &label);
2561 2573 /*
2562 nfs_fattr_init(opendata->o_res.f_attr); 2574 * send create attributes which were not set by open
2563 status = nfs4_do_setattr(state->inode, cred, 2575 * with an extra setattr.
2564 opendata->o_res.f_attr, sattr, 2576 */
2565 state, label, olabel); 2577 if (sattr->ia_valid & NFS4_VALID_ATTRS) {
2566 if (status == 0) { 2578 nfs_fattr_init(opendata->o_res.f_attr);
2567 nfs_setattr_update_inode(state->inode, sattr, 2579 status = nfs4_do_setattr(state->inode, cred,
2568 opendata->o_res.f_attr); 2580 opendata->o_res.f_attr, sattr,
2569 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); 2581 state, label, olabel);
2582 if (status == 0) {
2583 nfs_setattr_update_inode(state->inode, sattr,
2584 opendata->o_res.f_attr);
2585 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
2586 }
2570 } 2587 }
2571 } 2588 }
2572 if (opened && opendata->file_created) 2589 if (opened && opendata->file_created)
@@ -2676,6 +2693,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2676 .rpc_resp = &res, 2693 .rpc_resp = &res,
2677 .rpc_cred = cred, 2694 .rpc_cred = cred,
2678 }; 2695 };
2696 struct rpc_cred *delegation_cred = NULL;
2679 unsigned long timestamp = jiffies; 2697 unsigned long timestamp = jiffies;
2680 fmode_t fmode; 2698 fmode_t fmode;
2681 bool truncate; 2699 bool truncate;
@@ -2691,7 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2691 truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; 2709 truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
2692 fmode = truncate ? FMODE_WRITE : FMODE_READ; 2710 fmode = truncate ? FMODE_WRITE : FMODE_READ;
2693 2711
2694 if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { 2712 if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
2695 /* Use that stateid */ 2713 /* Use that stateid */
2696 } else if (truncate && state != NULL) { 2714 } else if (truncate && state != NULL) {
2697 struct nfs_lockowner lockowner = { 2715 struct nfs_lockowner lockowner = {
@@ -2700,13 +2718,17 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2700 }; 2718 };
2701 if (!nfs4_valid_open_stateid(state)) 2719 if (!nfs4_valid_open_stateid(state))
2702 return -EBADF; 2720 return -EBADF;
2703 if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, 2721 if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
2704 &lockowner) == -EIO) 2722 &arg.stateid, &delegation_cred) == -EIO)
2705 return -EBADF; 2723 return -EBADF;
2706 } else 2724 } else
2707 nfs4_stateid_copy(&arg.stateid, &zero_stateid); 2725 nfs4_stateid_copy(&arg.stateid, &zero_stateid);
2726 if (delegation_cred)
2727 msg.rpc_cred = delegation_cred;
2708 2728
2709 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); 2729 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
2730
2731 put_rpccred(delegation_cred);
2710 if (status == 0 && state != NULL) 2732 if (status == 0 && state != NULL)
2711 renew_lease(server, timestamp); 2733 renew_lease(server, timestamp);
2712 trace_nfs4_setattr(inode, &arg.stateid, status); 2734 trace_nfs4_setattr(inode, &arg.stateid, status);
@@ -4285,7 +4307,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid,
4285 4307
4286 if (l_ctx != NULL) 4308 if (l_ctx != NULL)
4287 lockowner = &l_ctx->lockowner; 4309 lockowner = &l_ctx->lockowner;
4288 return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); 4310 return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL);
4289} 4311}
4290EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); 4312EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
4291 4313
@@ -6054,6 +6076,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
6054static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 6076static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
6055{ 6077{
6056 struct nfs_inode *nfsi = NFS_I(state->inode); 6078 struct nfs_inode *nfsi = NFS_I(state->inode);
6079 struct nfs4_state_owner *sp = state->owner;
6057 unsigned char fl_flags = request->fl_flags; 6080 unsigned char fl_flags = request->fl_flags;
6058 int status = -ENOLCK; 6081 int status = -ENOLCK;
6059 6082
@@ -6068,6 +6091,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
6068 status = do_vfs_lock(state->inode, request); 6091 status = do_vfs_lock(state->inode, request);
6069 if (status < 0) 6092 if (status < 0)
6070 goto out; 6093 goto out;
6094 mutex_lock(&sp->so_delegreturn_mutex);
6071 down_read(&nfsi->rwsem); 6095 down_read(&nfsi->rwsem);
6072 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 6096 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
6073 /* Yes: cache locks! */ 6097 /* Yes: cache locks! */
@@ -6075,9 +6099,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
6075 request->fl_flags = fl_flags & ~FL_SLEEP; 6099 request->fl_flags = fl_flags & ~FL_SLEEP;
6076 status = do_vfs_lock(state->inode, request); 6100 status = do_vfs_lock(state->inode, request);
6077 up_read(&nfsi->rwsem); 6101 up_read(&nfsi->rwsem);
6102 mutex_unlock(&sp->so_delegreturn_mutex);
6078 goto out; 6103 goto out;
6079 } 6104 }
6080 up_read(&nfsi->rwsem); 6105 up_read(&nfsi->rwsem);
6106 mutex_unlock(&sp->so_delegreturn_mutex);
6081 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); 6107 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
6082out: 6108out:
6083 request->fl_flags = fl_flags; 6109 request->fl_flags = fl_flags;
@@ -7351,9 +7377,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
7351 * always set csa_cachethis to FALSE because the current implementation 7377 * always set csa_cachethis to FALSE because the current implementation
7352 * of the back channel DRC only supports caching the CB_SEQUENCE operation. 7378 * of the back channel DRC only supports caching the CB_SEQUENCE operation.
7353 */ 7379 */
7354static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) 7380static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
7381 struct rpc_clnt *clnt)
7355{ 7382{
7356 unsigned int max_rqst_sz, max_resp_sz; 7383 unsigned int max_rqst_sz, max_resp_sz;
7384 unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
7357 7385
7358 max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; 7386 max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
7359 max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; 7387 max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -7371,8 +7399,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
7371 args->fc_attrs.max_ops, args->fc_attrs.max_reqs); 7399 args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
7372 7400
7373 /* Back channel attributes */ 7401 /* Back channel attributes */
7374 args->bc_attrs.max_rqst_sz = PAGE_SIZE; 7402 args->bc_attrs.max_rqst_sz = max_bc_payload;
7375 args->bc_attrs.max_resp_sz = PAGE_SIZE; 7403 args->bc_attrs.max_resp_sz = max_bc_payload;
7376 args->bc_attrs.max_resp_sz_cached = 0; 7404 args->bc_attrs.max_resp_sz_cached = 0;
7377 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; 7405 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
7378 args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; 7406 args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
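rpc_max_bc_payload() is introduced on the sunrpc side of this series (not shown in this file); the point of the change above is to stop hard-coding PAGE_SIZE for the backchannel and instead ask the transport what it can carry, which matters once NFSv4.1 callbacks run over RPC-over-RDMA. A hedged sketch of the assumed helper (the real body lives in net/sunrpc/clnt.c and may differ):

    /* sketch only: query the transport's backchannel payload limit */
    size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
    {
            struct rpc_xprt *xprt;
            size_t ret;

            rcu_read_lock();
            xprt = rcu_dereference(clnt->cl_xprt);
            ret = xprt->ops->bc_maxpayload(xprt);   /* per-transport callback */
            rcu_read_unlock();
            return ret;
    }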
@@ -7476,7 +7504,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
7476 }; 7504 };
7477 int status; 7505 int status;
7478 7506
7479 nfs4_init_channel_attrs(&args); 7507 nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
7480 args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); 7508 args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
7481 7509
7482 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 7510 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -7820,40 +7848,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
7820 struct nfs4_layoutget *lgp = calldata; 7848 struct nfs4_layoutget *lgp = calldata;
7821 struct nfs_server *server = NFS_SERVER(lgp->args.inode); 7849 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
7822 struct nfs4_session *session = nfs4_get_session(server); 7850 struct nfs4_session *session = nfs4_get_session(server);
7823 int ret;
7824 7851
7825 dprintk("--> %s\n", __func__); 7852 dprintk("--> %s\n", __func__);
7826 /* Note the is a race here, where a CB_LAYOUTRECALL can come in 7853 nfs41_setup_sequence(session, &lgp->args.seq_args,
7827 * right now covering the LAYOUTGET we are about to send. 7854 &lgp->res.seq_res, task);
7828 * However, that is not so catastrophic, and there seems 7855 dprintk("<-- %s\n", __func__);
7829 * to be no way to prevent it completely.
7830 */
7831 if (nfs41_setup_sequence(session, &lgp->args.seq_args,
7832 &lgp->res.seq_res, task))
7833 return;
7834 ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
7835 NFS_I(lgp->args.inode)->layout,
7836 &lgp->args.range,
7837 lgp->args.ctx->state);
7838 if (ret < 0)
7839 rpc_exit(task, ret);
7840} 7856}
7841 7857
7842static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) 7858static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7843{ 7859{
7844 struct nfs4_layoutget *lgp = calldata; 7860 struct nfs4_layoutget *lgp = calldata;
7861
7862 dprintk("--> %s\n", __func__);
7863 nfs41_sequence_done(task, &lgp->res.seq_res);
7864 dprintk("<-- %s\n", __func__);
7865}
7866
7867static int
7868nfs4_layoutget_handle_exception(struct rpc_task *task,
7869 struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
7870{
7845 struct inode *inode = lgp->args.inode; 7871 struct inode *inode = lgp->args.inode;
7846 struct nfs_server *server = NFS_SERVER(inode); 7872 struct nfs_server *server = NFS_SERVER(inode);
7847 struct pnfs_layout_hdr *lo; 7873 struct pnfs_layout_hdr *lo;
7848 struct nfs4_state *state = NULL; 7874 int status = task->tk_status;
7849 unsigned long timeo, now, giveup;
7850 7875
7851 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); 7876 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
7852 7877
7853 if (!nfs41_sequence_done(task, &lgp->res.seq_res)) 7878 switch (status) {
7854 goto out;
7855
7856 switch (task->tk_status) {
7857 case 0: 7879 case 0:
7858 goto out; 7880 goto out;
7859 7881
@@ -7863,57 +7885,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7863 * retry go inband. 7885 * retry go inband.
7864 */ 7886 */
7865 case -NFS4ERR_LAYOUTUNAVAILABLE: 7887 case -NFS4ERR_LAYOUTUNAVAILABLE:
7866 task->tk_status = -ENODATA; 7888 status = -ENODATA;
7867 goto out; 7889 goto out;
7868 /* 7890 /*
7869 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of 7891 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
7870 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). 7892 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
7871 */ 7893 */
7872 case -NFS4ERR_BADLAYOUT: 7894 case -NFS4ERR_BADLAYOUT:
7873 goto out_overflow; 7895 status = -EOVERFLOW;
7896 goto out;
7874 /* 7897 /*
7875 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client 7898 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
7876 * (or clients) writing to the same RAID stripe except when 7899 * (or clients) writing to the same RAID stripe except when
7877 * the minlength argument is 0 (see RFC5661 section 18.43.3). 7900 * the minlength argument is 0 (see RFC5661 section 18.43.3).
7901 *
7902 * Treat it like we would RECALLCONFLICT -- we retry for a little
7903 * while, and then eventually give up.
7878 */ 7904 */
7879 case -NFS4ERR_LAYOUTTRYLATER: 7905 case -NFS4ERR_LAYOUTTRYLATER:
7880 if (lgp->args.minlength == 0) 7906 if (lgp->args.minlength == 0) {
7881 goto out_overflow; 7907 status = -EOVERFLOW;
7882 /* 7908 goto out;
7883 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
7884 * existing layout before getting a new one).
7885 */
7886 case -NFS4ERR_RECALLCONFLICT:
7887 timeo = rpc_get_timeout(task->tk_client);
7888 giveup = lgp->args.timestamp + timeo;
7889 now = jiffies;
7890 if (time_after(giveup, now)) {
7891 unsigned long delay;
7892
7893 /* Delay for:
7894 * - Not less then NFS4_POLL_RETRY_MIN.
7895 * - One last time a jiffie before we give up
7896 * - exponential backoff (time_now minus start_attempt)
7897 */
7898 delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
7899 min((giveup - now - 1),
7900 now - lgp->args.timestamp));
7901
7902 dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
7903 __func__, delay);
7904 rpc_delay(task, delay);
7905 /* Do not call nfs4_async_handle_error() */
7906 goto out_restart;
7907 } 7909 }
7908 break; 7910 /* Fallthrough */
7911 case -NFS4ERR_RECALLCONFLICT:
7912 nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
7913 exception);
7914 status = -ERECALLCONFLICT;
7915 goto out;
7909 case -NFS4ERR_EXPIRED: 7916 case -NFS4ERR_EXPIRED:
7910 case -NFS4ERR_BAD_STATEID: 7917 case -NFS4ERR_BAD_STATEID:
7918 exception->timeout = 0;
7911 spin_lock(&inode->i_lock); 7919 spin_lock(&inode->i_lock);
7912 if (nfs4_stateid_match(&lgp->args.stateid, 7920 if (nfs4_stateid_match(&lgp->args.stateid,
7913 &lgp->args.ctx->state->stateid)) { 7921 &lgp->args.ctx->state->stateid)) {
7914 spin_unlock(&inode->i_lock); 7922 spin_unlock(&inode->i_lock);
7915 /* If the open stateid was bad, then recover it. */ 7923 /* If the open stateid was bad, then recover it. */
7916 state = lgp->args.ctx->state; 7924 exception->state = lgp->args.ctx->state;
7917 break; 7925 break;
7918 } 7926 }
7919 lo = NFS_I(inode)->layout; 7927 lo = NFS_I(inode)->layout;
@@ -7926,25 +7934,21 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7926 * with the current stateid. 7934 * with the current stateid.
7927 */ 7935 */
7928 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 7936 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
7929 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); 7937 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
7930 spin_unlock(&inode->i_lock); 7938 spin_unlock(&inode->i_lock);
7931 pnfs_free_lseg_list(&head); 7939 pnfs_free_lseg_list(&head);
7932 } else 7940 } else
7933 spin_unlock(&inode->i_lock); 7941 spin_unlock(&inode->i_lock);
7934 goto out_restart; 7942 status = -EAGAIN;
7943 goto out;
7935 } 7944 }
7936 if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) 7945
7937 goto out_restart; 7946 status = nfs4_handle_exception(server, status, exception);
7947 if (exception->retry)
7948 status = -EAGAIN;
7938out: 7949out:
7939 dprintk("<-- %s\n", __func__); 7950 dprintk("<-- %s\n", __func__);
7940 return; 7951 return status;
7941out_restart:
7942 task->tk_status = 0;
7943 rpc_restart_call_prepare(task);
7944 return;
7945out_overflow:
7946 task->tk_status = -EOVERFLOW;
7947 goto out;
7948} 7952}
7949 7953
7950static size_t max_response_pages(struct nfs_server *server) 7954static size_t max_response_pages(struct nfs_server *server)
@@ -8013,7 +8017,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
8013}; 8017};
8014 8018
8015struct pnfs_layout_segment * 8019struct pnfs_layout_segment *
8016nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) 8020nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
8017{ 8021{
8018 struct inode *inode = lgp->args.inode; 8022 struct inode *inode = lgp->args.inode;
8019 struct nfs_server *server = NFS_SERVER(inode); 8023 struct nfs_server *server = NFS_SERVER(inode);
@@ -8033,6 +8037,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
8033 .flags = RPC_TASK_ASYNC, 8037 .flags = RPC_TASK_ASYNC,
8034 }; 8038 };
8035 struct pnfs_layout_segment *lseg = NULL; 8039 struct pnfs_layout_segment *lseg = NULL;
8040 struct nfs4_exception exception = { .timeout = *timeout };
8036 int status = 0; 8041 int status = 0;
8037 8042
8038 dprintk("--> %s\n", __func__); 8043 dprintk("--> %s\n", __func__);
@@ -8046,7 +8051,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
8046 return ERR_PTR(-ENOMEM); 8051 return ERR_PTR(-ENOMEM);
8047 } 8052 }
8048 lgp->args.layout.pglen = max_pages * PAGE_SIZE; 8053 lgp->args.layout.pglen = max_pages * PAGE_SIZE;
8049 lgp->args.timestamp = jiffies;
8050 8054
8051 lgp->res.layoutp = &lgp->args.layout; 8055 lgp->res.layoutp = &lgp->args.layout;
8052 lgp->res.seq_res.sr_slot = NULL; 8056 lgp->res.seq_res.sr_slot = NULL;
@@ -8056,13 +8060,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
8056 if (IS_ERR(task)) 8060 if (IS_ERR(task))
8057 return ERR_CAST(task); 8061 return ERR_CAST(task);
8058 status = nfs4_wait_for_completion_rpc_task(task); 8062 status = nfs4_wait_for_completion_rpc_task(task);
8059 if (status == 0) 8063 if (status == 0) {
8060 status = task->tk_status; 8064 status = nfs4_layoutget_handle_exception(task, lgp, &exception);
8065 *timeout = exception.timeout;
8066 }
8067
8061 trace_nfs4_layoutget(lgp->args.ctx, 8068 trace_nfs4_layoutget(lgp->args.ctx,
8062 &lgp->args.range, 8069 &lgp->args.range,
8063 &lgp->res.range, 8070 &lgp->res.range,
8064 &lgp->res.stateid, 8071 &lgp->res.stateid,
8065 status); 8072 status);
8073
8066 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ 8074 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
8067 if (status == 0 && lgp->res.layoutp->len) 8075 if (status == 0 && lgp->res.layoutp->len)
8068 lseg = pnfs_layout_process(lgp); 8076 lseg = pnfs_layout_process(lgp);
@@ -8118,7 +8126,8 @@ static void nfs4_layoutreturn_release(void *calldata)
8118 8126
8119 dprintk("--> %s\n", __func__); 8127 dprintk("--> %s\n", __func__);
8120 spin_lock(&lo->plh_inode->i_lock); 8128 spin_lock(&lo->plh_inode->i_lock);
8121 pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); 8129 pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
8130 be32_to_cpu(lrp->args.stateid.seqid));
8122 pnfs_mark_layout_returned_if_empty(lo); 8131 pnfs_mark_layout_returned_if_empty(lo);
8123 if (lrp->res.lrs_present) 8132 if (lrp->res.lrs_present)
8124 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); 8133 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
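A worked example of why the stateid's seqid is passed through here (numbers are illustrative): suppose the layout stateid was at seqid 7 when this LAYOUTRETURN was sent, and a racing LAYOUTGET reply bumps it to 8 before the release callback runs. With seq == 7, pnfs_mark_matching_lsegs_invalid() only tears down segments handed out at or before 7, so the freshly granted seqid-8 segment survives instead of being thrown away by the stale return:

    /* the seq filter inside pnfs_mark_matching_lsegs_invalid() */
    if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
            continue;       /* pls_seq == 8, seq == 7: keep this lseg */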
@@ -8653,6 +8662,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
8653static bool nfs41_match_stateid(const nfs4_stateid *s1, 8662static bool nfs41_match_stateid(const nfs4_stateid *s1,
8654 const nfs4_stateid *s2) 8663 const nfs4_stateid *s2)
8655{ 8664{
8665 if (s1->type != s2->type)
8666 return false;
8667
8656 if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) 8668 if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
8657 return false; 8669 return false;
8658 8670
@@ -8793,6 +8805,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
8793 | NFS_CAP_STATEID_NFSV41 8805 | NFS_CAP_STATEID_NFSV41
8794 | NFS_CAP_ATOMIC_OPEN_V1 8806 | NFS_CAP_ATOMIC_OPEN_V1
8795 | NFS_CAP_ALLOCATE 8807 | NFS_CAP_ALLOCATE
8808 | NFS_CAP_COPY
8796 | NFS_CAP_DEALLOCATE 8809 | NFS_CAP_DEALLOCATE
8797 | NFS_CAP_SEEK 8810 | NFS_CAP_SEEK
8798 | NFS_CAP_LAYOUTSTATS 8811 | NFS_CAP_LAYOUTSTATS
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d854693a15b0..5075592df145 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -65,7 +65,10 @@
65 65
66#define OPENOWNER_POOL_SIZE 8 66#define OPENOWNER_POOL_SIZE 8
67 67
68const nfs4_stateid zero_stateid; 68const nfs4_stateid zero_stateid = {
69 .data = { 0 },
70 .type = NFS4_SPECIAL_STATEID_TYPE,
71};
69static DEFINE_MUTEX(nfs_clid_init_mutex); 72static DEFINE_MUTEX(nfs_clid_init_mutex);
70 73
71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 74int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
985 * Byte-range lock aware utility to initialize the stateid of read/write 988 * Byte-range lock aware utility to initialize the stateid of read/write
986 * requests. 989 * requests.
987 */ 990 */
988int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, 991int nfs4_select_rw_stateid(struct nfs4_state *state,
989 fmode_t fmode, const struct nfs_lockowner *lockowner) 992 fmode_t fmode, const struct nfs_lockowner *lockowner,
993 nfs4_stateid *dst, struct rpc_cred **cred)
990{ 994{
991 int ret = nfs4_copy_lock_stateid(dst, state, lockowner); 995 int ret;
996
997 if (cred != NULL)
998 *cred = NULL;
999 ret = nfs4_copy_lock_stateid(dst, state, lockowner);
992 if (ret == -EIO) 1000 if (ret == -EIO)
993 /* A lost lock - don't even consider delegations */ 1001 /* A lost lock - don't even consider delegations */
994 goto out; 1002 goto out;
995 /* returns true if delegation stateid found and copied */ 1003 /* returns true if delegation stateid found and copied */
996 if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { 1004 if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) {
997 ret = 0; 1005 ret = 0;
998 goto out; 1006 goto out;
999 } 1007 }
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2c8d05dae5b1..9c150b153782 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
1520 { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ 1520 { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
1521 { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ 1521 { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
1522 { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ 1522 { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
1523 { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \
1524 { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \
1523 { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) 1525 { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
1524 1526
1525TRACE_EVENT(pnfs_update_layout, 1527TRACE_EVENT(pnfs_update_layout,
@@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout,
1528 u64 count, 1530 u64 count,
1529 enum pnfs_iomode iomode, 1531 enum pnfs_iomode iomode,
1530 struct pnfs_layout_hdr *lo, 1532 struct pnfs_layout_hdr *lo,
1533 struct pnfs_layout_segment *lseg,
1531 enum pnfs_update_layout_reason reason 1534 enum pnfs_update_layout_reason reason
1532 ), 1535 ),
1533 TP_ARGS(inode, pos, count, iomode, lo, reason), 1536 TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
1534 TP_STRUCT__entry( 1537 TP_STRUCT__entry(
1535 __field(dev_t, dev) 1538 __field(dev_t, dev)
1536 __field(u64, fileid) 1539 __field(u64, fileid)
@@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout,
1540 __field(enum pnfs_iomode, iomode) 1543 __field(enum pnfs_iomode, iomode)
1541 __field(int, layoutstateid_seq) 1544 __field(int, layoutstateid_seq)
1542 __field(u32, layoutstateid_hash) 1545 __field(u32, layoutstateid_hash)
1546 __field(long, lseg)
1543 __field(enum pnfs_update_layout_reason, reason) 1547 __field(enum pnfs_update_layout_reason, reason)
1544 ), 1548 ),
1545 TP_fast_assign( 1549 TP_fast_assign(
@@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout,
1559 __entry->layoutstateid_seq = 0; 1563 __entry->layoutstateid_seq = 0;
1560 __entry->layoutstateid_hash = 0; 1564 __entry->layoutstateid_hash = 0;
1561 } 1565 }
1566 __entry->lseg = (long)lseg;
1562 ), 1567 ),
1563 TP_printk( 1568 TP_printk(
1564 "fileid=%02x:%02x:%llu fhandle=0x%08x " 1569 "fileid=%02x:%02x:%llu fhandle=0x%08x "
1565 "iomode=%s pos=%llu count=%llu " 1570 "iomode=%s pos=%llu count=%llu "
1566 "layoutstateid=%d:0x%08x (%s)", 1571 "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
1567 MAJOR(__entry->dev), MINOR(__entry->dev), 1572 MAJOR(__entry->dev), MINOR(__entry->dev),
1568 (unsigned long long)__entry->fileid, 1573 (unsigned long long)__entry->fileid,
1569 __entry->fhandle, 1574 __entry->fhandle,
@@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout,
1571 (unsigned long long)__entry->pos, 1576 (unsigned long long)__entry->pos,
1572 (unsigned long long)__entry->count, 1577 (unsigned long long)__entry->count,
1573 __entry->layoutstateid_seq, __entry->layoutstateid_hash, 1578 __entry->layoutstateid_seq, __entry->layoutstateid_hash,
1579 __entry->lseg,
1574 show_pnfs_update_layout_reason(__entry->reason) 1580 show_pnfs_update_layout_reason(__entry->reason)
1575 ) 1581 )
1576); 1582);
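With the extra lseg argument, a pnfs_update_layout trace record also identifies which layout segment (if any) satisfied the lookup. An illustrative line, with made-up values:

    fileid=00:2a:131 fhandle=0x8d2e4ab3 iomode=RW pos=0 count=4096 layoutstateid=1:0x9a3c11f0 lseg=0xffff88003a2d5c00 (found cached)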
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 88474a4fc669..661e753fe1c9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
4270 return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); 4270 return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
4271} 4271}
4272 4272
4273static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
4274{
4275 stateid->type = NFS4_OPEN_STATEID_TYPE;
4276 return decode_stateid(xdr, stateid);
4277}
4278
4279static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
4280{
4281 stateid->type = NFS4_LOCK_STATEID_TYPE;
4282 return decode_stateid(xdr, stateid);
4283}
4284
4285static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
4286{
4287 stateid->type = NFS4_DELEGATION_STATEID_TYPE;
4288 return decode_stateid(xdr, stateid);
4289}
4290
4273static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) 4291static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
4274{ 4292{
4275 int status; 4293 int status;
@@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
4278 if (status != -EIO) 4296 if (status != -EIO)
4279 nfs_increment_open_seqid(status, res->seqid); 4297 nfs_increment_open_seqid(status, res->seqid);
4280 if (!status) 4298 if (!status)
4281 status = decode_stateid(xdr, &res->stateid); 4299 status = decode_open_stateid(xdr, &res->stateid);
4282 return status; 4300 return status;
4283} 4301}
4284 4302
@@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
4937 if (status == -EIO) 4955 if (status == -EIO)
4938 goto out; 4956 goto out;
4939 if (status == 0) { 4957 if (status == 0) {
4940 status = decode_stateid(xdr, &res->stateid); 4958 status = decode_lock_stateid(xdr, &res->stateid);
4941 if (unlikely(status)) 4959 if (unlikely(status))
4942 goto out; 4960 goto out;
4943 } else if (status == -NFS4ERR_DENIED) 4961 } else if (status == -NFS4ERR_DENIED)
@@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
4966 if (status != -EIO) 4984 if (status != -EIO)
4967 nfs_increment_lock_seqid(status, res->seqid); 4985 nfs_increment_lock_seqid(status, res->seqid);
4968 if (status == 0) 4986 if (status == 0)
4969 status = decode_stateid(xdr, &res->stateid); 4987 status = decode_lock_stateid(xdr, &res->stateid);
4970 return status; 4988 return status;
4971} 4989}
4972 4990
@@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
5016 __be32 *p; 5034 __be32 *p;
5017 int status; 5035 int status;
5018 5036
5019 status = decode_stateid(xdr, &res->delegation); 5037 status = decode_delegation_stateid(xdr, &res->delegation);
5020 if (unlikely(status)) 5038 if (unlikely(status))
5021 return status; 5039 return status;
5022 p = xdr_inline_decode(xdr, 4); 5040 p = xdr_inline_decode(xdr, 4);
@@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
5096 nfs_increment_open_seqid(status, res->seqid); 5114 nfs_increment_open_seqid(status, res->seqid);
5097 if (status) 5115 if (status)
5098 return status; 5116 return status;
5099 status = decode_stateid(xdr, &res->stateid); 5117 status = decode_open_stateid(xdr, &res->stateid);
5100 if (unlikely(status)) 5118 if (unlikely(status))
5101 return status; 5119 return status;
5102 5120
@@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
5136 if (status != -EIO) 5154 if (status != -EIO)
5137 nfs_increment_open_seqid(status, res->seqid); 5155 nfs_increment_open_seqid(status, res->seqid);
5138 if (!status) 5156 if (!status)
5139 status = decode_stateid(xdr, &res->stateid); 5157 status = decode_open_stateid(xdr, &res->stateid);
5140 return status; 5158 return status;
5141} 5159}
5142 5160
@@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
5148 if (status != -EIO) 5166 if (status != -EIO)
5149 nfs_increment_open_seqid(status, res->seqid); 5167 nfs_increment_open_seqid(status, res->seqid);
5150 if (!status) 5168 if (!status)
5151 status = decode_stateid(xdr, &res->stateid); 5169 status = decode_open_stateid(xdr, &res->stateid);
5152 return status; 5170 return status;
5153} 5171}
5154 5172
@@ -5838,6 +5856,12 @@ out_overflow:
5838} 5856}
5839 5857
5840#if defined(CONFIG_NFS_V4_1) 5858#if defined(CONFIG_NFS_V4_1)
5859static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
5860{
5861 stateid->type = NFS4_LAYOUT_STATEID_TYPE;
5862 return decode_stateid(xdr, stateid);
5863}
5864
5841static int decode_getdeviceinfo(struct xdr_stream *xdr, 5865static int decode_getdeviceinfo(struct xdr_stream *xdr,
5842 struct nfs4_getdeviceinfo_res *res) 5866 struct nfs4_getdeviceinfo_res *res)
5843{ 5867{
@@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
5919 if (unlikely(!p)) 5943 if (unlikely(!p))
5920 goto out_overflow; 5944 goto out_overflow;
5921 res->return_on_close = be32_to_cpup(p); 5945 res->return_on_close = be32_to_cpup(p);
5922 decode_stateid(xdr, &res->stateid); 5946 decode_layout_stateid(xdr, &res->stateid);
5923 p = xdr_inline_decode(xdr, 4); 5947 p = xdr_inline_decode(xdr, 4);
5924 if (unlikely(!p)) 5948 if (unlikely(!p))
5925 goto out_overflow; 5949 goto out_overflow;
@@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
5985 goto out_overflow; 6009 goto out_overflow;
5986 res->lrs_present = be32_to_cpup(p); 6010 res->lrs_present = be32_to_cpup(p);
5987 if (res->lrs_present) 6011 if (res->lrs_present)
5988 status = decode_stateid(xdr, &res->stateid); 6012 status = decode_layout_stateid(xdr, &res->stateid);
5989 return status; 6013 return status;
5990out_overflow: 6014out_overflow:
5991 print_overflow_msg(__func__, xdr); 6015 print_overflow_msg(__func__, xdr);
@@ -7515,6 +7539,7 @@ struct rpc_procinfo nfs4_procedures[] = {
7515 PROC(DEALLOCATE, enc_deallocate, dec_deallocate), 7539 PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
7516 PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), 7540 PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
7517 PROC(CLONE, enc_clone, dec_clone), 7541 PROC(CLONE, enc_clone, dec_clone),
7542 PROC(COPY, enc_copy, dec_copy),
7518#endif /* CONFIG_NFS_V4_2 */ 7543#endif /* CONFIG_NFS_V4_2 */
7519}; 7544};
7520 7545
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 1f6db4231057..174dd4cf5747 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
341 * long write-back delay. This will be adjusted in 341 * long write-back delay. This will be adjusted in
342 * update_nfs_request below if the region is not locked. */ 342 * update_nfs_request below if the region is not locked. */
343 req->wb_page = page; 343 req->wb_page = page;
344 req->wb_index = page_file_index(page); 344 if (page) {
345 get_page(page); 345 req->wb_index = page_file_index(page);
346 get_page(page);
347 }
346 req->wb_offset = offset; 348 req->wb_offset = offset;
347 req->wb_pgbase = offset; 349 req->wb_pgbase = offset;
348 req->wb_bytes = count; 350 req->wb_bytes = count;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 89a5ef4df08a..0c7e0d45a4de 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
270 }; 270 };
271 271
272 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 272 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
273 return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); 273 return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
274} 274}
275 275
276static int 276static int
@@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
308 308
309 spin_lock(&inode->i_lock); 309 spin_lock(&inode->i_lock);
310 pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 310 pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
311 pnfs_mark_matching_lsegs_invalid(lo, &head, &range); 311 pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
312 spin_unlock(&inode->i_lock); 312 spin_unlock(&inode->i_lock);
313 pnfs_free_lseg_list(&head); 313 pnfs_free_lseg_list(&head);
314 dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, 314 dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
@@ -522,13 +522,35 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
522 return rv; 522 return rv;
523} 523}
524 524
525/* Returns count of number of matching invalid lsegs remaining in list 525/*
526 * after call. 526 * Compare 2 layout stateid sequence ids, to see which is newer,
527 * taking into account wraparound issues.
528 */
529static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
530{
531 return (s32)(s1 - s2) > 0;
532}
533
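A quick worked example of the signed-difference trick above (values are illustrative): the subtraction wraps modulo 2^32 and is then read as signed, so sequence ids compare correctly across wraparound as long as they are less than 2^31 apart.

    pnfs_seqid_is_newer(5, 3);           /* (s32)2  > 0 -> true  */
    pnfs_seqid_is_newer(3, 5);           /* (s32)-2 > 0 -> false */
    pnfs_seqid_is_newer(2, 0xfffffffe);  /* wrapped: (s32)4 > 0 -> true */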
534/**
535 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
536 * @lo: layout header containing the lsegs
537 * @tmp_list: list head where doomed lsegs should go
538 * @recall_range: optional recall range argument to match (may be NULL)
539 * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
540 *
541 * Walk the list of lsegs in the layout header, and tear down any that should
542 * be destroyed. If "recall_range" is specified then the segment must match
543 * that range. If "seq" is non-zero, then only match segments that were handed
544 * out at or before that sequence.
545 *
546 * Returns number of matching invalid lsegs remaining in list after scanning
547 * it and purging them.
527 */ 548 */
528int 549int
529pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 550pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
530 struct list_head *tmp_list, 551 struct list_head *tmp_list,
531 const struct pnfs_layout_range *recall_range) 552 const struct pnfs_layout_range *recall_range,
553 u32 seq)
532{ 554{
533 struct pnfs_layout_segment *lseg, *next; 555 struct pnfs_layout_segment *lseg, *next;
534 int remaining = 0; 556 int remaining = 0;
@@ -540,10 +562,12 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
540 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 562 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
541 if (!recall_range || 563 if (!recall_range ||
542 should_free_lseg(&lseg->pls_range, recall_range)) { 564 should_free_lseg(&lseg->pls_range, recall_range)) {
543 dprintk("%s: freeing lseg %p iomode %d " 565 if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
566 continue;
 567 dprintk("%s: freeing lseg %p iomode %d seq %u "
544 "offset %llu length %llu\n", __func__, 568 "offset %llu length %llu\n", __func__,
545 lseg, lseg->pls_range.iomode, lseg->pls_range.offset, 569 lseg, lseg->pls_range.iomode, lseg->pls_seq,
546 lseg->pls_range.length); 570 lseg->pls_range.offset, lseg->pls_range.length);
547 if (!mark_lseg_invalid(lseg, tmp_list)) 571 if (!mark_lseg_invalid(lseg, tmp_list))
548 remaining++; 572 remaining++;
549 } 573 }
@@ -730,15 +754,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
730 pnfs_destroy_layouts_byclid(clp, false); 754 pnfs_destroy_layouts_byclid(clp, false);
731} 755}
732 756
733/*
734 * Compare 2 layout stateid sequence ids, to see which is newer,
735 * taking into account wraparound issues.
736 */
737static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
738{
739 return (s32)(s1 - s2) > 0;
740}
741
742/* update lo->plh_stateid with new if is more recent */ 757/* update lo->plh_stateid with new if is more recent */
743void 758void
744pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 759pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -781,50 +796,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
781 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 796 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
782} 797}
783 798
784int
785pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
786 const struct pnfs_layout_range *range,
787 struct nfs4_state *open_state)
788{
789 int status = 0;
790
791 dprintk("--> %s\n", __func__);
792 spin_lock(&lo->plh_inode->i_lock);
793 if (pnfs_layoutgets_blocked(lo)) {
794 status = -EAGAIN;
795 } else if (!nfs4_valid_open_stateid(open_state)) {
796 status = -EBADF;
797 } else if (list_empty(&lo->plh_segs) ||
798 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
799 int seq;
800
801 do {
802 seq = read_seqbegin(&open_state->seqlock);
803 nfs4_stateid_copy(dst, &open_state->stateid);
804 } while (read_seqretry(&open_state->seqlock, seq));
805 } else
806 nfs4_stateid_copy(dst, &lo->plh_stateid);
807 spin_unlock(&lo->plh_inode->i_lock);
808 dprintk("<-- %s\n", __func__);
809 return status;
810}
811
812/* 799/*
813* Get layout from server. 800 * Get layout from server.
814* for now, assume that whole file layouts are requested. 801 * for now, assume that whole file layouts are requested.
815* arg->offset: 0 802 * arg->offset: 0
816* arg->length: all ones 803 * arg->length: all ones
817*/ 804 */
818static struct pnfs_layout_segment * 805static struct pnfs_layout_segment *
819send_layoutget(struct pnfs_layout_hdr *lo, 806send_layoutget(struct pnfs_layout_hdr *lo,
820 struct nfs_open_context *ctx, 807 struct nfs_open_context *ctx,
808 nfs4_stateid *stateid,
821 const struct pnfs_layout_range *range, 809 const struct pnfs_layout_range *range,
822 gfp_t gfp_flags) 810 long *timeout, gfp_t gfp_flags)
823{ 811{
824 struct inode *ino = lo->plh_inode; 812 struct inode *ino = lo->plh_inode;
825 struct nfs_server *server = NFS_SERVER(ino); 813 struct nfs_server *server = NFS_SERVER(ino);
826 struct nfs4_layoutget *lgp; 814 struct nfs4_layoutget *lgp;
827 struct pnfs_layout_segment *lseg;
828 loff_t i_size; 815 loff_t i_size;
829 816
830 dprintk("--> %s\n", __func__); 817 dprintk("--> %s\n", __func__);
@@ -834,40 +821,31 @@ send_layoutget(struct pnfs_layout_hdr *lo,
834 * store in lseg. If we race with a concurrent seqid morphing 821 * store in lseg. If we race with a concurrent seqid morphing
835 * op, then re-send the LAYOUTGET. 822 * op, then re-send the LAYOUTGET.
836 */ 823 */
837 do { 824 lgp = kzalloc(sizeof(*lgp), gfp_flags);
838 lgp = kzalloc(sizeof(*lgp), gfp_flags); 825 if (lgp == NULL)
839 if (lgp == NULL) 826 return ERR_PTR(-ENOMEM);
840 return NULL;
841
842 i_size = i_size_read(ino);
843
844 lgp->args.minlength = PAGE_SIZE;
845 if (lgp->args.minlength > range->length)
846 lgp->args.minlength = range->length;
847 if (range->iomode == IOMODE_READ) {
848 if (range->offset >= i_size)
849 lgp->args.minlength = 0;
850 else if (i_size - range->offset < lgp->args.minlength)
851 lgp->args.minlength = i_size - range->offset;
852 }
853 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
854 pnfs_copy_range(&lgp->args.range, range);
855 lgp->args.type = server->pnfs_curr_ld->id;
856 lgp->args.inode = ino;
857 lgp->args.ctx = get_nfs_open_context(ctx);
858 lgp->gfp_flags = gfp_flags;
859 lgp->cred = lo->plh_lc_cred;
860
861 lseg = nfs4_proc_layoutget(lgp, gfp_flags);
862 } while (lseg == ERR_PTR(-EAGAIN));
863
864 if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
865 lseg = NULL;
866 else
867 pnfs_layout_clear_fail_bit(lo,
868 pnfs_iomode_to_fail_bit(range->iomode));
869 827
870 return lseg; 828 i_size = i_size_read(ino);
829
830 lgp->args.minlength = PAGE_SIZE;
831 if (lgp->args.minlength > range->length)
832 lgp->args.minlength = range->length;
833 if (range->iomode == IOMODE_READ) {
834 if (range->offset >= i_size)
835 lgp->args.minlength = 0;
836 else if (i_size - range->offset < lgp->args.minlength)
837 lgp->args.minlength = i_size - range->offset;
838 }
839 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
840 pnfs_copy_range(&lgp->args.range, range);
841 lgp->args.type = server->pnfs_curr_ld->id;
842 lgp->args.inode = ino;
843 lgp->args.ctx = get_nfs_open_context(ctx);
844 nfs4_stateid_copy(&lgp->args.stateid, stateid);
845 lgp->gfp_flags = gfp_flags;
846 lgp->cred = lo->plh_lc_cred;
847
848 return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
871} 849}
872 850
873static void pnfs_clear_layoutcommit(struct inode *inode, 851static void pnfs_clear_layoutcommit(struct inode *inode,
@@ -899,6 +877,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
899 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) 877 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
900 return false; 878 return false;
901 lo->plh_return_iomode = 0; 879 lo->plh_return_iomode = 0;
880 lo->plh_return_seq = 0;
902 pnfs_get_layout_hdr(lo); 881 pnfs_get_layout_hdr(lo);
903 clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 882 clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
904 return true; 883 return true;
@@ -969,6 +948,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
969 bool send; 948 bool send;
970 949
971 nfs4_stateid_copy(&stateid, &lo->plh_stateid); 950 nfs4_stateid_copy(&stateid, &lo->plh_stateid);
951 stateid.seqid = cpu_to_be32(lo->plh_return_seq);
972 iomode = lo->plh_return_iomode; 952 iomode = lo->plh_return_iomode;
973 send = pnfs_prepare_layoutreturn(lo); 953 send = pnfs_prepare_layoutreturn(lo);
974 spin_unlock(&inode->i_lock); 954 spin_unlock(&inode->i_lock);
@@ -1012,7 +992,7 @@ _pnfs_return_layout(struct inode *ino)
1012 pnfs_get_layout_hdr(lo); 992 pnfs_get_layout_hdr(lo);
1013 empty = list_empty(&lo->plh_segs); 993 empty = list_empty(&lo->plh_segs);
1014 pnfs_clear_layoutcommit(ino, &tmp_list); 994 pnfs_clear_layoutcommit(ino, &tmp_list);
1015 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 995 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
1016 996
1017 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { 997 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
1018 struct pnfs_layout_range range = { 998 struct pnfs_layout_range range = {
@@ -1341,23 +1321,28 @@ out_existing:
1341 1321
1342/* 1322/*
1343 * iomode matching rules: 1323 * iomode matching rules:
1344 * iomode lseg match 1324 * iomode lseg strict match
1345 * ----- ----- ----- 1325 * iomode
1346 * ANY READ true 1326 * ----- ----- ------ -----
1347 * ANY RW true 1327 * ANY READ N/A true
1348 * RW READ false 1328 * ANY RW N/A true
1349 * RW RW true 1329 * RW READ N/A false
1350 * READ READ true 1330 * RW RW N/A true
1351 * READ RW true 1331 * READ READ N/A true
1332 * READ RW true false
1333 * READ RW false true
1352 */ 1334 */
1353static bool 1335static bool
1354pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, 1336pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1355 const struct pnfs_layout_range *range) 1337 const struct pnfs_layout_range *range,
1338 bool strict_iomode)
1356{ 1339{
1357 struct pnfs_layout_range range1; 1340 struct pnfs_layout_range range1;
1358 1341
1359 if ((range->iomode == IOMODE_RW && 1342 if ((range->iomode == IOMODE_RW &&
1360 ls_range->iomode != IOMODE_RW) || 1343 ls_range->iomode != IOMODE_RW) ||
1344 (range->iomode != ls_range->iomode &&
1345 strict_iomode == true) ||
1361 !pnfs_lseg_range_intersecting(ls_range, range)) 1346 !pnfs_lseg_range_intersecting(ls_range, range))
1362 return 0; 1347 return 0;
1363 1348
@@ -1372,7 +1357,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1372 */ 1357 */
1373static struct pnfs_layout_segment * 1358static struct pnfs_layout_segment *
1374pnfs_find_lseg(struct pnfs_layout_hdr *lo, 1359pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1375 struct pnfs_layout_range *range) 1360 struct pnfs_layout_range *range,
1361 bool strict_iomode)
1376{ 1362{
1377 struct pnfs_layout_segment *lseg, *ret = NULL; 1363 struct pnfs_layout_segment *lseg, *ret = NULL;
1378 1364
@@ -1381,7 +1367,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1381 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1367 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1382 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1368 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1383 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && 1369 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1384 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1370 pnfs_lseg_range_match(&lseg->pls_range, range,
1371 strict_iomode)) {
1385 ret = pnfs_get_lseg(lseg); 1372 ret = pnfs_get_lseg(lseg);
1386 break; 1373 break;
1387 } 1374 }
@@ -1498,6 +1485,7 @@ pnfs_update_layout(struct inode *ino,
1498 loff_t pos, 1485 loff_t pos,
1499 u64 count, 1486 u64 count,
1500 enum pnfs_iomode iomode, 1487 enum pnfs_iomode iomode,
1488 bool strict_iomode,
1501 gfp_t gfp_flags) 1489 gfp_t gfp_flags)
1502{ 1490{
1503 struct pnfs_layout_range arg = { 1491 struct pnfs_layout_range arg = {
@@ -1505,27 +1493,30 @@ pnfs_update_layout(struct inode *ino,
1505 .offset = pos, 1493 .offset = pos,
1506 .length = count, 1494 .length = count,
1507 }; 1495 };
1508 unsigned pg_offset; 1496 unsigned pg_offset, seq;
1509 struct nfs_server *server = NFS_SERVER(ino); 1497 struct nfs_server *server = NFS_SERVER(ino);
1510 struct nfs_client *clp = server->nfs_client; 1498 struct nfs_client *clp = server->nfs_client;
1511 struct pnfs_layout_hdr *lo; 1499 struct pnfs_layout_hdr *lo = NULL;
1512 struct pnfs_layout_segment *lseg = NULL; 1500 struct pnfs_layout_segment *lseg = NULL;
1501 nfs4_stateid stateid;
1502 long timeout = 0;
1503 unsigned long giveup = jiffies + rpc_get_timeout(server->client);
1513 bool first; 1504 bool first;
1514 1505
1515 if (!pnfs_enabled_sb(NFS_SERVER(ino))) { 1506 if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1516 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1507 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1517 PNFS_UPDATE_LAYOUT_NO_PNFS); 1508 PNFS_UPDATE_LAYOUT_NO_PNFS);
1518 goto out; 1509 goto out;
1519 } 1510 }
1520 1511
1521 if (iomode == IOMODE_READ && i_size_read(ino) == 0) { 1512 if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
1522 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1513 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1523 PNFS_UPDATE_LAYOUT_RD_ZEROLEN); 1514 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
1524 goto out; 1515 goto out;
1525 } 1516 }
1526 1517
1527 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { 1518 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1528 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1519 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1529 PNFS_UPDATE_LAYOUT_MDSTHRESH); 1520 PNFS_UPDATE_LAYOUT_MDSTHRESH);
1530 goto out; 1521 goto out;
1531 } 1522 }
@@ -1536,14 +1527,14 @@ lookup_again:
1536 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1527 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1537 if (lo == NULL) { 1528 if (lo == NULL) {
1538 spin_unlock(&ino->i_lock); 1529 spin_unlock(&ino->i_lock);
1539 trace_pnfs_update_layout(ino, pos, count, iomode, NULL, 1530 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1540 PNFS_UPDATE_LAYOUT_NOMEM); 1531 PNFS_UPDATE_LAYOUT_NOMEM);
1541 goto out; 1532 goto out;
1542 } 1533 }
1543 1534
1544 /* Do we even need to bother with this? */ 1535 /* Do we even need to bother with this? */
1545 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1536 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1546 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1537 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1547 PNFS_UPDATE_LAYOUT_BULK_RECALL); 1538 PNFS_UPDATE_LAYOUT_BULK_RECALL);
1548 dprintk("%s matches recall, use MDS\n", __func__); 1539 dprintk("%s matches recall, use MDS\n", __func__);
1549 goto out_unlock; 1540 goto out_unlock;
@@ -1551,14 +1542,34 @@ lookup_again:
1551 1542
1552 /* if LAYOUTGET already failed once we don't try again */ 1543 /* if LAYOUTGET already failed once we don't try again */
1553 if (pnfs_layout_io_test_failed(lo, iomode)) { 1544 if (pnfs_layout_io_test_failed(lo, iomode)) {
1554 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1545 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1555 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); 1546 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1556 goto out_unlock; 1547 goto out_unlock;
1557 } 1548 }
1558 1549
1559 first = list_empty(&lo->plh_segs); 1550 lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
1560 if (first) { 1551 if (lseg) {
1561 /* The first layoutget for the file. Need to serialize per 1552 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1553 PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1554 goto out_unlock;
1555 }
1556
1557 if (!nfs4_valid_open_stateid(ctx->state)) {
1558 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1559 PNFS_UPDATE_LAYOUT_INVALID_OPEN);
1560 goto out_unlock;
1561 }
1562
1563 /*
1564 * Choose a stateid for the LAYOUTGET. If we don't have a layout
1565 * stateid, or it has been invalidated, then we must use the open
1566 * stateid.
1567 */
1568 if (lo->plh_stateid.seqid == 0 ||
1569 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
1570
1571 /*
1572 * The first layoutget for the file. Need to serialize per
1562 * RFC 5661 Errata 3208. 1573 * RFC 5661 Errata 3208.
1563 */ 1574 */
1564 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, 1575 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
@@ -1567,18 +1578,17 @@ lookup_again:
1567 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, 1578 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1568 TASK_UNINTERRUPTIBLE); 1579 TASK_UNINTERRUPTIBLE);
1569 pnfs_put_layout_hdr(lo); 1580 pnfs_put_layout_hdr(lo);
1581 dprintk("%s retrying\n", __func__);
1570 goto lookup_again; 1582 goto lookup_again;
1571 } 1583 }
1584
1585 first = true;
1586 do {
1587 seq = read_seqbegin(&ctx->state->seqlock);
1588 nfs4_stateid_copy(&stateid, &ctx->state->stateid);
1589 } while (read_seqretry(&ctx->state->seqlock, seq));
1572 } else { 1590 } else {
1573 /* Check to see if the layout for the given range 1591 nfs4_stateid_copy(&stateid, &lo->plh_stateid);
1574 * already exists
1575 */
1576 lseg = pnfs_find_lseg(lo, &arg);
1577 if (lseg) {
1578 trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1579 PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1580 goto out_unlock;
1581 }
1582 } 1592 }
1583 1593
1584 /* 1594 /*
@@ -1593,15 +1603,17 @@ lookup_again:
1593 pnfs_clear_first_layoutget(lo); 1603 pnfs_clear_first_layoutget(lo);
1594 pnfs_put_layout_hdr(lo); 1604 pnfs_put_layout_hdr(lo);
1595 dprintk("%s retrying\n", __func__); 1605 dprintk("%s retrying\n", __func__);
1606 trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1607 lseg, PNFS_UPDATE_LAYOUT_RETRY);
1596 goto lookup_again; 1608 goto lookup_again;
1597 } 1609 }
1598 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1610 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1599 PNFS_UPDATE_LAYOUT_RETURN); 1611 PNFS_UPDATE_LAYOUT_RETURN);
1600 goto out_put_layout_hdr; 1612 goto out_put_layout_hdr;
1601 } 1613 }
1602 1614
1603 if (pnfs_layoutgets_blocked(lo)) { 1615 if (pnfs_layoutgets_blocked(lo)) {
1604 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 1616 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1605 PNFS_UPDATE_LAYOUT_BLOCKED); 1617 PNFS_UPDATE_LAYOUT_BLOCKED);
1606 goto out_unlock; 1618 goto out_unlock;
1607 } 1619 }
@@ -1626,10 +1638,36 @@ lookup_again:
1626 if (arg.length != NFS4_MAX_UINT64) 1638 if (arg.length != NFS4_MAX_UINT64)
1627 arg.length = PAGE_ALIGN(arg.length); 1639 arg.length = PAGE_ALIGN(arg.length);
1628 1640
1629 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1641 lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
1630 atomic_dec(&lo->plh_outstanding); 1642 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1631 trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1632 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); 1643 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1644 if (IS_ERR(lseg)) {
1645 switch(PTR_ERR(lseg)) {
1646 case -ERECALLCONFLICT:
1647 if (time_after(jiffies, giveup))
1648 lseg = NULL;
1649 /* Fallthrough */
1650 case -EAGAIN:
1651 pnfs_put_layout_hdr(lo);
1652 if (first)
1653 pnfs_clear_first_layoutget(lo);
1654 if (lseg) {
1655 trace_pnfs_update_layout(ino, pos, count,
1656 iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
1657 goto lookup_again;
1658 }
1659 /* Fallthrough */
1660 default:
1661 if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
1662 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1663 lseg = NULL;
1664 }
1665 }
1666 } else {
1667 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1668 }
1669
1670 atomic_dec(&lo->plh_outstanding);
1633out_put_layout_hdr: 1671out_put_layout_hdr:
1634 if (first) 1672 if (first)
1635 pnfs_clear_first_layoutget(lo); 1673 pnfs_clear_first_layoutget(lo);
@@ -1678,38 +1716,36 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1678 struct pnfs_layout_segment *lseg; 1716 struct pnfs_layout_segment *lseg;
1679 struct inode *ino = lo->plh_inode; 1717 struct inode *ino = lo->plh_inode;
1680 LIST_HEAD(free_me); 1718 LIST_HEAD(free_me);
1681 int status = -EINVAL;
1682 1719
1683 if (!pnfs_sanity_check_layout_range(&res->range)) 1720 if (!pnfs_sanity_check_layout_range(&res->range))
1684 goto out; 1721 return ERR_PTR(-EINVAL);
1685 1722
1686 /* Inject layout blob into I/O device driver */ 1723 /* Inject layout blob into I/O device driver */
1687 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 1724 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
1688 if (!lseg || IS_ERR(lseg)) { 1725 if (IS_ERR_OR_NULL(lseg)) {
1689 if (!lseg) 1726 if (!lseg)
1690 status = -ENOMEM; 1727 lseg = ERR_PTR(-ENOMEM);
1691 else 1728
1692 status = PTR_ERR(lseg); 1729 dprintk("%s: Could not allocate layout: error %ld\n",
1693 dprintk("%s: Could not allocate layout: error %d\n", 1730 __func__, PTR_ERR(lseg));
1694 __func__, status); 1731 return lseg;
1695 goto out;
1696 } 1732 }
1697 1733
1698 init_lseg(lo, lseg); 1734 init_lseg(lo, lseg);
1699 lseg->pls_range = res->range; 1735 lseg->pls_range = res->range;
1736 lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
1700 1737
1701 spin_lock(&ino->i_lock); 1738 spin_lock(&ino->i_lock);
1702 if (pnfs_layoutgets_blocked(lo)) { 1739 if (pnfs_layoutgets_blocked(lo)) {
1703 dprintk("%s forget reply due to state\n", __func__); 1740 dprintk("%s forget reply due to state\n", __func__);
1704 goto out_forget_reply; 1741 goto out_forget;
1705 } 1742 }
1706 1743
1707 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { 1744 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1708 /* existing state ID, make sure the sequence number matches. */ 1745 /* existing state ID, make sure the sequence number matches. */
1709 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { 1746 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1710 dprintk("%s forget reply due to sequence\n", __func__); 1747 dprintk("%s forget reply due to sequence\n", __func__);
1711 status = -EAGAIN; 1748 goto out_forget;
1712 goto out_forget_reply;
1713 } 1749 }
1714 pnfs_set_layout_stateid(lo, &res->stateid, false); 1750 pnfs_set_layout_stateid(lo, &res->stateid, false);
1715 } else { 1751 } else {
@@ -1718,7 +1754,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1718 * inode invalid, and don't bother validating the stateid 1754 * inode invalid, and don't bother validating the stateid
1719 * sequence number. 1755 * sequence number.
1720 */ 1756 */
1721 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); 1757 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
1722 1758
1723 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); 1759 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1724 lo->plh_barrier = be32_to_cpu(res->stateid.seqid); 1760 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
@@ -1735,18 +1771,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1735 spin_unlock(&ino->i_lock); 1771 spin_unlock(&ino->i_lock);
1736 pnfs_free_lseg_list(&free_me); 1772 pnfs_free_lseg_list(&free_me);
1737 return lseg; 1773 return lseg;
1738out:
1739 return ERR_PTR(status);
1740 1774
1741out_forget_reply: 1775out_forget:
1742 spin_unlock(&ino->i_lock); 1776 spin_unlock(&ino->i_lock);
1743 lseg->pls_layout = lo; 1777 lseg->pls_layout = lo;
1744 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 1778 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
1745 goto out; 1779 return ERR_PTR(-EAGAIN);
1746} 1780}
1747 1781
1748static void 1782static void
1749pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) 1783pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
1784 u32 seq)
1750{ 1785{
1751 if (lo->plh_return_iomode == iomode) 1786 if (lo->plh_return_iomode == iomode)
1752 return; 1787 return;
@@ -1754,6 +1789,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
1754 iomode = IOMODE_ANY; 1789 iomode = IOMODE_ANY;
1755 lo->plh_return_iomode = iomode; 1790 lo->plh_return_iomode = iomode;
1756 set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 1791 set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
1792 if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
1793 lo->plh_return_seq = seq;
1757} 1794}
1758 1795
1759/** 1796/**
@@ -1769,7 +1806,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
1769int 1806int
1770pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, 1807pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1771 struct list_head *tmp_list, 1808 struct list_head *tmp_list,
1772 const struct pnfs_layout_range *return_range) 1809 const struct pnfs_layout_range *return_range,
1810 u32 seq)
1773{ 1811{
1774 struct pnfs_layout_segment *lseg, *next; 1812 struct pnfs_layout_segment *lseg, *next;
1775 int remaining = 0; 1813 int remaining = 0;
@@ -1792,8 +1830,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1792 continue; 1830 continue;
1793 remaining++; 1831 remaining++;
1794 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); 1832 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1795 pnfs_set_plh_return_iomode(lo, return_range->iomode);
1796 } 1833 }
1834
1835 if (remaining)
1836 pnfs_set_plh_return_info(lo, return_range->iomode, seq);
1837
1797 return remaining; 1838 return remaining;
1798} 1839}
1799 1840
@@ -1810,13 +1851,14 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
1810 bool return_now = false; 1851 bool return_now = false;
1811 1852
1812 spin_lock(&inode->i_lock); 1853 spin_lock(&inode->i_lock);
1813 pnfs_set_plh_return_iomode(lo, range.iomode); 1854 pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
1814 /* 1855 /*
1815 * mark all matching lsegs so that we are sure to have no live 1856 * mark all matching lsegs so that we are sure to have no live
1816 * segments at hand when sending layoutreturn. See pnfs_put_lseg() 1857 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1817 * for how it works. 1858 * for how it works.
1818 */ 1859 */
1819 if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { 1860 if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
1861 &range, lseg->pls_seq)) {
1820 nfs4_stateid stateid; 1862 nfs4_stateid stateid;
1821 enum pnfs_iomode iomode = lo->plh_return_iomode; 1863 enum pnfs_iomode iomode = lo->plh_return_iomode;
1822 1864
@@ -1849,6 +1891,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
1849 req_offset(req), 1891 req_offset(req),
1850 rd_size, 1892 rd_size,
1851 IOMODE_READ, 1893 IOMODE_READ,
1894 false,
1852 GFP_KERNEL); 1895 GFP_KERNEL);
1853 if (IS_ERR(pgio->pg_lseg)) { 1896 if (IS_ERR(pgio->pg_lseg)) {
1854 pgio->pg_error = PTR_ERR(pgio->pg_lseg); 1897 pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1873,6 +1916,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1873 req_offset(req), 1916 req_offset(req),
1874 wb_size, 1917 wb_size,
1875 IOMODE_RW, 1918 IOMODE_RW,
1919 false,
1876 GFP_NOFS); 1920 GFP_NOFS);
1877 if (IS_ERR(pgio->pg_lseg)) { 1921 if (IS_ERR(pgio->pg_lseg)) {
1878 pgio->pg_error = PTR_ERR(pgio->pg_lseg); 1922 pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -2143,12 +2187,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
2143} 2187}
2144 2188
2145/* Resend all requests through pnfs. */ 2189/* Resend all requests through pnfs. */
2146int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) 2190void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2147{ 2191{
2148 struct nfs_pageio_descriptor pgio; 2192 struct nfs_pageio_descriptor pgio;
2149 2193
2150 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); 2194 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2151 return nfs_pageio_resend(&pgio, hdr); 2195 nfs_pageio_init_read(&pgio, hdr->inode, false,
2196 hdr->completion_ops);
2197 hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
2198 }
2152} 2199}
2153EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); 2200EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2154 2201
@@ -2158,12 +2205,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
2158 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2205 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2159 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2206 struct pnfs_layout_segment *lseg = desc->pg_lseg;
2160 enum pnfs_try_status trypnfs; 2207 enum pnfs_try_status trypnfs;
2161 int err = 0;
2162 2208
2163 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2209 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
2164 if (trypnfs == PNFS_TRY_AGAIN) 2210 if (trypnfs == PNFS_TRY_AGAIN)
2165 err = pnfs_read_resend_pnfs(hdr); 2211 pnfs_read_resend_pnfs(hdr);
2166 if (trypnfs == PNFS_NOT_ATTEMPTED || err) 2212 if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
2167 pnfs_read_through_mds(desc, hdr); 2213 pnfs_read_through_mds(desc, hdr);
2168} 2214}
2169 2215
@@ -2405,7 +2451,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2405 spin_lock(&inode->i_lock); 2451 spin_lock(&inode->i_lock);
2406 if (!NFS_I(inode)->layout) { 2452 if (!NFS_I(inode)->layout) {
2407 spin_unlock(&inode->i_lock); 2453 spin_unlock(&inode->i_lock);
2408 goto out; 2454 goto out_clear_layoutstats;
2409 } 2455 }
2410 hdr = NFS_I(inode)->layout; 2456 hdr = NFS_I(inode)->layout;
2411 pnfs_get_layout_hdr(hdr); 2457 pnfs_get_layout_hdr(hdr);
@@ -2434,6 +2480,7 @@ out_free:
2434 kfree(data); 2480 kfree(data);
2435out_put: 2481out_put:
2436 pnfs_put_layout_hdr(hdr); 2482 pnfs_put_layout_hdr(hdr);
2483out_clear_layoutstats:
2437 smp_mb__before_atomic(); 2484 smp_mb__before_atomic();
2438 clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); 2485 clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
2439 smp_mb__after_atomic(); 2486 smp_mb__after_atomic();
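
A minimal sketch (not part of the patch) of the stateid selection that pnfs_update_layout() now performs before issuing LAYOUTGET, per the hunks above: the open stateid is used only when no valid layout stateid exists. The names follow the hunk; the wrapper function, and the assumption that i_lock is held and ctx->state has already been validated, are illustrative.

	static void choose_lgt_stateid(nfs4_stateid *stateid,
				       struct pnfs_layout_hdr *lo,
				       struct nfs_open_context *ctx)
	{
		unsigned int seq;

		if (lo->plh_stateid.seqid == 0 ||
		    test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
			/* no usable layout stateid: copy the open stateid,
			 * guarded by the state seqlock */
			do {
				seq = read_seqbegin(&ctx->state->seqlock);
				nfs4_stateid_copy(stateid, &ctx->state->stateid);
			} while (read_seqretry(&ctx->state->seqlock, seq));
		} else {
			nfs4_stateid_copy(stateid, &lo->plh_stateid);
		}
	}
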
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1ac1db5f6dad..b21bd0bee784 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -64,6 +64,7 @@ struct pnfs_layout_segment {
64 struct list_head pls_lc_list; 64 struct list_head pls_lc_list;
65 struct pnfs_layout_range pls_range; 65 struct pnfs_layout_range pls_range;
66 atomic_t pls_refcount; 66 atomic_t pls_refcount;
67 u32 pls_seq;
67 unsigned long pls_flags; 68 unsigned long pls_flags;
68 struct pnfs_layout_hdr *pls_layout; 69 struct pnfs_layout_hdr *pls_layout;
69 struct work_struct pls_work; 70 struct work_struct pls_work;
@@ -194,6 +195,7 @@ struct pnfs_layout_hdr {
194 unsigned long plh_flags; 195 unsigned long plh_flags;
195 nfs4_stateid plh_stateid; 196 nfs4_stateid plh_stateid;
196 u32 plh_barrier; /* ignore lower seqids */ 197 u32 plh_barrier; /* ignore lower seqids */
198 u32 plh_return_seq;
197 enum pnfs_iomode plh_return_iomode; 199 enum pnfs_iomode plh_return_iomode;
198 loff_t plh_lwb; /* last write byte for layoutcommit */ 200 loff_t plh_lwb; /* last write byte for layoutcommit */
199 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 201 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
226extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 228extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
227 struct pnfs_device *dev, 229 struct pnfs_device *dev,
228 struct rpc_cred *cred); 230 struct rpc_cred *cred);
229extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 231extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags);
230extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); 232extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
231 233
232/* pnfs.c */ 234/* pnfs.c */
@@ -258,16 +260,14 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
258void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, 260void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
259 const nfs4_stateid *new, 261 const nfs4_stateid *new,
260 bool update_barrier); 262 bool update_barrier);
261int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
262 struct pnfs_layout_hdr *lo,
263 const struct pnfs_layout_range *range,
264 struct nfs4_state *open_state);
265int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 263int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
266 struct list_head *tmp_list, 264 struct list_head *tmp_list,
267 const struct pnfs_layout_range *recall_range); 265 const struct pnfs_layout_range *recall_range,
266 u32 seq);
268int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, 267int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
269 struct list_head *tmp_list, 268 struct list_head *tmp_list,
270 const struct pnfs_layout_range *recall_range); 269 const struct pnfs_layout_range *recall_range,
270 u32 seq);
271bool pnfs_roc(struct inode *ino); 271bool pnfs_roc(struct inode *ino);
272void pnfs_roc_release(struct inode *ino); 272void pnfs_roc_release(struct inode *ino);
273void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 273void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -282,12 +282,13 @@ int _pnfs_return_layout(struct inode *);
282int pnfs_commit_and_return_layout(struct inode *); 282int pnfs_commit_and_return_layout(struct inode *);
283void pnfs_ld_write_done(struct nfs_pgio_header *); 283void pnfs_ld_write_done(struct nfs_pgio_header *);
284void pnfs_ld_read_done(struct nfs_pgio_header *); 284void pnfs_ld_read_done(struct nfs_pgio_header *);
285int pnfs_read_resend_pnfs(struct nfs_pgio_header *); 285void pnfs_read_resend_pnfs(struct nfs_pgio_header *);
286struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 286struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
287 struct nfs_open_context *ctx, 287 struct nfs_open_context *ctx,
288 loff_t pos, 288 loff_t pos,
289 u64 count, 289 u64 count,
290 enum pnfs_iomode iomode, 290 enum pnfs_iomode iomode,
291 bool strict_iomode,
291 gfp_t gfp_flags); 292 gfp_t gfp_flags);
292void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); 293void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
293 294
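
An illustrative call site for the new strict_iomode argument to pnfs_update_layout() declared above: passing true presumably restricts segment matching to the requested iomode rather than accepting an IOMODE_RW segment for a read. The surrounding variables (inode, ctx, pos, count) are the caller's, and the MDS fallback shown is an assumption about how a layout driver would react.

	lseg = pnfs_update_layout(inode, ctx, pos, count,
				  IOMODE_READ, true, GFP_KERNEL);
	if (IS_ERR(lseg))
		lseg = NULL;	/* fall back to I/O through the MDS */
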
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 4aaed890048f..0dfc476da3e1 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
61 61
62/* The generic layer is about to remove the req from the commit list. 62/* The generic layer is about to remove the req from the commit list.
63 * If this will make the bucket empty, it will need to put the lseg reference. 63 * If this will make the bucket empty, it will need to put the lseg reference.
64 * Note this must be called holding the inode (/cinfo) lock 64 * Note this must be called holding i_lock
65 */ 65 */
66void 66void
67pnfs_generic_clear_request_commit(struct nfs_page *req, 67pnfs_generic_clear_request_commit(struct nfs_page *req,
@@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
98 if (!nfs_lock_request(req)) 98 if (!nfs_lock_request(req))
99 continue; 99 continue;
100 kref_get(&req->wb_kref); 100 kref_get(&req->wb_kref);
101 if (cond_resched_lock(cinfo->lock)) 101 if (cond_resched_lock(&cinfo->inode->i_lock))
102 list_safe_reset_next(req, tmp, wb_list); 102 list_safe_reset_next(req, tmp, wb_list);
103 nfs_request_remove_commit_list(req, cinfo); 103 nfs_request_remove_commit_list(req, cinfo);
104 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); 104 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
119 struct list_head *dst = &bucket->committing; 119 struct list_head *dst = &bucket->committing;
120 int ret; 120 int ret;
121 121
122 lockdep_assert_held(cinfo->lock); 122 lockdep_assert_held(&cinfo->inode->i_lock);
123 ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); 123 ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
124 if (ret) { 124 if (ret) {
125 cinfo->ds->nwritten -= ret; 125 cinfo->ds->nwritten -= ret;
@@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
142{ 142{
143 int i, rv = 0, cnt; 143 int i, rv = 0, cnt;
144 144
145 lockdep_assert_held(cinfo->lock); 145 lockdep_assert_held(&cinfo->inode->i_lock);
146 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { 146 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
147 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], 147 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
148 cinfo, max); 148 cinfo, max);
@@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
161 struct pnfs_layout_segment *freeme; 161 struct pnfs_layout_segment *freeme;
162 int i; 162 int i;
163 163
164 lockdep_assert_held(cinfo->lock); 164 lockdep_assert_held(&cinfo->inode->i_lock);
165restart: 165restart:
166 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 166 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
167 if (pnfs_generic_transfer_commit_list(&b->written, dst, 167 if (pnfs_generic_transfer_commit_list(&b->written, dst,
168 cinfo, 0)) { 168 cinfo, 0)) {
169 freeme = b->wlseg; 169 freeme = b->wlseg;
170 b->wlseg = NULL; 170 b->wlseg = NULL;
171 spin_unlock(cinfo->lock); 171 spin_unlock(&cinfo->inode->i_lock);
172 pnfs_put_lseg(freeme); 172 pnfs_put_lseg(freeme);
173 spin_lock(cinfo->lock); 173 spin_lock(&cinfo->inode->i_lock);
174 goto restart; 174 goto restart;
175 } 175 }
176 } 176 }
@@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
186 LIST_HEAD(pages); 186 LIST_HEAD(pages);
187 int i; 187 int i;
188 188
189 spin_lock(cinfo->lock); 189 spin_lock(&cinfo->inode->i_lock);
190 for (i = idx; i < fl_cinfo->nbuckets; i++) { 190 for (i = idx; i < fl_cinfo->nbuckets; i++) {
191 bucket = &fl_cinfo->buckets[i]; 191 bucket = &fl_cinfo->buckets[i];
192 if (list_empty(&bucket->committing)) 192 if (list_empty(&bucket->committing))
@@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
194 freeme = bucket->clseg; 194 freeme = bucket->clseg;
195 bucket->clseg = NULL; 195 bucket->clseg = NULL;
196 list_splice_init(&bucket->committing, &pages); 196 list_splice_init(&bucket->committing, &pages);
197 spin_unlock(cinfo->lock); 197 spin_unlock(&cinfo->inode->i_lock);
198 nfs_retry_commit(&pages, freeme, cinfo, i); 198 nfs_retry_commit(&pages, freeme, cinfo, i);
199 pnfs_put_lseg(freeme); 199 pnfs_put_lseg(freeme);
200 spin_lock(cinfo->lock); 200 spin_lock(&cinfo->inode->i_lock);
201 } 201 }
202 spin_unlock(cinfo->lock); 202 spin_unlock(&cinfo->inode->i_lock);
203} 203}
204 204
205static unsigned int 205static unsigned int
@@ -238,14 +238,31 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages,
238 struct pnfs_commit_bucket *bucket; 238 struct pnfs_commit_bucket *bucket;
239 239
240 bucket = &cinfo->ds->buckets[data->ds_commit_index]; 240 bucket = &cinfo->ds->buckets[data->ds_commit_index];
241 spin_lock(cinfo->lock); 241 spin_lock(&cinfo->inode->i_lock);
242 list_splice_init(&bucket->committing, pages); 242 list_splice_init(&bucket->committing, pages);
243 data->lseg = bucket->clseg; 243 data->lseg = bucket->clseg;
244 bucket->clseg = NULL; 244 bucket->clseg = NULL;
245 spin_unlock(cinfo->lock); 245 spin_unlock(&cinfo->inode->i_lock);
246 246
247} 247}
248 248
249/* Helper function for pnfs_generic_commit_pagelist to catch an empty
250 * page list. This can happen when two commits race. */
251static bool
252pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
253 struct nfs_commit_data *data,
254 struct nfs_commit_info *cinfo)
255{
256 if (list_empty(pages)) {
257 if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
258 wake_up_atomic_t(&cinfo->mds->rpcs_out);
259 nfs_commitdata_release(data);
260 return true;
261 }
262
263 return false;
264}
265
249/* This follows nfs_commit_list pretty closely */ 266/* This follows nfs_commit_list pretty closely */
250int 267int
251pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, 268pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -280,6 +297,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
280 list_for_each_entry_safe(data, tmp, &list, pages) { 297 list_for_each_entry_safe(data, tmp, &list, pages) {
281 list_del_init(&data->pages); 298 list_del_init(&data->pages);
282 if (data->ds_commit_index < 0) { 299 if (data->ds_commit_index < 0) {
300 /* another commit raced with us */
301 if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
302 data, cinfo))
303 continue;
304
283 nfs_init_commit(data, mds_pages, NULL, cinfo); 305 nfs_init_commit(data, mds_pages, NULL, cinfo);
284 nfs_initiate_commit(NFS_CLIENT(inode), data, 306 nfs_initiate_commit(NFS_CLIENT(inode), data,
285 NFS_PROTO(data->inode), 307 NFS_PROTO(data->inode),
@@ -288,6 +310,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
288 LIST_HEAD(pages); 310 LIST_HEAD(pages);
289 311
290 pnfs_fetch_commit_bucket_list(&pages, data, cinfo); 312 pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
313
314 /* another commit raced with us */
315 if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
316 data, cinfo))
317 continue;
318
291 nfs_init_commit(data, &pages, data->lseg, cinfo); 319 nfs_init_commit(data, &pages, data->lseg, cinfo);
292 initiate_commit(data, how); 320 initiate_commit(data, how);
293 } 321 }
@@ -874,12 +902,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
874 struct list_head *list; 902 struct list_head *list;
875 struct pnfs_commit_bucket *buckets; 903 struct pnfs_commit_bucket *buckets;
876 904
877 spin_lock(cinfo->lock); 905 spin_lock(&cinfo->inode->i_lock);
878 buckets = cinfo->ds->buckets; 906 buckets = cinfo->ds->buckets;
879 list = &buckets[ds_commit_idx].written; 907 list = &buckets[ds_commit_idx].written;
880 if (list_empty(list)) { 908 if (list_empty(list)) {
881 if (!pnfs_is_valid_lseg(lseg)) { 909 if (!pnfs_is_valid_lseg(lseg)) {
882 spin_unlock(cinfo->lock); 910 spin_unlock(&cinfo->inode->i_lock);
883 cinfo->completion_ops->resched_write(cinfo, req); 911 cinfo->completion_ops->resched_write(cinfo, req);
884 return; 912 return;
885 } 913 }
@@ -896,7 +924,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
896 cinfo->ds->nwritten++; 924 cinfo->ds->nwritten++;
897 925
898 nfs_request_add_commit_list_locked(req, list, cinfo); 926 nfs_request_add_commit_list_locked(req, list, cinfo);
899 spin_unlock(cinfo->lock); 927 spin_unlock(&cinfo->inode->i_lock);
900 nfs_mark_page_unstable(req->wb_page, cinfo); 928 nfs_mark_page_unstable(req->wb_page, cinfo);
901} 929}
902EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); 930EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
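
The locking convention after this change, in sketch form: struct nfs_commit_info now carries the inode rather than a pointer to its spinlock, so commit-list manipulation takes inode->i_lock through cinfo->inode. The wrapper below is illustrative only.

	static void with_commit_lock(struct nfs_commit_info *cinfo)
	{
		spin_lock(&cinfo->inode->i_lock);
		/* walk or splice commit buckets that i_lock protects */
		spin_unlock(&cinfo->inode->i_lock);
	}
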
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1268280244e..2137e0202f25 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = {
191 191
192enum { 192enum {
193 Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, 193 Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
194 Opt_xprt_rdma6,
194 195
195 Opt_xprt_err 196 Opt_xprt_err
196}; 197};
@@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = {
201 { Opt_xprt_tcp, "tcp" }, 202 { Opt_xprt_tcp, "tcp" },
202 { Opt_xprt_tcp6, "tcp6" }, 203 { Opt_xprt_tcp6, "tcp6" },
203 { Opt_xprt_rdma, "rdma" }, 204 { Opt_xprt_rdma, "rdma" },
205 { Opt_xprt_rdma6, "rdma6" },
204 206
205 { Opt_xprt_err, NULL } 207 { Opt_xprt_err, NULL }
206}; 208};
@@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw,
1456 mnt->flags |= NFS_MOUNT_TCP; 1458 mnt->flags |= NFS_MOUNT_TCP;
1457 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1459 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1458 break; 1460 break;
1461 case Opt_xprt_rdma6:
1462 protofamily = AF_INET6;
1459 case Opt_xprt_rdma: 1463 case Opt_xprt_rdma:
1460 /* vector side protocols to TCP */ 1464 /* vector side protocols to TCP */
1461 mnt->flags |= NFS_MOUNT_TCP; 1465 mnt->flags |= NFS_MOUNT_TCP;
@@ -2408,6 +2412,11 @@ static int nfs_compare_super_address(struct nfs_server *server1,
2408 struct nfs_server *server2) 2412 struct nfs_server *server2)
2409{ 2413{
2410 struct sockaddr *sap1, *sap2; 2414 struct sockaddr *sap1, *sap2;
2415 struct rpc_xprt *xprt1 = server1->client->cl_xprt;
2416 struct rpc_xprt *xprt2 = server2->client->cl_xprt;
2417
2418 if (!net_eq(xprt1->xprt_net, xprt2->xprt_net))
2419 return 0;
2411 2420
2412 sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; 2421 sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
2413 sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; 2422 sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5f4fd53e5764..e1c74d3db64d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req)
245static int wb_priority(struct writeback_control *wbc) 245static int wb_priority(struct writeback_control *wbc)
246{ 246{
247 int ret = 0; 247 int ret = 0;
248 if (wbc->for_reclaim) 248
249 return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
250 if (wbc->sync_mode == WB_SYNC_ALL) 249 if (wbc->sync_mode == WB_SYNC_ALL)
251 ret = FLUSH_COND_STABLE; 250 ret = FLUSH_COND_STABLE;
252 return ret; 251 return ret;
@@ -737,7 +736,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
737 head = req->wb_head; 736 head = req->wb_head;
738 737
739 spin_lock(&inode->i_lock); 738 spin_lock(&inode->i_lock);
740 if (likely(!PageSwapCache(head->wb_page))) { 739 if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
741 set_page_private(head->wb_page, 0); 740 set_page_private(head->wb_page, 0);
742 ClearPagePrivate(head->wb_page); 741 ClearPagePrivate(head->wb_page);
743 smp_mb__after_atomic(); 742 smp_mb__after_atomic();
@@ -759,7 +758,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
759static void 758static void
760nfs_mark_request_dirty(struct nfs_page *req) 759nfs_mark_request_dirty(struct nfs_page *req)
761{ 760{
762 __set_page_dirty_nobuffers(req->wb_page); 761 if (req->wb_page)
762 __set_page_dirty_nobuffers(req->wb_page);
763} 763}
764 764
765/* 765/*
@@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
804 * number of outstanding requests requiring a commit as well as 804 * number of outstanding requests requiring a commit as well as
805 * the MM page stats. 805 * the MM page stats.
806 * 806 *
807 * The caller must hold the cinfo->lock, and the nfs_page lock. 807 * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
808 */ 808 */
809void 809void
810nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, 810nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -832,10 +832,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
832void 832void
833nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) 833nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
834{ 834{
835 spin_lock(cinfo->lock); 835 spin_lock(&cinfo->inode->i_lock);
836 nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); 836 nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
837 spin_unlock(cinfo->lock); 837 spin_unlock(&cinfo->inode->i_lock);
838 nfs_mark_page_unstable(req->wb_page, cinfo); 838 if (req->wb_page)
839 nfs_mark_page_unstable(req->wb_page, cinfo);
839} 840}
840EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); 841EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
841 842
@@ -864,7 +865,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
864static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, 865static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
865 struct inode *inode) 866 struct inode *inode)
866{ 867{
867 cinfo->lock = &inode->i_lock; 868 cinfo->inode = inode;
868 cinfo->mds = &NFS_I(inode)->commit_info; 869 cinfo->mds = &NFS_I(inode)->commit_info;
869 cinfo->ds = pnfs_get_ds_info(inode); 870 cinfo->ds = pnfs_get_ds_info(inode);
870 cinfo->dreq = NULL; 871 cinfo->dreq = NULL;
@@ -967,7 +968,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
967 return cinfo->mds->ncommit; 968 return cinfo->mds->ncommit;
968} 969}
969 970
970/* cinfo->lock held by caller */ 971/* cinfo->inode->i_lock held by caller */
971int 972int
972nfs_scan_commit_list(struct list_head *src, struct list_head *dst, 973nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
973 struct nfs_commit_info *cinfo, int max) 974 struct nfs_commit_info *cinfo, int max)
@@ -979,7 +980,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
979 if (!nfs_lock_request(req)) 980 if (!nfs_lock_request(req))
980 continue; 981 continue;
981 kref_get(&req->wb_kref); 982 kref_get(&req->wb_kref);
982 if (cond_resched_lock(cinfo->lock)) 983 if (cond_resched_lock(&cinfo->inode->i_lock))
983 list_safe_reset_next(req, tmp, wb_list); 984 list_safe_reset_next(req, tmp, wb_list);
984 nfs_request_remove_commit_list(req, cinfo); 985 nfs_request_remove_commit_list(req, cinfo);
985 nfs_list_add_request(req, dst); 986 nfs_list_add_request(req, dst);
@@ -1005,7 +1006,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
1005{ 1006{
1006 int ret = 0; 1007 int ret = 0;
1007 1008
1008 spin_lock(cinfo->lock); 1009 spin_lock(&cinfo->inode->i_lock);
1009 if (cinfo->mds->ncommit > 0) { 1010 if (cinfo->mds->ncommit > 0) {
1010 const int max = INT_MAX; 1011 const int max = INT_MAX;
1011 1012
@@ -1013,7 +1014,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
1013 cinfo, max); 1014 cinfo, max);
1014 ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); 1015 ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
1015 } 1016 }
1016 spin_unlock(cinfo->lock); 1017 spin_unlock(&cinfo->inode->i_lock);
1017 return ret; 1018 return ret;
1018} 1019}
1019 1020
@@ -1709,6 +1710,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
1709{ 1710{
1710 struct nfs_commit_data *data; 1711 struct nfs_commit_data *data;
1711 1712
1713 /* another commit raced with us */
1714 if (list_empty(head))
1715 return 0;
1716
1712 data = nfs_commitdata_alloc(); 1717 data = nfs_commitdata_alloc();
1713 1718
1714 if (!data) 1719 if (!data)
@@ -1724,6 +1729,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
1724 return -ENOMEM; 1729 return -ENOMEM;
1725} 1730}
1726 1731
1732int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
1733{
1734 struct inode *inode = file_inode(file);
1735 struct nfs_open_context *open;
1736 struct nfs_commit_info cinfo;
1737 struct nfs_page *req;
1738 int ret;
1739
1740 open = get_nfs_open_context(nfs_file_open_context(file));
1741 req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
1742 if (IS_ERR(req)) {
1743 ret = PTR_ERR(req);
1744 goto out_put;
1745 }
1746
1747 nfs_init_cinfo_from_inode(&cinfo, inode);
1748
1749 memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
1750 nfs_request_add_commit_list(req, &cinfo);
1751 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1752 if (ret > 0)
1753 ret = 0;
1754
1755 nfs_free_request(req);
1756out_put:
1757 put_nfs_open_context(open);
1758 return ret;
1759}
1760EXPORT_SYMBOL_GPL(nfs_commit_file);
1761
1727/* 1762/*
1728 * COMMIT call returned 1763 * COMMIT call returned
1729 */ 1764 */
@@ -1748,7 +1783,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1748 while (!list_empty(&data->pages)) { 1783 while (!list_empty(&data->pages)) {
1749 req = nfs_list_entry(data->pages.next); 1784 req = nfs_list_entry(data->pages.next);
1750 nfs_list_remove_request(req); 1785 nfs_list_remove_request(req);
1751 nfs_clear_page_commit(req->wb_page); 1786 if (req->wb_page)
1787 nfs_clear_page_commit(req->wb_page);
1752 1788
1753 dprintk("NFS: commit (%s/%llu %d@%lld)", 1789 dprintk("NFS: commit (%s/%llu %d@%lld)",
1754 req->wb_context->dentry->d_sb->s_id, 1790 req->wb_context->dentry->d_sb->s_id,
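
A hedged usage sketch for the new nfs_commit_file() above: the presumed caller is the NFSv4.2 COPY path added elsewhere in this series, committing copied data on the destination file with the write verifier the server returned for an unstable copy. dst_file and verf are the caller's.

	int err;

	err = nfs_commit_file(dst_file, verf);
	if (err < 0)
		return err;	/* data may not yet be on stable storage */
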
diff --git a/include/linux/errno.h b/include/linux/errno.h
index 89627b9187f9..7ce9fb1b7d28 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -28,5 +28,6 @@
28#define EBADTYPE 527 /* Type not supported by server */ 28#define EBADTYPE 527 /* Type not supported by server */
29#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ 29#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
30#define EIOCBQUEUED 529 /* iocb queued, will get completion event */ 30#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
31#define ERECALLCONFLICT 530 /* conflict with recalled state */
31 32
32#endif 33#endif
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 011433478a14..bfed6b367350 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -50,12 +50,27 @@ struct nfs4_label {
50 50
51typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; 51typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
52 52
53struct nfs_stateid4 { 53struct nfs4_stateid_struct {
54 __be32 seqid; 54 union {
55 char other[NFS4_STATEID_OTHER_SIZE]; 55 char data[NFS4_STATEID_SIZE];
56} __attribute__ ((packed)); 56 struct {
57 __be32 seqid;
58 char other[NFS4_STATEID_OTHER_SIZE];
59 } __attribute__ ((packed));
60 };
61
62 enum {
63 NFS4_INVALID_STATEID_TYPE = 0,
64 NFS4_SPECIAL_STATEID_TYPE,
65 NFS4_OPEN_STATEID_TYPE,
66 NFS4_LOCK_STATEID_TYPE,
67 NFS4_DELEGATION_STATEID_TYPE,
68 NFS4_LAYOUT_STATEID_TYPE,
69 NFS4_PNFS_DS_STATEID_TYPE,
70 } type;
71};
57 72
58typedef struct nfs_stateid4 nfs4_stateid; 73typedef struct nfs4_stateid_struct nfs4_stateid;
59 74
60enum nfs_opnum4 { 75enum nfs_opnum4 {
61 OP_ACCESS = 3, 76 OP_ACCESS = 3,
@@ -504,6 +519,7 @@ enum {
504 NFSPROC4_CLNT_DEALLOCATE, 519 NFSPROC4_CLNT_DEALLOCATE,
505 NFSPROC4_CLNT_LAYOUTSTATS, 520 NFSPROC4_CLNT_LAYOUTSTATS,
506 NFSPROC4_CLNT_CLONE, 521 NFSPROC4_CLNT_CLONE,
522 NFSPROC4_CLNT_COPY,
507}; 523};
508 524
509/* nfs41 types */ 525/* nfs41 types */
@@ -621,7 +637,9 @@ enum pnfs_update_layout_reason {
621 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, 637 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL,
622 PNFS_UPDATE_LAYOUT_FOUND_CACHED, 638 PNFS_UPDATE_LAYOUT_FOUND_CACHED,
623 PNFS_UPDATE_LAYOUT_RETURN, 639 PNFS_UPDATE_LAYOUT_RETURN,
640 PNFS_UPDATE_LAYOUT_RETRY,
624 PNFS_UPDATE_LAYOUT_BLOCKED, 641 PNFS_UPDATE_LAYOUT_BLOCKED,
642 PNFS_UPDATE_LAYOUT_INVALID_OPEN,
625 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, 643 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
626}; 644};
627 645
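
With the union above, an nfs4_stateid can still be handled as a 16-byte opaque blob through .data while structured code reads .seqid/.other, and the new type member records where the stateid came from. A minimal sketch (wire_buf is illustrative):

	nfs4_stateid stateid;
	u32 seqid;

	stateid.type = NFS4_LAYOUT_STATEID_TYPE;		/* tag the origin */
	memcpy(stateid.data, wire_buf, NFS4_STATEID_SIZE);	/* opaque view */
	seqid = be32_to_cpu(stateid.seqid);			/* structured view */
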
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 7fcc13c8cf1f..14a762d2734d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -246,5 +246,6 @@ struct nfs_server {
246#define NFS_CAP_DEALLOCATE (1U << 21) 246#define NFS_CAP_DEALLOCATE (1U << 21)
247#define NFS_CAP_LAYOUTSTATS (1U << 22) 247#define NFS_CAP_LAYOUTSTATS (1U << 22)
248#define NFS_CAP_CLONE (1U << 23) 248#define NFS_CAP_CLONE (1U << 23)
249#define NFS_CAP_COPY (1U << 24)
249 250
250#endif 251#endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ee8491dadbf3..c304a11b5b1a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -233,7 +233,6 @@ struct nfs4_layoutget_args {
233 struct inode *inode; 233 struct inode *inode;
234 struct nfs_open_context *ctx; 234 struct nfs_open_context *ctx;
235 nfs4_stateid stateid; 235 nfs4_stateid stateid;
236 unsigned long timestamp;
237 struct nfs4_layoutdriver_data layout; 236 struct nfs4_layoutdriver_data layout;
238}; 237};
239 238
@@ -251,7 +250,6 @@ struct nfs4_layoutget {
251 struct nfs4_layoutget_res res; 250 struct nfs4_layoutget_res res;
252 struct rpc_cred *cred; 251 struct rpc_cred *cred;
253 gfp_t gfp_flags; 252 gfp_t gfp_flags;
254 long timeout;
255}; 253};
256 254
257struct nfs4_getdeviceinfo_args { 255struct nfs4_getdeviceinfo_args {
@@ -1343,6 +1341,32 @@ struct nfs42_falloc_res {
1343 const struct nfs_server *falloc_server; 1341 const struct nfs_server *falloc_server;
1344}; 1342};
1345 1343
1344struct nfs42_copy_args {
1345 struct nfs4_sequence_args seq_args;
1346
1347 struct nfs_fh *src_fh;
1348 nfs4_stateid src_stateid;
1349 u64 src_pos;
1350
1351 struct nfs_fh *dst_fh;
1352 nfs4_stateid dst_stateid;
1353 u64 dst_pos;
1354
1355 u64 count;
1356};
1357
1358struct nfs42_write_res {
1359 u64 count;
1360 struct nfs_writeverf verifier;
1361};
1362
1363struct nfs42_copy_res {
1364 struct nfs4_sequence_res seq_res;
1365 struct nfs42_write_res write_res;
1366 bool consecutive;
1367 bool synchronous;
1368};
1369
1346struct nfs42_seek_args { 1370struct nfs42_seek_args {
1347 struct nfs4_sequence_args seq_args; 1371 struct nfs4_sequence_args seq_args;
1348 1372
@@ -1431,7 +1455,7 @@ struct nfs_commit_completion_ops {
1431}; 1455};
1432 1456
1433struct nfs_commit_info { 1457struct nfs_commit_info {
1434 spinlock_t *lock; /* inode->i_lock */ 1458 struct inode *inode; /* Needed for inode->i_lock */
1435 struct nfs_mds_commit_info *mds; 1459 struct nfs_mds_commit_info *mds;
1436 struct pnfs_ds_commit_info *ds; 1460 struct pnfs_ds_commit_info *ds;
1437 struct nfs_direct_req *dreq; /* O_DIRECT request */ 1461 struct nfs_direct_req *dreq; /* O_DIRECT request */
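
An illustrative fill-in of the new COPY argument/result pair above; the inodes, offsets and length are the caller's, and the two stateids would be copied from the relevant open/lock state before the RPC is sent.

	struct nfs42_copy_args args = {
		.src_fh		= NFS_FH(src_inode),
		.src_pos	= src_offset,
		.dst_fh		= NFS_FH(dst_inode),
		.dst_pos	= dst_offset,
		.count		= len,
	};
	struct nfs42_copy_res res;

	/* on success, res.write_res.count bytes were copied and
	 * res.write_res.verifier covers any data written unstably */
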
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 6a241a277249..899791573a40 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -127,7 +127,7 @@ struct rpc_authops {
127 void (*destroy)(struct rpc_auth *); 127 void (*destroy)(struct rpc_auth *);
128 128
129 struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int); 129 struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
130 struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int); 130 struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t);
131 int (*list_pseudoflavors)(rpc_authflavor_t *, int); 131 int (*list_pseudoflavors)(rpc_authflavor_t *, int);
132 rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *); 132 rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *);
133 int (*flavor2info)(rpc_authflavor_t, 133 int (*flavor2info)(rpc_authflavor_t,
@@ -167,6 +167,7 @@ void rpc_destroy_authunix(void);
167 167
168struct rpc_cred * rpc_lookup_cred(void); 168struct rpc_cred * rpc_lookup_cred(void);
169struct rpc_cred * rpc_lookup_cred_nonblock(void); 169struct rpc_cred * rpc_lookup_cred_nonblock(void);
170struct rpc_cred * rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t);
170struct rpc_cred * rpc_lookup_machine_cred(const char *service_name); 171struct rpc_cred * rpc_lookup_machine_cred(const char *service_name);
171int rpcauth_register(const struct rpc_authops *); 172int rpcauth_register(const struct rpc_authops *);
172int rpcauth_unregister(const struct rpc_authops *); 173int rpcauth_unregister(const struct rpc_authops *);
@@ -178,7 +179,7 @@ rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t,
178int rpcauth_get_gssinfo(rpc_authflavor_t, 179int rpcauth_get_gssinfo(rpc_authflavor_t,
179 struct rpcsec_gss_info *); 180 struct rpcsec_gss_info *);
180int rpcauth_list_flavors(rpc_authflavor_t *, int); 181int rpcauth_list_flavors(rpc_authflavor_t *, int);
181struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); 182struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t);
182void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); 183void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
183struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); 184struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int);
184struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); 185struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
@@ -201,9 +202,28 @@ char * rpcauth_stringify_acceptor(struct rpc_cred *);
201static inline 202static inline
202struct rpc_cred * get_rpccred(struct rpc_cred *cred) 203struct rpc_cred * get_rpccred(struct rpc_cred *cred)
203{ 204{
204 atomic_inc(&cred->cr_count); 205 if (cred != NULL)
206 atomic_inc(&cred->cr_count);
205 return cred; 207 return cred;
206} 208}
207 209
210/**
211 * get_rpccred_rcu - get a reference to a cred using rcu-protected pointer
212 * @cred: cred of which to take a reference
213 *
214 * In some cases, we may have a pointer to a credential to which we
215 * want to take a reference, but don't already have one. Because these
216 * objects are freed using RCU, we can access the cr_count while it's
217 * on its way to destruction and only take a reference if it's not already
218 * zero.
219 */
220static inline struct rpc_cred *
221get_rpccred_rcu(struct rpc_cred *cred)
222{
223 if (atomic_inc_not_zero(&cred->cr_count))
224 return cred;
225 return NULL;
226}
227
208#endif /* __KERNEL__ */ 228#endif /* __KERNEL__ */
209#endif /* _LINUX_SUNRPC_AUTH_H */ 229#endif /* _LINUX_SUNRPC_AUTH_H */
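
A short usage sketch for get_rpccred_rcu() above: dereference an RCU-protected cred pointer under rcu_read_lock() and keep the cred only if its count had not already reached zero. cred_ptr is illustrative.

	struct rpc_cred *cred;

	rcu_read_lock();
	cred = rcu_dereference(cred_ptr);
	if (cred)
		cred = get_rpccred_rcu(cred);
	rcu_read_unlock();

	if (cred) {
		/* ... use cred ... */
		put_rpccred(cred);
	}
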
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 9a7ddbaf116e..19c659d1c0f8 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -176,6 +176,7 @@ void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
176int rpc_protocol(struct rpc_clnt *); 176int rpc_protocol(struct rpc_clnt *);
177struct net * rpc_net_ns(struct rpc_clnt *); 177struct net * rpc_net_ns(struct rpc_clnt *);
178size_t rpc_max_payload(struct rpc_clnt *); 178size_t rpc_max_payload(struct rpc_clnt *);
179size_t rpc_max_bc_payload(struct rpc_clnt *);
179unsigned long rpc_get_timeout(struct rpc_clnt *clnt); 180unsigned long rpc_get_timeout(struct rpc_clnt *clnt);
180void rpc_force_rebind(struct rpc_clnt *); 181void rpc_force_rebind(struct rpc_clnt *);
181size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); 182size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index 807371357160..59cbf16eaeb5 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -158,9 +158,9 @@ typedef __be32 rpc_fraghdr;
158 158
159/* 159/*
160 * Note that RFC 1833 does not put any size restrictions on the 160 * Note that RFC 1833 does not put any size restrictions on the
161 * netid string, but all currently defined netid's fit in 4 bytes. 161 * netid string, but all currently defined netid's fit in 5 bytes.
162 */ 162 */
163#define RPCBIND_MAXNETIDLEN (4u) 163#define RPCBIND_MAXNETIDLEN (5u)
164 164
165/* 165/*
166 * Universal addresses are introduced in RFC 1833 and further spelled 166 * Universal addresses are introduced in RFC 1833 and further spelled
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index fb0d212e0d3a..5aa3834619a8 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -142,6 +142,7 @@ struct rpc_xprt_ops {
142 int (*bc_setup)(struct rpc_xprt *xprt, 142 int (*bc_setup)(struct rpc_xprt *xprt,
143 unsigned int min_reqs); 143 unsigned int min_reqs);
144 int (*bc_up)(struct svc_serv *serv, struct net *net); 144 int (*bc_up)(struct svc_serv *serv, struct net *net);
145 size_t (*bc_maxpayload)(struct rpc_xprt *xprt);
145 void (*bc_free_rqst)(struct rpc_rqst *rqst); 146 void (*bc_free_rqst)(struct rpc_rqst *rqst);
146 void (*bc_destroy)(struct rpc_xprt *xprt, 147 void (*bc_destroy)(struct rpc_xprt *xprt,
147 unsigned int max_reqs); 148 unsigned int max_reqs);
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 767190b01363..39267dc3486a 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -52,7 +52,9 @@
52#define RPCRDMA_DEF_SLOT_TABLE (128U) 52#define RPCRDMA_DEF_SLOT_TABLE (128U)
53#define RPCRDMA_MAX_SLOT_TABLE (256U) 53#define RPCRDMA_MAX_SLOT_TABLE (256U)
54 54
55#define RPCRDMA_DEF_INLINE (1024) /* default inline max */ 55#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */
56#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */
57#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */
56 58
57/* Memory registration strategies, by number. 59/* Memory registration strategies, by number.
58 * This is part of a kernel / user space API. Do not remove. */ 60 * This is part of a kernel / user space API. Do not remove. */
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 02f53674dc39..040ff627c18a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
543 */ 543 */
544struct rpc_cred * 544struct rpc_cred *
545rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, 545rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
546 int flags) 546 int flags, gfp_t gfp)
547{ 547{
548 LIST_HEAD(free); 548 LIST_HEAD(free);
549 struct rpc_cred_cache *cache = auth->au_credcache; 549 struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
580 if (flags & RPCAUTH_LOOKUP_RCU) 580 if (flags & RPCAUTH_LOOKUP_RCU)
581 return ERR_PTR(-ECHILD); 581 return ERR_PTR(-ECHILD);
582 582
583 new = auth->au_ops->crcreate(auth, acred, flags); 583 new = auth->au_ops->crcreate(auth, acred, flags, gfp);
584 if (IS_ERR(new)) { 584 if (IS_ERR(new)) {
585 cred = new; 585 cred = new;
586 goto out; 586 goto out;
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
703 new = rpcauth_bind_new_cred(task, lookupflags); 703 new = rpcauth_bind_new_cred(task, lookupflags);
704 if (IS_ERR(new)) 704 if (IS_ERR(new))
705 return PTR_ERR(new); 705 return PTR_ERR(new);
706 if (req->rq_cred != NULL) 706 put_rpccred(req->rq_cred);
707 put_rpccred(req->rq_cred);
708 req->rq_cred = new; 707 req->rq_cred = new;
709 return 0; 708 return 0;
710} 709}
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
712void 711void
713put_rpccred(struct rpc_cred *cred) 712put_rpccred(struct rpc_cred *cred)
714{ 713{
714 if (cred == NULL)
715 return;
715 /* Fast path for unhashed credentials */ 716 /* Fast path for unhashed credentials */
716 if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { 717 if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
717 if (atomic_dec_and_test(&cred->cr_count)) 718 if (atomic_dec_and_test(&cred->cr_count))
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 41248b1820c7..54dd3fdead54 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
38} 38}
39EXPORT_SYMBOL_GPL(rpc_lookup_cred); 39EXPORT_SYMBOL_GPL(rpc_lookup_cred);
40 40
41struct rpc_cred *
42rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
43{
44 return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
45}
46EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
47
41struct rpc_cred *rpc_lookup_cred_nonblock(void) 48struct rpc_cred *rpc_lookup_cred_nonblock(void)
42{ 49{
43 return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); 50 return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
@@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
77static struct rpc_cred * 84static struct rpc_cred *
78generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 85generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
79{ 86{
80 return rpcauth_lookup_credcache(&generic_auth, acred, flags); 87 return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
81} 88}
82 89
83static struct rpc_cred * 90static struct rpc_cred *
84generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 91generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
85{ 92{
86 struct generic_cred *gcred; 93 struct generic_cred *gcred;
87 94
88 gcred = kmalloc(sizeof(*gcred), GFP_KERNEL); 95 gcred = kmalloc(sizeof(*gcred), gfp);
89 if (gcred == NULL) 96 if (gcred == NULL)
90 return ERR_PTR(-ENOMEM); 97 return ERR_PTR(-ENOMEM);
91 98
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15612ffa8d57..e64ae93d5b4f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
1299static struct rpc_cred * 1299static struct rpc_cred *
1300gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 1300gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
1301{ 1301{
1302 return rpcauth_lookup_credcache(auth, acred, flags); 1302 return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
1303} 1303}
1304 1304
1305static struct rpc_cred * 1305static struct rpc_cred *
1306gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 1306gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
1307{ 1307{
1308 struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); 1308 struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
1309 struct gss_cred *cred = NULL; 1309 struct gss_cred *cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
1313 __func__, from_kuid(&init_user_ns, acred->uid), 1313 __func__, from_kuid(&init_user_ns, acred->uid),
1314 auth->au_flavor); 1314 auth->au_flavor);
1315 1315
1316 if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) 1316 if (!(cred = kzalloc(sizeof(*cred), gfp)))
1317 goto out_err; 1317 goto out_err;
1318 1318
1319 rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); 1319 rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 0d3dd364c22f..9f65452b7cbc 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
52static struct rpc_cred * 52static struct rpc_cred *
53unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 53unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
54{ 54{
55 return rpcauth_lookup_credcache(auth, acred, flags); 55 return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
56} 56}
57 57
58static struct rpc_cred * 58static struct rpc_cred *
59unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) 59unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
60{ 60{
61 struct unx_cred *cred; 61 struct unx_cred *cred;
62 unsigned int groups = 0; 62 unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
66 from_kuid(&init_user_ns, acred->uid), 66 from_kuid(&init_user_ns, acred->uid),
67 from_kgid(&init_user_ns, acred->gid)); 67 from_kgid(&init_user_ns, acred->gid));
68 68
69 if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS))) 69 if (!(cred = kmalloc(sizeof(*cred), gfp)))
70 return ERR_PTR(-ENOMEM); 70 return ERR_PTR(-ENOMEM);
71 71
72 rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); 72 rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7e0c9bf22df8..06b4df9faaa1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1414,6 +1414,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
1414EXPORT_SYMBOL_GPL(rpc_max_payload); 1414EXPORT_SYMBOL_GPL(rpc_max_payload);
1415 1415
1416/** 1416/**
1417 * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
1418 * @clnt: RPC client to query
1419 */
1420size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
1421{
1422 struct rpc_xprt *xprt;
1423 size_t ret;
1424
1425 rcu_read_lock();
1426 xprt = rcu_dereference(clnt->cl_xprt);
1427 ret = xprt->ops->bc_maxpayload(xprt);
1428 rcu_read_unlock();
1429 return ret;
1430}
1431EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
1432
1433/**
1417 * rpc_get_timeout - Get timeout for transport in units of HZ 1434 * rpc_get_timeout - Get timeout for transport in units of HZ
1418 * @clnt: RPC client to query 1435 * @clnt: RPC client to query
1419 */ 1436 */
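
A minimal sketch of a backchannel consumer bounding its reply size with the new rpc_max_bc_payload() helper; clnt and reply_len are illustrative, and the transport is assumed to implement ->bc_maxpayload.

	size_t maxsz = rpc_max_bc_payload(clnt);

	if (reply_len > maxsz)
		return -EMSGSIZE;	/* will not fit in one backchannel message */
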
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 6bdb3865212d..c4f3cc0c0775 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
797 xdr_set_iov(xdr, buf->head, buf->len); 797 xdr_set_iov(xdr, buf->head, buf->len);
798 else if (buf->page_len != 0) 798 else if (buf->page_len != 0)
799 xdr_set_page_base(xdr, 0, buf->len); 799 xdr_set_page_base(xdr, 0, buf->len);
800 else
801 xdr_set_iov(xdr, buf->head, buf->len);
800 if (p != NULL && p > xdr->p && xdr->end >= p) { 802 if (p != NULL && p > xdr->p && xdr->end >= p) {
801 xdr->nwords -= p - xdr->p; 803 xdr->nwords -= p - xdr->p;
802 xdr->p = p; 804 xdr->p = p;
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcd7640eeb5..87762d976b63 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -192,6 +192,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
192} 192}
193 193
194/** 194/**
195 * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
196 * @xprt: transport
197 *
198 * Returns maximum size, in bytes, of a backchannel message
199 */
200size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
201{
202 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
203 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
204 size_t maxmsg;
205
206 maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
207 return maxmsg - RPCRDMA_HDRLEN_MIN;
208}
209
210/**
195 * rpcrdma_bc_marshal_reply - Send backwards direction reply 211 * rpcrdma_bc_marshal_reply - Send backwards direction reply
196 * @rqst: buffer containing RPC reply data 212 * @rqst: buffer containing RPC reply data
197 * 213 *
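
The RDMA backchannel limit is simply the smaller of the two inline thresholds minus the fixed portion of the RPC-over-RDMA header, since a backchannel request must fit in a single pre-posted receive buffer. A minimal userspace sketch of that arithmetic, assuming 1024-byte default inline sizes and a 28-byte RPCRDMA_HDRLEN_MIN (both values are assumptions about this kernel's defaults):

#include <stdio.h>

#define HDRLEN_MIN 28	/* assumed value of RPCRDMA_HDRLEN_MIN */

static unsigned int bc_maxpayload(unsigned int inline_rsize,
				  unsigned int inline_wsize)
{
	unsigned int maxmsg = inline_rsize < inline_wsize ?
			      inline_rsize : inline_wsize;

	return maxmsg - HDRLEN_MIN;
}

int main(void)
{
	/* With 1024-byte inline buffers a backchannel request can
	 * carry at most 996 bytes of RPC payload. */
	printf("bc max payload: %u\n", bc_maxpayload(1024, 1024));
	return 0;
}
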
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index b289e106540b..6326ebe8b595 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -35,10 +35,71 @@
35/* Maximum scatter/gather per FMR */ 35/* Maximum scatter/gather per FMR */
36#define RPCRDMA_MAX_FMR_SGES (64) 36#define RPCRDMA_MAX_FMR_SGES (64)
37 37
38static struct workqueue_struct *fmr_recovery_wq;
39
40#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
41
42int
43fmr_alloc_recovery_wq(void)
44{
45 fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
46 return !fmr_recovery_wq ? -ENOMEM : 0;
47}
48
49void
50fmr_destroy_recovery_wq(void)
51{
52 struct workqueue_struct *wq;
53
54 if (!fmr_recovery_wq)
55 return;
56
57 wq = fmr_recovery_wq;
58 fmr_recovery_wq = NULL;
59 destroy_workqueue(wq);
60}
61
62static int
63__fmr_unmap(struct rpcrdma_mw *mw)
64{
65 LIST_HEAD(l);
66
67 list_add(&mw->fmr.fmr->list, &l);
68 return ib_unmap_fmr(&l);
69}
70
71/* Deferred reset of a single FMR. Generate a fresh rkey by
72 * replacing the MR. There's no recovery if this fails.
73 */
74static void
75__fmr_recovery_worker(struct work_struct *work)
76{
77 struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
78 mw_work);
79 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
80
81 __fmr_unmap(mw);
82 rpcrdma_put_mw(r_xprt, mw);
83 return;
84}
85
86/* A broken MR was discovered in a context that can't sleep.
87 * Defer recovery to the recovery worker.
88 */
89static void
90__fmr_queue_recovery(struct rpcrdma_mw *mw)
91{
92 INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
93 queue_work(fmr_recovery_wq, &mw->mw_work);
94}
95
38static int 96static int
39fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, 97fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
40 struct rpcrdma_create_data_internal *cdata) 98 struct rpcrdma_create_data_internal *cdata)
41{ 99{
100 rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
101 RPCRDMA_MAX_DATA_SEGS /
102 RPCRDMA_MAX_FMR_SGES));
42 return 0; 103 return 0;
43} 104}
44 105
@@ -48,7 +109,7 @@ static size_t
48fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) 109fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
49{ 110{
50 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 111 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
51 rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); 112 RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
52} 113}
53 114
54static int 115static int
@@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
89 if (IS_ERR(r->fmr.fmr)) 150 if (IS_ERR(r->fmr.fmr))
90 goto out_fmr_err; 151 goto out_fmr_err;
91 152
153 r->mw_xprt = r_xprt;
92 list_add(&r->mw_list, &buf->rb_mws); 154 list_add(&r->mw_list, &buf->rb_mws);
93 list_add(&r->mw_all, &buf->rb_all); 155 list_add(&r->mw_all, &buf->rb_all);
94 } 156 }
@@ -104,15 +166,6 @@ out:
104 return rc; 166 return rc;
105} 167}
106 168
107static int
108__fmr_unmap(struct rpcrdma_mw *r)
109{
110 LIST_HEAD(l);
111
112 list_add(&r->fmr.fmr->list, &l);
113 return ib_unmap_fmr(&l);
114}
115
116/* Use the ib_map_phys_fmr() verb to register a memory region 169/* Use the ib_map_phys_fmr() verb to register a memory region
117 * for remote access via RDMA READ or RDMA WRITE. 170 * for remote access via RDMA READ or RDMA WRITE.
118 */ 171 */
@@ -183,15 +236,10 @@ static void
183__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) 236__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
184{ 237{
185 struct ib_device *device = r_xprt->rx_ia.ri_device; 238 struct ib_device *device = r_xprt->rx_ia.ri_device;
186 struct rpcrdma_mw *mw = seg->rl_mw;
187 int nsegs = seg->mr_nsegs; 239 int nsegs = seg->mr_nsegs;
188 240
189 seg->rl_mw = NULL;
190
191 while (nsegs--) 241 while (nsegs--)
192 rpcrdma_unmap_one(device, seg++); 242 rpcrdma_unmap_one(device, seg++);
193
194 rpcrdma_put_mw(r_xprt, mw);
195} 243}
196 244
197/* Invalidate all memory regions that were registered for "req". 245/* Invalidate all memory regions that were registered for "req".
@@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
234 seg = &req->rl_segments[i]; 282 seg = &req->rl_segments[i];
235 283
236 __fmr_dma_unmap(r_xprt, seg); 284 __fmr_dma_unmap(r_xprt, seg);
285 rpcrdma_put_mw(r_xprt, seg->rl_mw);
237 286
238 i += seg->mr_nsegs; 287 i += seg->mr_nsegs;
239 seg->mr_nsegs = 0; 288 seg->mr_nsegs = 0;
289 seg->rl_mw = NULL;
240 } 290 }
241 291
242 req->rl_nchunks = 0; 292 req->rl_nchunks = 0;
243} 293}
244 294
245/* Use the ib_unmap_fmr() verb to prevent further remote 295/* Use a slow, safe mechanism to invalidate all memory regions
246 * access via RDMA READ or RDMA WRITE. 296 * that were registered for "req".
297 *
298 * In the asynchronous case, DMA unmapping occurs first here
299 * because the rpcrdma_mr_seg is released immediately after this
 300 * call. Its contents won't be available in __fmr_dma_unmap later.
301 * FIXME.
247 */ 302 */
248static int 303static void
249fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) 304fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
305 bool sync)
250{ 306{
251 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 307 struct rpcrdma_mr_seg *seg;
252 struct rpcrdma_mr_seg *seg1 = seg; 308 struct rpcrdma_mw *mw;
253 struct rpcrdma_mw *mw = seg1->rl_mw; 309 unsigned int i;
254 int rc, nsegs = seg->mr_nsegs;
255 310
256 dprintk("RPC: %s: FMR %p\n", __func__, mw); 311 for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
312 seg = &req->rl_segments[i];
313 mw = seg->rl_mw;
257 314
258 seg1->rl_mw = NULL; 315 if (sync) {
259 while (seg1->mr_nsegs--) 316 /* ORDER */
260 rpcrdma_unmap_one(ia->ri_device, seg++); 317 __fmr_unmap(mw);
261 rc = __fmr_unmap(mw); 318 __fmr_dma_unmap(r_xprt, seg);
262 if (rc) 319 rpcrdma_put_mw(r_xprt, mw);
263 goto out_err; 320 } else {
264 rpcrdma_put_mw(r_xprt, mw); 321 __fmr_dma_unmap(r_xprt, seg);
265 return nsegs; 322 __fmr_queue_recovery(mw);
323 }
266 324
267out_err: 325 i += seg->mr_nsegs;
268 /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy 326 seg->mr_nsegs = 0;
269 * will attempt to release it when the transport is destroyed. 327 seg->rl_mw = NULL;
270 */ 328 }
271 dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
272 return nsegs;
273} 329}
274 330
275static void 331static void
@@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
295const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { 351const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
296 .ro_map = fmr_op_map, 352 .ro_map = fmr_op_map,
297 .ro_unmap_sync = fmr_op_unmap_sync, 353 .ro_unmap_sync = fmr_op_unmap_sync,
298 .ro_unmap = fmr_op_unmap, 354 .ro_unmap_safe = fmr_op_unmap_safe,
299 .ro_open = fmr_op_open, 355 .ro_open = fmr_op_open,
300 .ro_maxpages = fmr_op_maxpages, 356 .ro_maxpages = fmr_op_maxpages,
301 .ro_init = fmr_op_init, 357 .ro_init = fmr_op_init,
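
fmr_op_unmap_safe() replaces the old per-segment ro_unmap entry point: a caller that may sleep invalidates, DMA-unmaps and returns each MW inline, while a caller in a non-sleepable context DMA-unmaps immediately and defers the ib_unmap_fmr() to the new recovery workqueue. A small userspace model of that dispatch, with invented names standing in for the kernel primitives:

#include <stdbool.h>
#include <stdio.h>

struct mw_model { unsigned int rkey; };

static void invalidate_now(struct mw_model *mw)
{
	printf("sync: unmap fmr, dma-unmap, put rkey 0x%x\n", mw->rkey);
}

static void defer_to_worker(struct mw_model *mw)
{
	printf("async: dma-unmap now, queue rkey 0x%x for recovery\n", mw->rkey);
}

static void unmap_safe(struct mw_model *mws, int nchunks, bool can_sleep)
{
	for (int i = 0; i < nchunks; i++) {
		if (can_sleep)
			invalidate_now(&mws[i]);
		else
			defer_to_worker(&mws[i]);
	}
}

int main(void)
{
	struct mw_model mws[] = { { 0x1a }, { 0x2b } };

	unmap_safe(mws, 2, false);	/* e.g. freeing an async RPC */
	return 0;
}
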
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 94c3fa910b85..c0947544babe 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void)
98 destroy_workqueue(wq); 98 destroy_workqueue(wq);
99} 99}
100 100
101static int
102__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
103{
104 struct rpcrdma_frmr *f = &r->frmr;
105 int rc;
106
107 rc = ib_dereg_mr(f->fr_mr);
108 if (rc) {
109 pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
110 rc, r);
111 return rc;
112 }
113
114 f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
115 ia->ri_max_frmr_depth);
116 if (IS_ERR(f->fr_mr)) {
117 pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
118 PTR_ERR(f->fr_mr), r);
119 return PTR_ERR(f->fr_mr);
120 }
121
122 dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
123 f->fr_state = FRMR_IS_INVALID;
124 return 0;
125}
126
127static void
128__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
129{
130 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
131 struct rpcrdma_frmr *f = &mw->frmr;
132 int rc;
133
134 rc = __frwr_reset_mr(ia, mw);
135 ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
136 if (rc)
137 return;
138
139 rpcrdma_put_mw(r_xprt, mw);
140}
141
101/* Deferred reset of a single FRMR. Generate a fresh rkey by 142/* Deferred reset of a single FRMR. Generate a fresh rkey by
102 * replacing the MR. 143 * replacing the MR.
103 * 144 *
@@ -109,26 +150,10 @@ static void
109__frwr_recovery_worker(struct work_struct *work) 150__frwr_recovery_worker(struct work_struct *work)
110{ 151{
111 struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, 152 struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
112 frmr.fr_work); 153 mw_work);
113 struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
114 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
115 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
116
117 if (ib_dereg_mr(r->frmr.fr_mr))
118 goto out_fail;
119 154
120 r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); 155 __frwr_reset_and_unmap(r->mw_xprt, r);
121 if (IS_ERR(r->frmr.fr_mr))
122 goto out_fail;
123
124 dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
125 r->frmr.fr_state = FRMR_IS_INVALID;
126 rpcrdma_put_mw(r_xprt, r);
127 return; 156 return;
128
129out_fail:
130 pr_warn("RPC: %s: FRMR %p unrecovered\n",
131 __func__, r);
132} 157}
133 158
134/* A broken MR was discovered in a context that can't sleep. 159/* A broken MR was discovered in a context that can't sleep.
@@ -137,8 +162,8 @@ out_fail:
137static void 162static void
138__frwr_queue_recovery(struct rpcrdma_mw *r) 163__frwr_queue_recovery(struct rpcrdma_mw *r)
139{ 164{
140 INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker); 165 INIT_WORK(&r->mw_work, __frwr_recovery_worker);
141 queue_work(frwr_recovery_wq, &r->frmr.fr_work); 166 queue_work(frwr_recovery_wq, &r->mw_work);
142} 167}
143 168
144static int 169static int
@@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
152 if (IS_ERR(f->fr_mr)) 177 if (IS_ERR(f->fr_mr))
153 goto out_mr_err; 178 goto out_mr_err;
154 179
155 f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL); 180 f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
156 if (!f->sg) 181 if (!f->fr_sg)
157 goto out_list_err; 182 goto out_list_err;
158 183
159 sg_init_table(f->sg, depth); 184 sg_init_table(f->fr_sg, depth);
160 185
161 init_completion(&f->fr_linv_done); 186 init_completion(&f->fr_linv_done);
162 187
@@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r)
185 if (rc) 210 if (rc)
186 dprintk("RPC: %s: ib_dereg_mr status %i\n", 211 dprintk("RPC: %s: ib_dereg_mr status %i\n",
187 __func__, rc); 212 __func__, rc);
188 kfree(r->frmr.sg); 213 kfree(r->frmr.fr_sg);
189} 214}
190 215
191static int 216static int
@@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
231 depth; 256 depth;
232 } 257 }
233 258
259 rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
260 RPCRDMA_MAX_DATA_SEGS /
261 ia->ri_max_frmr_depth));
234 return 0; 262 return 0;
235} 263}
236 264
@@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
243 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 271 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
244 272
245 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 273 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
246 rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); 274 RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
247} 275}
248 276
249static void 277static void
@@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
350 return rc; 378 return rc;
351 } 379 }
352 380
381 r->mw_xprt = r_xprt;
353 list_add(&r->mw_list, &buf->rb_mws); 382 list_add(&r->mw_list, &buf->rb_mws);
354 list_add(&r->mw_all, &buf->rb_all); 383 list_add(&r->mw_all, &buf->rb_all);
355 r->frmr.fr_xprt = r_xprt;
356 } 384 }
357 385
358 return 0; 386 return 0;
@@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
396 424
397 for (i = 0; i < nsegs;) { 425 for (i = 0; i < nsegs;) {
398 if (seg->mr_page) 426 if (seg->mr_page)
399 sg_set_page(&frmr->sg[i], 427 sg_set_page(&frmr->fr_sg[i],
400 seg->mr_page, 428 seg->mr_page,
401 seg->mr_len, 429 seg->mr_len,
402 offset_in_page(seg->mr_offset)); 430 offset_in_page(seg->mr_offset));
403 else 431 else
404 sg_set_buf(&frmr->sg[i], seg->mr_offset, 432 sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
405 seg->mr_len); 433 seg->mr_len);
406 434
407 ++seg; 435 ++seg;
@@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
412 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 440 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
413 break; 441 break;
414 } 442 }
415 frmr->sg_nents = i; 443 frmr->fr_nents = i;
444 frmr->fr_dir = direction;
416 445
417 dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction); 446 dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
418 if (!dma_nents) { 447 if (!dma_nents) {
419 pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", 448 pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
420 __func__, frmr->sg, frmr->sg_nents); 449 __func__, frmr->fr_sg, frmr->fr_nents);
421 return -ENOMEM; 450 return -ENOMEM;
422 } 451 }
423 452
424 n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); 453 n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
425 if (unlikely(n != frmr->sg_nents)) { 454 if (unlikely(n != frmr->fr_nents)) {
426 pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", 455 pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
427 __func__, frmr->fr_mr, n, frmr->sg_nents); 456 __func__, frmr->fr_mr, n, frmr->fr_nents);
428 rc = n < 0 ? n : -EINVAL; 457 rc = n < 0 ? n : -EINVAL;
429 goto out_senderr; 458 goto out_senderr;
430 } 459 }
431 460
432 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", 461 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
433 __func__, mw, frmr->sg_nents, mr->length); 462 __func__, mw, frmr->fr_nents, mr->length);
434 463
435 key = (u8)(mr->rkey & 0x000000FF); 464 key = (u8)(mr->rkey & 0x000000FF);
436 ib_update_fast_reg_key(mr, ++key); 465 ib_update_fast_reg_key(mr, ++key);
@@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
452 if (rc) 481 if (rc)
453 goto out_senderr; 482 goto out_senderr;
454 483
455 seg1->mr_dir = direction;
456 seg1->rl_mw = mw; 484 seg1->rl_mw = mw;
457 seg1->mr_rkey = mr->rkey; 485 seg1->mr_rkey = mr->rkey;
458 seg1->mr_base = mr->iova; 486 seg1->mr_base = mr->iova;
459 seg1->mr_nsegs = frmr->sg_nents; 487 seg1->mr_nsegs = frmr->fr_nents;
460 seg1->mr_len = mr->length; 488 seg1->mr_len = mr->length;
461 489
462 return frmr->sg_nents; 490 return frmr->fr_nents;
463 491
464out_senderr: 492out_senderr:
465 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); 493 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
466 ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
467 __frwr_queue_recovery(mw); 494 __frwr_queue_recovery(mw);
468 return rc; 495 return rc;
469} 496}
@@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
487 return invalidate_wr; 514 return invalidate_wr;
488} 515}
489 516
490static void
491__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
492 int rc)
493{
494 struct ib_device *device = r_xprt->rx_ia.ri_device;
495 struct rpcrdma_mw *mw = seg->rl_mw;
496 struct rpcrdma_frmr *f = &mw->frmr;
497
498 seg->rl_mw = NULL;
499
500 ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
501
502 if (!rc)
503 rpcrdma_put_mw(r_xprt, mw);
504 else
505 __frwr_queue_recovery(mw);
506}
507
508/* Invalidate all memory regions that were registered for "req". 517/* Invalidate all memory regions that were registered for "req".
509 * 518 *
510 * Sleeps until it is safe for the host CPU to access the 519 * Sleeps until it is safe for the host CPU to access the
@@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
518 struct rpcrdma_mr_seg *seg; 527 struct rpcrdma_mr_seg *seg;
519 unsigned int i, nchunks; 528 unsigned int i, nchunks;
520 struct rpcrdma_frmr *f; 529 struct rpcrdma_frmr *f;
530 struct rpcrdma_mw *mw;
521 int rc; 531 int rc;
522 532
523 dprintk("RPC: %s: req %p\n", __func__, req); 533 dprintk("RPC: %s: req %p\n", __func__, req);
@@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
558 * unless ri_id->qp is a valid pointer. 568 * unless ri_id->qp is a valid pointer.
559 */ 569 */
560 rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); 570 rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
561 if (rc) { 571 if (rc)
562 pr_warn("%s: ib_post_send failed %i\n", __func__, rc); 572 goto reset_mrs;
563 rdma_disconnect(ia->ri_id);
564 goto unmap;
565 }
566 573
567 wait_for_completion(&f->fr_linv_done); 574 wait_for_completion(&f->fr_linv_done);
568 575
@@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
572unmap: 579unmap:
573 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 580 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
574 seg = &req->rl_segments[i]; 581 seg = &req->rl_segments[i];
582 mw = seg->rl_mw;
583 seg->rl_mw = NULL;
575 584
576 __frwr_dma_unmap(r_xprt, seg, rc); 585 ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
586 f->fr_dir);
587 rpcrdma_put_mw(r_xprt, mw);
577 588
578 i += seg->mr_nsegs; 589 i += seg->mr_nsegs;
579 seg->mr_nsegs = 0; 590 seg->mr_nsegs = 0;
580 } 591 }
581 592
582 req->rl_nchunks = 0; 593 req->rl_nchunks = 0;
583} 594 return;
584 595
585/* Post a LOCAL_INV Work Request to prevent further remote access 596reset_mrs:
586 * via RDMA READ or RDMA WRITE. 597 pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
587 */
588static int
589frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
590{
591 struct rpcrdma_mr_seg *seg1 = seg;
592 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
593 struct rpcrdma_mw *mw = seg1->rl_mw;
594 struct rpcrdma_frmr *frmr = &mw->frmr;
595 struct ib_send_wr *invalidate_wr, *bad_wr;
596 int rc, nsegs = seg->mr_nsegs;
597 598
598 dprintk("RPC: %s: FRMR %p\n", __func__, mw); 599 /* Find and reset the MRs in the LOCAL_INV WRs that did not
600 * get posted. This is synchronous, and slow.
601 */
602 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
603 seg = &req->rl_segments[i];
604 mw = seg->rl_mw;
605 f = &mw->frmr;
599 606
600 seg1->rl_mw = NULL; 607 if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
601 frmr->fr_state = FRMR_IS_INVALID; 608 __frwr_reset_mr(ia, mw);
602 invalidate_wr = &mw->frmr.fr_invwr; 609 bad_wr = bad_wr->next;
610 }
603 611
604 memset(invalidate_wr, 0, sizeof(*invalidate_wr)); 612 i += seg->mr_nsegs;
605 frmr->fr_cqe.done = frwr_wc_localinv; 613 }
606 invalidate_wr->wr_cqe = &frmr->fr_cqe; 614 goto unmap;
607 invalidate_wr->opcode = IB_WR_LOCAL_INV; 615}
608 invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
609 DECR_CQCOUNT(&r_xprt->rx_ep);
610 616
611 ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); 617/* Use a slow, safe mechanism to invalidate all memory regions
612 read_lock(&ia->ri_qplock); 618 * that were registered for "req".
613 rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); 619 */
614 read_unlock(&ia->ri_qplock); 620static void
615 if (rc) 621frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
616 goto out_err; 622 bool sync)
623{
624 struct rpcrdma_mr_seg *seg;
625 struct rpcrdma_mw *mw;
626 unsigned int i;
617 627
618 rpcrdma_put_mw(r_xprt, mw); 628 for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
619 return nsegs; 629 seg = &req->rl_segments[i];
630 mw = seg->rl_mw;
620 631
621out_err: 632 if (sync)
622 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); 633 __frwr_reset_and_unmap(r_xprt, mw);
623 __frwr_queue_recovery(mw); 634 else
624 return nsegs; 635 __frwr_queue_recovery(mw);
636
637 i += seg->mr_nsegs;
638 seg->mr_nsegs = 0;
639 seg->rl_mw = NULL;
640 }
625} 641}
626 642
627static void 643static void
@@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
643const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { 659const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
644 .ro_map = frwr_op_map, 660 .ro_map = frwr_op_map,
645 .ro_unmap_sync = frwr_op_unmap_sync, 661 .ro_unmap_sync = frwr_op_unmap_sync,
646 .ro_unmap = frwr_op_unmap, 662 .ro_unmap_safe = frwr_op_unmap_safe,
647 .ro_open = frwr_op_open, 663 .ro_open = frwr_op_open,
648 .ro_maxpages = frwr_op_maxpages, 664 .ro_maxpages = frwr_op_maxpages,
649 .ro_init = frwr_op_init, 665 .ro_init = frwr_op_init,
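
In the new reset_mrs path, a failed ib_post_send() leaves bad_wr pointing at the first LOCAL_INV work request the provider refused to post; only the MRs named by that WR and its successors still carry stale rkeys, so only those are reset synchronously before falling through to the DMA unmap. A userspace model of that matching walk (the structures are simplified stand-ins for ib_send_wr and rpcrdma_mw):

#include <stdio.h>

struct wr_model {
	unsigned int invalidate_rkey;
	struct wr_model *next;
};

struct mw_model { unsigned int rkey; };

/* Reset only the MRs whose LOCAL_INV request was never posted. */
static void reset_unposted(struct mw_model *mws, int nmws,
			   struct wr_model *bad_wr)
{
	for (int i = 0; i < nmws; i++) {
		if (bad_wr && mws[i].rkey == bad_wr->invalidate_rkey) {
			printf("reset MR with rkey 0x%x\n", mws[i].rkey);
			bad_wr = bad_wr->next;
		}
	}
}

int main(void)
{
	struct mw_model mws[] = { { 0x11 }, { 0x22 }, { 0x33 } };
	struct wr_model wr3 = { 0x33, NULL };
	struct wr_model wr2 = { 0x22, &wr3 };

	/* The provider posted the first WR and failed at the second,
	 * so only the MRs for 0x22 and 0x33 are reset. */
	reset_unposted(mws, 3, &wr2);
	return 0;
}
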
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 481b9b6f4a15..3750596cc432 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
36 __func__, PTR_ERR(mr)); 36 __func__, PTR_ERR(mr));
37 return -ENOMEM; 37 return -ENOMEM;
38 } 38 }
39
40 ia->ri_dma_mr = mr; 39 ia->ri_dma_mr = mr;
40
41 rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
42 RPCRDMA_MAX_DATA_SEGS,
43 RPCRDMA_MAX_HDR_SEGS));
41 return 0; 44 return 0;
42} 45}
43 46
@@ -47,7 +50,7 @@ static size_t
47physical_op_maxpages(struct rpcrdma_xprt *r_xprt) 50physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
48{ 51{
49 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 52 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
50 rpcrdma_max_segments(r_xprt)); 53 RPCRDMA_MAX_HDR_SEGS);
51} 54}
52 55
53static int 56static int
@@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
71 return 1; 74 return 1;
72} 75}
73 76
74/* Unmap a memory region, but leave it registered.
75 */
76static int
77physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
78{
79 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
80
81 rpcrdma_unmap_one(ia->ri_device, seg);
82 return 1;
83}
84
85/* DMA unmap all memory regions that were mapped for "req". 77/* DMA unmap all memory regions that were mapped for "req".
86 */ 78 */
87static void 79static void
@@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
94 rpcrdma_unmap_one(device, &req->rl_segments[i++]); 86 rpcrdma_unmap_one(device, &req->rl_segments[i++]);
95} 87}
96 88
89/* Use a slow, safe mechanism to invalidate all memory regions
90 * that were registered for "req".
91 *
92 * For physical memory registration, there is no good way to
93 * fence a single MR that has been advertised to the server. The
94 * client has already handed the server an R_key that cannot be
95 * invalidated and is shared by all MRs on this connection.
96 * Tearing down the PD might be the only safe choice, but it's
97 * not clear that a freshly acquired DMA R_key would be different
98 * than the one used by the PD that was just destroyed.
99 * FIXME.
100 */
101static void
102physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
103 bool sync)
104{
105 physical_op_unmap_sync(r_xprt, req);
106}
107
97static void 108static void
98physical_op_destroy(struct rpcrdma_buffer *buf) 109physical_op_destroy(struct rpcrdma_buffer *buf)
99{ 110{
@@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
102const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { 113const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
103 .ro_map = physical_op_map, 114 .ro_map = physical_op_map,
104 .ro_unmap_sync = physical_op_unmap_sync, 115 .ro_unmap_sync = physical_op_unmap_sync,
105 .ro_unmap = physical_op_unmap, 116 .ro_unmap_safe = physical_op_unmap_safe,
106 .ro_open = physical_op_open, 117 .ro_open = physical_op_open,
107 .ro_maxpages = physical_op_maxpages, 118 .ro_maxpages = physical_op_maxpages,
108 .ro_init = physical_op_init, 119 .ro_init = physical_op_init,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 888823bb6dae..35a81096e83d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -61,26 +61,84 @@ enum rpcrdma_chunktype {
61 rpcrdma_replych 61 rpcrdma_replych
62}; 62};
63 63
64#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
65static const char transfertypes[][12] = { 64static const char transfertypes[][12] = {
66 "pure inline", /* no chunks */ 65 "inline", /* no chunks */
67 " read chunk", /* some argument via rdma read */ 66 "read list", /* some argument via rdma read */
68 "*read chunk", /* entire request via rdma read */ 67 "*read list", /* entire request via rdma read */
69 "write chunk", /* some result via rdma write */ 68 "write list", /* some result via rdma write */
70 "reply chunk" /* entire reply via rdma write */ 69 "reply chunk" /* entire reply via rdma write */
71}; 70};
72#endif 71
72/* Returns size of largest RPC-over-RDMA header in a Call message
73 *
74 * The largest Call header contains a full-size Read list and a
75 * minimal Reply chunk.
76 */
77static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
78{
79 unsigned int size;
80
81 /* Fixed header fields and list discriminators */
82 size = RPCRDMA_HDRLEN_MIN;
83
84 /* Maximum Read list size */
85 maxsegs += 2; /* segment for head and tail buffers */
86 size = maxsegs * sizeof(struct rpcrdma_read_chunk);
87
 88 /* Minimal Reply chunk size */
89 size += sizeof(__be32); /* segment count */
90 size += sizeof(struct rpcrdma_segment);
91 size += sizeof(__be32); /* list discriminator */
92
93 dprintk("RPC: %s: max call header size = %u\n",
94 __func__, size);
95 return size;
96}
97
98/* Returns size of largest RPC-over-RDMA header in a Reply message
99 *
100 * There is only one Write list or one Reply chunk per Reply
101 * message. The larger list is the Write list.
102 */
103static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
104{
105 unsigned int size;
106
107 /* Fixed header fields and list discriminators */
108 size = RPCRDMA_HDRLEN_MIN;
109
110 /* Maximum Write list size */
111 maxsegs += 2; /* segment for head and tail buffers */
112 size = sizeof(__be32); /* segment count */
113 size += maxsegs * sizeof(struct rpcrdma_segment);
114 size += sizeof(__be32); /* list discriminator */
115
116 dprintk("RPC: %s: max reply header size = %u\n",
117 __func__, size);
118 return size;
119}
120
121void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
122 struct rpcrdma_create_data_internal *cdata,
123 unsigned int maxsegs)
124{
125 ia->ri_max_inline_write = cdata->inline_wsize -
126 rpcrdma_max_call_header_size(maxsegs);
127 ia->ri_max_inline_read = cdata->inline_rsize -
128 rpcrdma_max_reply_header_size(maxsegs);
129}
73 130
74/* The client can send a request inline as long as the RPCRDMA header 131/* The client can send a request inline as long as the RPCRDMA header
75 * plus the RPC call fit under the transport's inline limit. If the 132 * plus the RPC call fit under the transport's inline limit. If the
76 * combined call message size exceeds that limit, the client must use 133 * combined call message size exceeds that limit, the client must use
77 * the read chunk list for this operation. 134 * the read chunk list for this operation.
78 */ 135 */
79static bool rpcrdma_args_inline(struct rpc_rqst *rqst) 136static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
137 struct rpc_rqst *rqst)
80{ 138{
81 unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len; 139 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
82 140
83 return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); 141 return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
84} 142}
85 143
86/* The client can't know how large the actual reply will be. Thus it 144/* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
89 * limit, the client must provide a write list or a reply chunk for 147 * limit, the client must provide a write list or a reply chunk for
90 * this request. 148 * this request.
91 */ 149 */
92static bool rpcrdma_results_inline(struct rpc_rqst *rqst) 150static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
151 struct rpc_rqst *rqst)
93{ 152{
94 unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen; 153 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
95 154
96 return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst); 155 return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
97} 156}
98 157
99static int 158static int
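
With per-connection limits in place, the eligibility tests no longer add RPCRDMA_HDRLEN_MIN themselves; the worst-case header overhead is folded into ri_max_inline_write and ri_max_inline_read by rpcrdma_set_max_header_sizes(). A minimal userspace sketch of that arithmetic for a one-chunk configuration, assuming 1024-byte inline buffers, a 24-byte rpcrdma_read_chunk and a 16-byte rpcrdma_segment (the sizes and defaults are assumptions, not taken from this patch):

#include <stdio.h>

#define READ_CHUNK_SZ	24	/* assumed sizeof(struct rpcrdma_read_chunk) */
#define SEGMENT_SZ	16	/* assumed sizeof(struct rpcrdma_segment) */
#define BE32_SZ		4

/* Mirrors the arithmetic of rpcrdma_max_call_header_size() above. */
static unsigned int max_call_header(unsigned int maxsegs)
{
	unsigned int size;

	maxsegs += 2;				/* head and tail buffers */
	size = maxsegs * READ_CHUNK_SZ;		/* maximum Read list */
	size += BE32_SZ + SEGMENT_SZ + BE32_SZ;	/* minimal Reply chunk */
	return size;
}

/* Mirrors the arithmetic of rpcrdma_max_reply_header_size() above. */
static unsigned int max_reply_header(unsigned int maxsegs)
{
	unsigned int size;

	maxsegs += 2;				/* head and tail buffers */
	size = BE32_SZ;				/* segment count */
	size += maxsegs * SEGMENT_SZ;		/* maximum Write list */
	size += BE32_SZ;			/* list discriminator */
	return size;
}

int main(void)
{
	unsigned int wsize = 1024, rsize = 1024, maxsegs = 1;

	/* Prints 928 and 968 for these assumed values. */
	printf("ri_max_inline_write = %u\n", wsize - max_call_header(maxsegs));
	printf("ri_max_inline_read  = %u\n", rsize - max_reply_header(maxsegs));
	return 0;
}
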
@@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
226 return n; 285 return n;
227} 286}
228 287
229/* 288static inline __be32 *
230 * Create read/write chunk lists, and reply chunks, for RDMA 289xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
231 * 290{
232 * Assume check against THRESHOLD has been done, and chunks are required. 291 *iptr++ = cpu_to_be32(seg->mr_rkey);
233 * Assume only encoding one list entry for read|write chunks. The NFSv3 292 *iptr++ = cpu_to_be32(seg->mr_len);
234 * protocol is simple enough to allow this as it only has a single "bulk 293 return xdr_encode_hyper(iptr, seg->mr_base);
235 * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The 294}
236 * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.) 295
237 * 296/* XDR-encode the Read list. Supports encoding a list of read
238 * When used for a single reply chunk (which is a special write 297 * segments that belong to a single read chunk.
239 * chunk used for the entire reply, rather than just the data), it
240 * is used primarily for READDIR and READLINK which would otherwise
241 * be severely size-limited by a small rdma inline read max. The server
242 * response will come back as an RDMA Write, followed by a message
243 * of type RDMA_NOMSG carrying the xid and length. As a result, reply
244 * chunks do not provide data alignment, however they do not require
245 * "fixup" (moving the response to the upper layer buffer) either.
246 * 298 *
247 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 299 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
248 * 300 *
@@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
250 * N elements, position P (same P for all chunks of same arg!): 302 * N elements, position P (same P for all chunks of same arg!):
251 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 303 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
252 * 304 *
305 * Returns a pointer to the XDR word in the RDMA header following
306 * the end of the Read list, or an error pointer.
307 */
308static __be32 *
309rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
310 struct rpcrdma_req *req, struct rpc_rqst *rqst,
311 __be32 *iptr, enum rpcrdma_chunktype rtype)
312{
313 struct rpcrdma_mr_seg *seg = req->rl_nextseg;
314 unsigned int pos;
315 int n, nsegs;
316
317 if (rtype == rpcrdma_noch) {
318 *iptr++ = xdr_zero; /* item not present */
319 return iptr;
320 }
321
322 pos = rqst->rq_snd_buf.head[0].iov_len;
323 if (rtype == rpcrdma_areadch)
324 pos = 0;
325 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
326 RPCRDMA_MAX_SEGS - req->rl_nchunks);
327 if (nsegs < 0)
328 return ERR_PTR(nsegs);
329
330 do {
331 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
332 if (n <= 0)
333 return ERR_PTR(n);
334
335 *iptr++ = xdr_one; /* item present */
336
337 /* All read segments in this chunk
338 * have the same "position".
339 */
340 *iptr++ = cpu_to_be32(pos);
341 iptr = xdr_encode_rdma_segment(iptr, seg);
342
343 dprintk("RPC: %5u %s: read segment pos %u "
344 "%d@0x%016llx:0x%08x (%s)\n",
345 rqst->rq_task->tk_pid, __func__, pos,
346 seg->mr_len, (unsigned long long)seg->mr_base,
347 seg->mr_rkey, n < nsegs ? "more" : "last");
348
349 r_xprt->rx_stats.read_chunk_count++;
350 req->rl_nchunks++;
351 seg += n;
352 nsegs -= n;
353 } while (nsegs);
354 req->rl_nextseg = seg;
355
356 /* Finish Read list */
357 *iptr++ = xdr_zero; /* Next item not present */
358 return iptr;
359}
360
361/* XDR-encode the Write list. Supports encoding a list containing
362 * one array of plain segments that belong to a single write chunk.
363 *
364 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
365 *
253 * Write chunklist (a list of (one) counted array): 366 * Write chunklist (a list of (one) counted array):
254 * N elements: 367 * N elements:
255 * 1 - N - HLOO - HLOO - ... - HLOO - 0 368 * 1 - N - HLOO - HLOO - ... - HLOO - 0
256 * 369 *
370 * Returns a pointer to the XDR word in the RDMA header following
371 * the end of the Write list, or an error pointer.
372 */
373static __be32 *
374rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
375 struct rpc_rqst *rqst, __be32 *iptr,
376 enum rpcrdma_chunktype wtype)
377{
378 struct rpcrdma_mr_seg *seg = req->rl_nextseg;
379 int n, nsegs, nchunks;
380 __be32 *segcount;
381
382 if (wtype != rpcrdma_writech) {
383 *iptr++ = xdr_zero; /* no Write list present */
384 return iptr;
385 }
386
387 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
388 rqst->rq_rcv_buf.head[0].iov_len,
389 wtype, seg,
390 RPCRDMA_MAX_SEGS - req->rl_nchunks);
391 if (nsegs < 0)
392 return ERR_PTR(nsegs);
393
394 *iptr++ = xdr_one; /* Write list present */
395 segcount = iptr++; /* save location of segment count */
396
397 nchunks = 0;
398 do {
399 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
400 if (n <= 0)
401 return ERR_PTR(n);
402
403 iptr = xdr_encode_rdma_segment(iptr, seg);
404
405 dprintk("RPC: %5u %s: write segment "
406 "%d@0x016%llx:0x%08x (%s)\n",
407 rqst->rq_task->tk_pid, __func__,
408 seg->mr_len, (unsigned long long)seg->mr_base,
409 seg->mr_rkey, n < nsegs ? "more" : "last");
410
411 r_xprt->rx_stats.write_chunk_count++;
412 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
413 req->rl_nchunks++;
414 nchunks++;
415 seg += n;
416 nsegs -= n;
417 } while (nsegs);
418 req->rl_nextseg = seg;
419
420 /* Update count of segments in this Write chunk */
421 *segcount = cpu_to_be32(nchunks);
422
423 /* Finish Write list */
424 *iptr++ = xdr_zero; /* Next item not present */
425 return iptr;
426}
427
428/* XDR-encode the Reply chunk. Supports encoding an array of plain
429 * segments that belong to a single write (reply) chunk.
430 *
431 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
432 *
257 * Reply chunk (a counted array): 433 * Reply chunk (a counted array):
258 * N elements: 434 * N elements:
259 * 1 - N - HLOO - HLOO - ... - HLOO 435 * 1 - N - HLOO - HLOO - ... - HLOO
260 * 436 *
261 * Returns positive RPC/RDMA header size, or negative errno. 437 * Returns a pointer to the XDR word in the RDMA header following
438 * the end of the Reply chunk, or an error pointer.
262 */ 439 */
263 440static __be32 *
264static ssize_t 441rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
265rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, 442 struct rpcrdma_req *req, struct rpc_rqst *rqst,
266 struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) 443 __be32 *iptr, enum rpcrdma_chunktype wtype)
267{ 444{
268 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 445 struct rpcrdma_mr_seg *seg = req->rl_nextseg;
269 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 446 int n, nsegs, nchunks;
270 int n, nsegs, nchunks = 0; 447 __be32 *segcount;
271 unsigned int pos;
272 struct rpcrdma_mr_seg *seg = req->rl_segments;
273 struct rpcrdma_read_chunk *cur_rchunk = NULL;
274 struct rpcrdma_write_array *warray = NULL;
275 struct rpcrdma_write_chunk *cur_wchunk = NULL;
276 __be32 *iptr = headerp->rm_body.rm_chunks;
277 int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
278
279 if (type == rpcrdma_readch || type == rpcrdma_areadch) {
280 /* a read chunk - server will RDMA Read our memory */
281 cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
282 } else {
283 /* a write or reply chunk - server will RDMA Write our memory */
284 *iptr++ = xdr_zero; /* encode a NULL read chunk list */
285 if (type == rpcrdma_replych)
286 *iptr++ = xdr_zero; /* a NULL write chunk list */
287 warray = (struct rpcrdma_write_array *) iptr;
288 cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
289 }
290 448
291 if (type == rpcrdma_replych || type == rpcrdma_areadch) 449 if (wtype != rpcrdma_replych) {
292 pos = 0; 450 *iptr++ = xdr_zero; /* no Reply chunk present */
293 else 451 return iptr;
294 pos = target->head[0].iov_len; 452 }
295 453
296 nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); 454 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
455 RPCRDMA_MAX_SEGS - req->rl_nchunks);
297 if (nsegs < 0) 456 if (nsegs < 0)
298 return nsegs; 457 return ERR_PTR(nsegs);
299 458
300 map = r_xprt->rx_ia.ri_ops->ro_map; 459 *iptr++ = xdr_one; /* Reply chunk present */
460 segcount = iptr++; /* save location of segment count */
461
462 nchunks = 0;
301 do { 463 do {
302 n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); 464 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
303 if (n <= 0) 465 if (n <= 0)
304 goto out; 466 return ERR_PTR(n);
305 if (cur_rchunk) { /* read */ 467
306 cur_rchunk->rc_discrim = xdr_one; 468 iptr = xdr_encode_rdma_segment(iptr, seg);
307 /* all read chunks have the same "position" */ 469
308 cur_rchunk->rc_position = cpu_to_be32(pos); 470 dprintk("RPC: %5u %s: reply segment "
309 cur_rchunk->rc_target.rs_handle = 471 "%d@0x%016llx:0x%08x (%s)\n",
310 cpu_to_be32(seg->mr_rkey); 472 rqst->rq_task->tk_pid, __func__,
311 cur_rchunk->rc_target.rs_length = 473 seg->mr_len, (unsigned long long)seg->mr_base,
312 cpu_to_be32(seg->mr_len); 474 seg->mr_rkey, n < nsegs ? "more" : "last");
313 xdr_encode_hyper( 475
314 (__be32 *)&cur_rchunk->rc_target.rs_offset, 476 r_xprt->rx_stats.reply_chunk_count++;
315 seg->mr_base); 477 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
316 dprintk("RPC: %s: read chunk " 478 req->rl_nchunks++;
317 "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
318 seg->mr_len, (unsigned long long)seg->mr_base,
319 seg->mr_rkey, pos, n < nsegs ? "more" : "last");
320 cur_rchunk++;
321 r_xprt->rx_stats.read_chunk_count++;
322 } else { /* write/reply */
323 cur_wchunk->wc_target.rs_handle =
324 cpu_to_be32(seg->mr_rkey);
325 cur_wchunk->wc_target.rs_length =
326 cpu_to_be32(seg->mr_len);
327 xdr_encode_hyper(
328 (__be32 *)&cur_wchunk->wc_target.rs_offset,
329 seg->mr_base);
330 dprintk("RPC: %s: %s chunk "
331 "elem %d@0x%llx:0x%x (%s)\n", __func__,
332 (type == rpcrdma_replych) ? "reply" : "write",
333 seg->mr_len, (unsigned long long)seg->mr_base,
334 seg->mr_rkey, n < nsegs ? "more" : "last");
335 cur_wchunk++;
336 if (type == rpcrdma_replych)
337 r_xprt->rx_stats.reply_chunk_count++;
338 else
339 r_xprt->rx_stats.write_chunk_count++;
340 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
341 }
342 nchunks++; 479 nchunks++;
343 seg += n; 480 seg += n;
344 nsegs -= n; 481 nsegs -= n;
345 } while (nsegs); 482 } while (nsegs);
483 req->rl_nextseg = seg;
346 484
347 /* success. all failures return above */ 485 /* Update count of segments in the Reply chunk */
348 req->rl_nchunks = nchunks; 486 *segcount = cpu_to_be32(nchunks);
349
350 /*
351 * finish off header. If write, marshal discrim and nchunks.
352 */
353 if (cur_rchunk) {
354 iptr = (__be32 *) cur_rchunk;
355 *iptr++ = xdr_zero; /* finish the read chunk list */
356 *iptr++ = xdr_zero; /* encode a NULL write chunk list */
357 *iptr++ = xdr_zero; /* encode a NULL reply chunk */
358 } else {
359 warray->wc_discrim = xdr_one;
360 warray->wc_nchunks = cpu_to_be32(nchunks);
361 iptr = (__be32 *) cur_wchunk;
362 if (type == rpcrdma_writech) {
363 *iptr++ = xdr_zero; /* finish the write chunk list */
364 *iptr++ = xdr_zero; /* encode a NULL reply chunk */
365 }
366 }
367
368 /*
369 * Return header size.
370 */
371 return (unsigned char *)iptr - (unsigned char *)headerp;
372 487
373out: 488 return iptr;
374 for (pos = 0; nchunks--;)
375 pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
376 &req->rl_segments[pos]);
377 return n;
378} 489}
379 490
380/* 491/*
@@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
440 * Marshal a request: the primary job of this routine is to choose 551 * Marshal a request: the primary job of this routine is to choose
441 * the transfer modes. See comments below. 552 * the transfer modes. See comments below.
442 * 553 *
443 * Uses multiple RDMA IOVs for a request: 554 * Prepares up to two IOVs per Call message:
444 * [0] -- RPC RDMA header, which uses memory from the *start* of the 555 *
445 * preregistered buffer that already holds the RPC data in 556 * [0] -- RPC RDMA header
446 * its middle. 557 * [1] -- the RPC header/data
447 * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
448 * [2] -- optional padding.
449 * [3] -- if padded, header only in [1] and data here.
450 * 558 *
451 * Returns zero on success, otherwise a negative errno. 559 * Returns zero on success, otherwise a negative errno.
452 */ 560 */
@@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
457 struct rpc_xprt *xprt = rqst->rq_xprt; 565 struct rpc_xprt *xprt = rqst->rq_xprt;
458 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 566 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
459 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 567 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
460 char *base;
461 size_t rpclen;
462 ssize_t hdrlen;
463 enum rpcrdma_chunktype rtype, wtype; 568 enum rpcrdma_chunktype rtype, wtype;
464 struct rpcrdma_msg *headerp; 569 struct rpcrdma_msg *headerp;
570 ssize_t hdrlen;
571 size_t rpclen;
572 __be32 *iptr;
465 573
466#if defined(CONFIG_SUNRPC_BACKCHANNEL) 574#if defined(CONFIG_SUNRPC_BACKCHANNEL)
467 if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) 575 if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
468 return rpcrdma_bc_marshal_reply(rqst); 576 return rpcrdma_bc_marshal_reply(rqst);
469#endif 577#endif
470 578
471 /*
472 * rpclen gets amount of data in first buffer, which is the
473 * pre-registered buffer.
474 */
475 base = rqst->rq_svec[0].iov_base;
476 rpclen = rqst->rq_svec[0].iov_len;
477
478 headerp = rdmab_to_msg(req->rl_rdmabuf); 579 headerp = rdmab_to_msg(req->rl_rdmabuf);
479 /* don't byte-swap XID, it's already done in request */ 580 /* don't byte-swap XID, it's already done in request */
480 headerp->rm_xid = rqst->rq_xid; 581 headerp->rm_xid = rqst->rq_xid;
@@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
485 /* 586 /*
486 * Chunks needed for results? 587 * Chunks needed for results?
487 * 588 *
488 * o Read ops return data as write chunk(s), header as inline.
489 * o If the expected result is under the inline threshold, all ops 589 * o If the expected result is under the inline threshold, all ops
490 * return as inline. 590 * return as inline.
591 * o Large read ops return data as write chunk(s), header as
592 * inline.
491 * o Large non-read ops return as a single reply chunk. 593 * o Large non-read ops return as a single reply chunk.
492 */ 594 */
493 if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 595 if (rpcrdma_results_inline(r_xprt, rqst))
494 wtype = rpcrdma_writech;
495 else if (rpcrdma_results_inline(rqst))
496 wtype = rpcrdma_noch; 596 wtype = rpcrdma_noch;
597 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
598 wtype = rpcrdma_writech;
497 else 599 else
498 wtype = rpcrdma_replych; 600 wtype = rpcrdma_replych;
499 601
@@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
511 * that both has a data payload, and whose non-data arguments 613 * that both has a data payload, and whose non-data arguments
512 * by themselves are larger than the inline threshold. 614 * by themselves are larger than the inline threshold.
513 */ 615 */
514 if (rpcrdma_args_inline(rqst)) { 616 if (rpcrdma_args_inline(r_xprt, rqst)) {
515 rtype = rpcrdma_noch; 617 rtype = rpcrdma_noch;
618 rpcrdma_inline_pullup(rqst);
619 rpclen = rqst->rq_svec[0].iov_len;
516 } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 620 } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
517 rtype = rpcrdma_readch; 621 rtype = rpcrdma_readch;
622 rpclen = rqst->rq_svec[0].iov_len;
623 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
518 } else { 624 } else {
519 r_xprt->rx_stats.nomsg_call_count++; 625 r_xprt->rx_stats.nomsg_call_count++;
520 headerp->rm_type = htonl(RDMA_NOMSG); 626 headerp->rm_type = htonl(RDMA_NOMSG);
@@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
522 rpclen = 0; 628 rpclen = 0;
523 } 629 }
524 630
525 /* The following simplification is not true forever */ 631 /* This implementation supports the following combinations
526 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) 632 * of chunk lists in one RPC-over-RDMA Call message:
527 wtype = rpcrdma_noch; 633 *
528 if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { 634 * - Read list
529 dprintk("RPC: %s: cannot marshal multiple chunk lists\n", 635 * - Write list
530 __func__); 636 * - Reply chunk
531 return -EIO; 637 * - Read list + Reply chunk
532 } 638 *
533 639 * It might not yet support the following combinations:
534 hdrlen = RPCRDMA_HDRLEN_MIN; 640 *
535 641 * - Read list + Write list
536 /* 642 *
537 * Pull up any extra send data into the preregistered buffer. 643 * It does not support the following combinations:
538 * When padding is in use and applies to the transfer, insert 644 *
539 * it and change the message type. 645 * - Write list + Reply chunk
646 * - Read list + Write list + Reply chunk
647 *
648 * This implementation supports only a single chunk in each
649 * Read or Write list. Thus for example the client cannot
650 * send a Call message with a Position Zero Read chunk and a
651 * regular Read chunk at the same time.
540 */ 652 */
541 if (rtype == rpcrdma_noch) { 653 req->rl_nchunks = 0;
542 654 req->rl_nextseg = req->rl_segments;
543 rpcrdma_inline_pullup(rqst); 655 iptr = headerp->rm_body.rm_chunks;
544 656 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
545 headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; 657 if (IS_ERR(iptr))
546 headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; 658 goto out_unmap;
547 headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; 659 iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
548 /* new length after pullup */ 660 if (IS_ERR(iptr))
549 rpclen = rqst->rq_svec[0].iov_len; 661 goto out_unmap;
550 } else if (rtype == rpcrdma_readch) 662 iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
551 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); 663 if (IS_ERR(iptr))
552 if (rtype != rpcrdma_noch) { 664 goto out_unmap;
553 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, 665 hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
554 headerp, rtype); 666
555 wtype = rtype; /* simplify dprintk */ 667 if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
556 668 goto out_overflow;
557 } else if (wtype != rpcrdma_noch) { 669
558 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, 670 dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
559 headerp, wtype); 671 rqst->rq_task->tk_pid, __func__,
560 } 672 transfertypes[rtype], transfertypes[wtype],
561 if (hdrlen < 0) 673 hdrlen, rpclen);
562 return hdrlen;
563 674
564 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
565 " headerp 0x%p base 0x%p lkey 0x%x\n",
566 __func__, transfertypes[wtype], hdrlen, rpclen,
567 headerp, base, rdmab_lkey(req->rl_rdmabuf));
568
569 /*
570 * initialize send_iov's - normally only two: rdma chunk header and
571 * single preregistered RPC header buffer, but if padding is present,
572 * then use a preregistered (and zeroed) pad buffer between the RPC
573 * header and any write data. In all non-rdma cases, any following
574 * data has been copied into the RPC header buffer.
575 */
576 req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); 675 req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
577 req->rl_send_iov[0].length = hdrlen; 676 req->rl_send_iov[0].length = hdrlen;
578 req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); 677 req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
@@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
587 686
588 req->rl_niovs = 2; 687 req->rl_niovs = 2;
589 return 0; 688 return 0;
689
690out_overflow:
691 pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
692 hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
693 /* Terminate this RPC. Chunks registered above will be
 695 * released by xprt_release -> xprt_rdma_free.
695 */
696 return -EIO;
697
698out_unmap:
699 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
700 return PTR_ERR(iptr);
590} 701}
591 702
592/* 703/*
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b1b009f10ea3..99d2e5b72726 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
73 73
74static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; 74static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
75static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; 75static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
76static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
77static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
76static unsigned int zero; 78static unsigned int zero;
77static unsigned int max_padding = PAGE_SIZE; 79static unsigned int max_padding = PAGE_SIZE;
78static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; 80static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
96 .maxlen = sizeof(unsigned int), 98 .maxlen = sizeof(unsigned int),
97 .mode = 0644, 99 .mode = 0644,
98 .proc_handler = proc_dointvec, 100 .proc_handler = proc_dointvec,
101 .extra1 = &min_inline_size,
102 .extra2 = &max_inline_size,
99 }, 103 },
100 { 104 {
101 .procname = "rdma_max_inline_write", 105 .procname = "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
103 .maxlen = sizeof(unsigned int), 107 .maxlen = sizeof(unsigned int),
104 .mode = 0644, 108 .mode = 0644,
105 .proc_handler = proc_dointvec, 109 .proc_handler = proc_dointvec,
110 .extra1 = &min_inline_size,
111 .extra2 = &max_inline_size,
106 }, 112 },
107 { 113 {
108 .procname = "rdma_inline_write_padding", 114 .procname = "rdma_inline_write_padding",
@@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
508out: 514out:
509 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); 515 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
510 req->rl_connect_cookie = 0; /* our reserved value */ 516 req->rl_connect_cookie = 0; /* our reserved value */
517 req->rl_task = task;
511 return req->rl_sendbuf->rg_base; 518 return req->rl_sendbuf->rg_base;
512 519
513out_rdmabuf: 520out_rdmabuf:
@@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer)
564 struct rpcrdma_req *req; 571 struct rpcrdma_req *req;
565 struct rpcrdma_xprt *r_xprt; 572 struct rpcrdma_xprt *r_xprt;
566 struct rpcrdma_regbuf *rb; 573 struct rpcrdma_regbuf *rb;
567 int i;
568 574
569 if (buffer == NULL) 575 if (buffer == NULL)
570 return; 576 return;
@@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer)
578 584
579 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); 585 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
580 586
581 for (i = 0; req->rl_nchunks;) { 587 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
582 --req->rl_nchunks; 588 !RPC_IS_ASYNC(req->rl_task));
583 i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
584 &req->rl_segments[i]);
585 }
586 589
587 rpcrdma_buffer_put(req); 590 rpcrdma_buffer_put(req);
588} 591}
@@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
707#if defined(CONFIG_SUNRPC_BACKCHANNEL) 710#if defined(CONFIG_SUNRPC_BACKCHANNEL)
708 .bc_setup = xprt_rdma_bc_setup, 711 .bc_setup = xprt_rdma_bc_setup,
709 .bc_up = xprt_rdma_bc_up, 712 .bc_up = xprt_rdma_bc_up,
713 .bc_maxpayload = xprt_rdma_bc_maxpayload,
710 .bc_free_rqst = xprt_rdma_bc_free_rqst, 714 .bc_free_rqst = xprt_rdma_bc_free_rqst,
711 .bc_destroy = xprt_rdma_bc_destroy, 715 .bc_destroy = xprt_rdma_bc_destroy,
712#endif 716#endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f5ed9f982cd7..b044d98a1370 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -203,15 +203,6 @@ out_fail:
203 goto out_schedule; 203 goto out_schedule;
204} 204}
205 205
206static void
207rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
208{
209 struct ib_wc wc;
210
211 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
212 rpcrdma_receive_wc(NULL, &wc);
213}
214
215static int 206static int
216rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) 207rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
217{ 208{
@@ -374,23 +365,6 @@ out:
374} 365}
375 366
376/* 367/*
377 * Drain any cq, prior to teardown.
378 */
379static void
380rpcrdma_clean_cq(struct ib_cq *cq)
381{
382 struct ib_wc wc;
383 int count = 0;
384
385 while (1 == ib_poll_cq(cq, 1, &wc))
386 ++count;
387
388 if (count)
389 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
390 __func__, count, wc.opcode);
391}
392
393/*
394 * Exported functions. 368 * Exported functions.
395 */ 369 */
396 370
@@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
459 dprintk("RPC: %s: memory registration strategy is '%s'\n", 433 dprintk("RPC: %s: memory registration strategy is '%s'\n",
460 __func__, ia->ri_ops->ro_displayname); 434 __func__, ia->ri_ops->ro_displayname);
461 435
462 rwlock_init(&ia->ri_qplock);
463 return 0; 436 return 0;
464 437
465out3: 438out3:
@@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
515 __func__); 488 __func__);
516 return -ENOMEM; 489 return -ENOMEM;
517 } 490 }
518 max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; 491 max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
519 492
520 /* check provider's send/recv wr limits */ 493 /* check provider's send/recv wr limits */
521 if (cdata->max_requests > max_qp_wr) 494 if (cdata->max_requests > max_qp_wr)
@@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
526 ep->rep_attr.srq = NULL; 499 ep->rep_attr.srq = NULL;
527 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 500 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
528 ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; 501 ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
502 ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */
529 rc = ia->ri_ops->ro_open(ia, ep, cdata); 503 rc = ia->ri_ops->ro_open(ia, ep, cdata);
530 if (rc) 504 if (rc)
531 return rc; 505 return rc;
532 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 506 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
533 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; 507 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
508 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
534 ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; 509 ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
535 ep->rep_attr.cap.max_recv_sge = 1; 510 ep->rep_attr.cap.max_recv_sge = 1;
536 ep->rep_attr.cap.max_inline_data = 0; 511 ep->rep_attr.cap.max_inline_data = 0;
@@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
578 ep->rep_attr.recv_cq = recvcq; 553 ep->rep_attr.recv_cq = recvcq;
579 554
580 /* Initialize cma parameters */ 555 /* Initialize cma parameters */
556 memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
581 557
582 /* RPC/RDMA does not use private data */ 558 /* RPC/RDMA does not use private data */
583 ep->rep_remote_cma.private_data = NULL; 559 ep->rep_remote_cma.private_data = NULL;
@@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
591 ep->rep_remote_cma.responder_resources = 567 ep->rep_remote_cma.responder_resources =
592 ia->ri_device->attrs.max_qp_rd_atom; 568 ia->ri_device->attrs.max_qp_rd_atom;
593 569
594 ep->rep_remote_cma.retry_count = 7; 570 /* Limit transport retries so client can detect server
571 * GID changes quickly. RPC layer handles re-establishing
572 * transport connection and retransmission.
573 */
574 ep->rep_remote_cma.retry_count = 6;
575
576 /* RPC-over-RDMA handles its own flow control. In addition,
577 * make all RNR NAKs visible so we know that RPC-over-RDMA
578 * flow control is working correctly (no NAKs should be seen).
579 */
595 ep->rep_remote_cma.flow_control = 0; 580 ep->rep_remote_cma.flow_control = 0;
596 ep->rep_remote_cma.rnr_retry_count = 0; 581 ep->rep_remote_cma.rnr_retry_count = 0;
597 582
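These assignments fill the struct rdma_conn_param handed to rdma_connect(). For InfiniBand, both retry fields are 3-bit values in the CM REQ, so 7 is the largest possible setting and an RNR retry count of 7 conventionally means retry indefinitely; the values chosen above therefore trade a little retry persistence for faster failure detection. A short annotated restatement, illustrative only and not part of the patch:

/* Illustrative only: connection parameters as they end up in
 * struct rdma_conn_param (<rdma/rdma_cm.h>).
 */
static void example_conn_params(struct rdma_conn_param *cma)
{
	cma->retry_count = 6;		/* < 7: give up sooner on a dead path */
	cma->flow_control = 0;		/* RPC-over-RDMA manages its own credits */
	cma->rnr_retry_count = 0;	/* no RNR retries: NAKs surface at once */
}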
@@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
622 607
623 cancel_delayed_work_sync(&ep->rep_connect_worker); 608 cancel_delayed_work_sync(&ep->rep_connect_worker);
624 609
625 if (ia->ri_id->qp)
626 rpcrdma_ep_disconnect(ep, ia);
627
628 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
629 rpcrdma_clean_cq(ep->rep_attr.send_cq);
630
631 if (ia->ri_id->qp) { 610 if (ia->ri_id->qp) {
611 rpcrdma_ep_disconnect(ep, ia);
632 rdma_destroy_qp(ia->ri_id); 612 rdma_destroy_qp(ia->ri_id);
633 ia->ri_id->qp = NULL; 613 ia->ri_id->qp = NULL;
634 } 614 }
@@ -659,7 +639,6 @@ retry:
659 dprintk("RPC: %s: reconnecting...\n", __func__); 639 dprintk("RPC: %s: reconnecting...\n", __func__);
660 640
661 rpcrdma_ep_disconnect(ep, ia); 641 rpcrdma_ep_disconnect(ep, ia);
662 rpcrdma_flush_cqs(ep);
663 642
664 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 643 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
665 id = rpcrdma_create_id(xprt, ia, 644 id = rpcrdma_create_id(xprt, ia,
@@ -692,10 +671,8 @@ retry:
692 goto out; 671 goto out;
693 } 672 }
694 673
695 write_lock(&ia->ri_qplock);
696 old = ia->ri_id; 674 old = ia->ri_id;
697 ia->ri_id = id; 675 ia->ri_id = id;
698 write_unlock(&ia->ri_qplock);
699 676
700 rdma_destroy_qp(old); 677 rdma_destroy_qp(old);
701 rpcrdma_destroy_id(old); 678 rpcrdma_destroy_id(old);
@@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
785{ 762{
786 int rc; 763 int rc;
787 764
788 rpcrdma_flush_cqs(ep);
789 rc = rdma_disconnect(ia->ri_id); 765 rc = rdma_disconnect(ia->ri_id);
790 if (!rc) { 766 if (!rc) {
791 /* returns without wait if not connected */ 767 /* returns without wait if not connected */
@@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
797 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); 773 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
798 ep->rep_connected = rc; 774 ep->rep_connected = rc;
799 } 775 }
776
777 ib_drain_qp(ia->ri_id->qp);
800} 778}
801 779
802struct rpcrdma_req * 780struct rpcrdma_req *
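ib_drain_qp() is the core verbs helper that takes over from the hand-rolled rpcrdma_flush_cqs()/rpcrdma_clean_cq() polling removed earlier in this patch: it flushes the queue pair and waits until every outstanding work request has completed, which is why one extra WR and CQE are reserved per queue at endpoint-creation time. A simplified, hypothetical illustration of the send-side idea (the real implementation lives in the IB core, not in this file), assuming the CQ is driven by the core's completion handlers:

/* Simplified sketch of draining a send queue: move the QP to the
 * error state so pending WRs flush, post one marker WR, and wait
 * for its completion.  Everything posted before the marker has
 * completed by the time it fires.
 */
struct drain_ctx {
	struct ib_cqe cqe;
	struct completion done;
};

static void drain_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct drain_ctx *ctx = container_of(wc->wr_cqe, struct drain_ctx, cqe);

	complete(&ctx->done);
}

static void example_drain_sq(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_send_wr wr = { }, *bad_wr;
	struct drain_ctx ctx;

	init_completion(&ctx.done);
	ctx.cqe.done = drain_done;
	wr.wr_cqe = &ctx.cqe;			/* the reserved "drain cqe" */
	wr.opcode = IB_WR_RDMA_WRITE;		/* flushes without executing */

	if (ib_modify_qp(qp, &attr, IB_QP_STATE))
		return;
	if (!ib_post_send(qp, &wr, &bad_wr))
		wait_for_completion(&ctx.done);
}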
@@ -1271,25 +1249,3 @@ out_rc:
1271 rpcrdma_recv_buffer_put(rep); 1249 rpcrdma_recv_buffer_put(rep);
1272 return rc; 1250 return rc;
1273} 1251}
1274
1275/* How many chunk list items fit within our inline buffers?
1276 */
1277unsigned int
1278rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
1279{
1280 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1281 int bytes, segments;
1282
1283 bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
1284 bytes -= RPCRDMA_HDRLEN_MIN;
1285 if (bytes < sizeof(struct rpcrdma_segment) * 2) {
1286 pr_warn("RPC: %s: inline threshold too small\n",
1287 __func__);
1288 return 0;
1289 }
1290
1291 segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
1292 dprintk("RPC: %s: max chunk list size = %d segments\n",
1293 __func__, segments);
1294 return segments;
1295}
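For context on the removal of rpcrdma_max_segments(): the old helper derived the chunk-list capacity from the negotiated inline buffer sizes. Assuming the default 1024-byte inline threshold, an RPCRDMA_HDRLEN_MIN of 28 bytes and a 16-byte struct rpcrdma_segment (assumptions stated here, not in the patch), it allowed 1024 - 28 = 996 bytes, 996 / 16 = 62 segments, rounded down to the nearest power of two = 32 segments. At roughly 24 bytes per Read list element, a list that long costs 32 * 24 = 768 bytes of header, which is exactly the kind of Send-buffer overrun the fixed RPCRDMA_MAX_HDR_SEGS cap of 8 (added to xprt_rdma.h below, alongside the rpcrdma_set_max_header_sizes() declaration) is meant to rule out.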
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 2ebc743cb96f..95cdc66225ee 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,7 +65,6 @@
65 */ 65 */
66struct rpcrdma_ia { 66struct rpcrdma_ia {
67 const struct rpcrdma_memreg_ops *ri_ops; 67 const struct rpcrdma_memreg_ops *ri_ops;
68 rwlock_t ri_qplock;
69 struct ib_device *ri_device; 68 struct ib_device *ri_device;
70 struct rdma_cm_id *ri_id; 69 struct rdma_cm_id *ri_id;
71 struct ib_pd *ri_pd; 70 struct ib_pd *ri_pd;
@@ -73,6 +72,8 @@ struct rpcrdma_ia {
73 struct completion ri_done; 72 struct completion ri_done;
74 int ri_async_rc; 73 int ri_async_rc;
75 unsigned int ri_max_frmr_depth; 74 unsigned int ri_max_frmr_depth;
75 unsigned int ri_max_inline_write;
76 unsigned int ri_max_inline_read;
76 struct ib_qp_attr ri_qp_attr; 77 struct ib_qp_attr ri_qp_attr;
77 struct ib_qp_init_attr ri_qp_init_attr; 78 struct ib_qp_init_attr ri_qp_init_attr;
78}; 79};
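The two new ri_max_inline_* fields record, per connection, how much RPC payload can travel inline in each direction once worst-case chunk lists are subtracted from the inline thresholds; they are presumably filled in by rpcrdma_set_max_header_sizes(), whose declaration appears later in this diff. A hedged sketch of the kind of check that would consult them; the helper name here is illustrative (in the patched rpc_rdma.c this role appears to belong to rpcrdma_args_inline()):

/* Illustrative only: does this outgoing call fit in the inline
 * portion of the Send buffer, or does it need a Read list?
 */
static bool example_call_fits_inline(struct rpcrdma_ia *ia,
				     struct rpc_rqst *rqst)
{
	return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
}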
@@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
144 145
145#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) 146#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
146 147
148/* To ensure a transport can always make forward progress,
149 * the number of RDMA segments allowed in header chunk lists
150 * is capped at 8. This prevents less-capable devices and
151 * memory registrations from overrunning the Send buffer
152 * while building chunk lists.
153 *
154 * Elements of the Read list take up more room than the
155 * Write list or Reply chunk. 8 read segments means the Read
156 * list (or Write list or Reply chunk) cannot consume more
157 * than
158 *
159 * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
160 *
161 * And the fixed part of the header is another 24 bytes.
162 *
163 * The smallest inline threshold is 1024 bytes, ensuring that
164 * at least 750 bytes are available for RPC messages.
165 */
166#define RPCRDMA_MAX_HDR_SEGS (8)
167
147/* 168/*
148 * struct rpcrdma_rep -- this structure encapsulates state required to recv 169 * struct rpcrdma_rep -- this structure encapsulates state required to recv
149 * and complete a reply, asynchronously. It needs several pieces of 170
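Checking the arithmetic in the new comment: each Read list element on the wire is a 4-byte list discriminator, a 4-byte position and a 16-byte rpcrdma_segment (handle, length, 8-byte offset), i.e. 24 bytes. With 8 data segments plus, presumably, 2 more for head and tail buffers, that gives (8 + 2) * 24 bytes + 4 bytes for the final discriminator word = 244 bytes. Adding the roughly 24-byte fixed header comes to about 268 bytes, so a minimum 1024-byte inline threshold leaves 1024 - 268 = 756 bytes for the RPC message itself, matching the "at least 750 bytes" figure.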
@@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
162 */ 183 */
163 184
164#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) 185#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
165#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ 186
187/* data segments + head/tail for Call + head/tail for Reply */
188#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
166 189
167struct rpcrdma_buffer; 190struct rpcrdma_buffer;
168 191
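With 4 KB pages, RPCRDMA_MAX_DATA_SEGS works out to (1 MB / 4 KB) = 256, so the per-request segment array grows from 258 to 260 entries: the old definition reserved head and tail slots once, while the new one reserves them for both the Call and the Reply, as the comment says.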
@@ -198,14 +221,13 @@ enum rpcrdma_frmr_state {
198}; 221};
199 222
200struct rpcrdma_frmr { 223struct rpcrdma_frmr {
201 struct scatterlist *sg; 224 struct scatterlist *fr_sg;
202 int sg_nents; 225 int fr_nents;
226 enum dma_data_direction fr_dir;
203 struct ib_mr *fr_mr; 227 struct ib_mr *fr_mr;
204 struct ib_cqe fr_cqe; 228 struct ib_cqe fr_cqe;
205 enum rpcrdma_frmr_state fr_state; 229 enum rpcrdma_frmr_state fr_state;
206 struct completion fr_linv_done; 230 struct completion fr_linv_done;
207 struct work_struct fr_work;
208 struct rpcrdma_xprt *fr_xprt;
209 union { 231 union {
210 struct ib_reg_wr fr_regwr; 232 struct ib_reg_wr fr_regwr;
211 struct ib_send_wr fr_invwr; 233 struct ib_send_wr fr_invwr;
@@ -222,6 +244,8 @@ struct rpcrdma_mw {
222 struct rpcrdma_fmr fmr; 244 struct rpcrdma_fmr fmr;
223 struct rpcrdma_frmr frmr; 245 struct rpcrdma_frmr frmr;
224 }; 246 };
247 struct work_struct mw_work;
248 struct rpcrdma_xprt *mw_xprt;
225 struct list_head mw_list; 249 struct list_head mw_list;
226 struct list_head mw_all; 250 struct list_head mw_all;
227}; 251};
@@ -270,12 +294,14 @@ struct rpcrdma_req {
270 unsigned int rl_niovs; 294 unsigned int rl_niovs;
271 unsigned int rl_nchunks; 295 unsigned int rl_nchunks;
272 unsigned int rl_connect_cookie; 296 unsigned int rl_connect_cookie;
297 struct rpc_task *rl_task;
273 struct rpcrdma_buffer *rl_buffer; 298 struct rpcrdma_buffer *rl_buffer;
274 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 299 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
275 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; 300 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
276 struct rpcrdma_regbuf *rl_rdmabuf; 301 struct rpcrdma_regbuf *rl_rdmabuf;
277 struct rpcrdma_regbuf *rl_sendbuf; 302 struct rpcrdma_regbuf *rl_sendbuf;
278 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; 303 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
304 struct rpcrdma_mr_seg *rl_nextseg;
279 305
280 struct ib_cqe rl_cqe; 306 struct ib_cqe rl_cqe;
281 struct list_head rl_all; 307 struct list_head rl_all;
@@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops {
372 struct rpcrdma_mr_seg *, int, bool); 398 struct rpcrdma_mr_seg *, int, bool);
373 void (*ro_unmap_sync)(struct rpcrdma_xprt *, 399 void (*ro_unmap_sync)(struct rpcrdma_xprt *,
374 struct rpcrdma_req *); 400 struct rpcrdma_req *);
375 int (*ro_unmap)(struct rpcrdma_xprt *, 401 void (*ro_unmap_safe)(struct rpcrdma_xprt *,
376 struct rpcrdma_mr_seg *); 402 struct rpcrdma_req *, bool);
377 int (*ro_open)(struct rpcrdma_ia *, 403 int (*ro_open)(struct rpcrdma_ia *,
378 struct rpcrdma_ep *, 404 struct rpcrdma_ep *,
379 struct rpcrdma_create_data_internal *); 405 struct rpcrdma_create_data_internal *);
@@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
456void rpcrdma_free_regbuf(struct rpcrdma_ia *, 482void rpcrdma_free_regbuf(struct rpcrdma_ia *,
457 struct rpcrdma_regbuf *); 483 struct rpcrdma_regbuf *);
458 484
459unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
460int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); 485int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
461 486
462int frwr_alloc_recovery_wq(void); 487int frwr_alloc_recovery_wq(void);
@@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
519 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c 544 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
520 */ 545 */
521int rpcrdma_marshal_req(struct rpc_rqst *); 546int rpcrdma_marshal_req(struct rpc_rqst *);
547void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
548 struct rpcrdma_create_data_internal *,
549 unsigned int);
522 550
523/* RPC/RDMA module init - xprtrdma/transport.c 551/* RPC/RDMA module init - xprtrdma/transport.c
524 */ 552 */
@@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void);
534#if defined(CONFIG_SUNRPC_BACKCHANNEL) 562#if defined(CONFIG_SUNRPC_BACKCHANNEL)
535int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); 563int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
536int xprt_rdma_bc_up(struct svc_serv *, struct net *); 564int xprt_rdma_bc_up(struct svc_serv *, struct net *);
565size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
537int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); 566int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
538void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); 567void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
539int rpcrdma_bc_marshal_reply(struct rpc_rqst *); 568int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b90c5397b5e1..2d3e0c42361e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1364,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
1364 return ret; 1364 return ret;
1365 return 0; 1365 return 0;
1366} 1366}
1367
1368static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
1369{
1370 return PAGE_SIZE;
1371}
1367#else 1372#else
1368static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, 1373static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
1369 struct xdr_skb_reader *desc) 1374 struct xdr_skb_reader *desc)
@@ -2661,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
2661#ifdef CONFIG_SUNRPC_BACKCHANNEL 2666#ifdef CONFIG_SUNRPC_BACKCHANNEL
2662 .bc_setup = xprt_setup_bc, 2667 .bc_setup = xprt_setup_bc,
2663 .bc_up = xs_tcp_bc_up, 2668 .bc_up = xs_tcp_bc_up,
2669 .bc_maxpayload = xs_tcp_bc_maxpayload,
2664 .bc_free_rqst = xprt_free_bc_rqst, 2670 .bc_free_rqst = xprt_free_bc_rqst,
2665 .bc_destroy = xprt_destroy_bc, 2671 .bc_destroy = xprt_destroy_bc,
2666#endif 2672#endif
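The new .bc_maxpayload callback lets the generic RPC layer ask a transport how large a backchannel (callback) request it can accept: PAGE_SIZE for the TCP transport here, and a transport-specific value for RDMA via the xprt_rdma_bc_maxpayload() declaration added to xprt_rdma.h above. A hedged sketch of a consumer, assuming the rpc_max_bc_payload() helper this series appears to add to net/sunrpc/clnt.c; the NFS-side field names are illustrative:

/* Sketch: size the CREATE_SESSION back-channel attributes from what
 * the transport says it can actually deliver.
 */
static void example_size_backchannel(struct rpc_clnt *clnt,
				     struct nfs4_channel_attrs *bc_attrs)
{
	size_t max = rpc_max_bc_payload(clnt);	/* -> ops->bc_maxpayload */

	bc_attrs->max_rqst_sz = max;
	bc_attrs->max_resp_sz = max;
}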