author     Linus Torvalds <torvalds@linux-foundation.org>   2019-09-26 15:20:14 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-09-26 15:20:14 -0400
commit     972a2bf7dfe39ebf49dd47f68d27c416392e53b1 (patch)
tree       1fc6277f7b24c854b3c48a9e082b6625c18145a0
parent     7be3cb019db1cbd5fd5ffe6d64a23fefa4b6f229 (diff)
parent     a8fd0feeca35cb8f9ddd950191f4aeb777f52f89 (diff)
Merge tag 'nfs-for-5.4-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
Pull NFS client updates from Anna Schumaker:
 "Stable bugfixes:
   - Dequeue the request from the receive queue while we're re-encoding  # v4.20+
   - Fix buffer handling of GSS MIC without slack  # 5.1

  Features:
   - Increase xprtrdma maximum transport header and slot table sizes
   - Add support for nfs4_call_sync() calls using a custom rpc_task_struct
   - Optimize the default readahead size
   - Enable pNFS filelayout LAYOUTGET on OPEN

  Other bugfixes and cleanups:
   - Fix possible null-pointer dereferences and memory leaks
   - Various NFS over RDMA cleanups
   - Various NFS over RDMA comment updates
   - Don't receive TCP data into a reset request buffer
   - Don't try to parse incomplete RPC messages
   - Fix congestion window race with disconnect
   - Clean up pNFS return-on-close error handling
   - Fixes for NFS4ERR_OLD_STATEID handling"

* tag 'nfs-for-5.4-1' of git://git.linux-nfs.org/projects/anna/linux-nfs: (53 commits)
  pNFS/filelayout: enable LAYOUTGET on OPEN
  NFS: Optimise the default readahead size
  NFSv4: Handle NFS4ERR_OLD_STATEID in LOCKU
  NFSv4: Handle NFS4ERR_OLD_STATEID in CLOSE/OPEN_DOWNGRADE
  NFSv4: Fix OPEN_DOWNGRADE error handling
  pNFS: Handle NFS4ERR_OLD_STATEID on layoutreturn by bumping the state seqid
  NFSv4: Add a helper to increment stateid seqids
  NFSv4: Handle RPC level errors in LAYOUTRETURN
  NFSv4: Handle NFS4ERR_DELAY correctly in return-on-close
  NFSv4: Clean up pNFS return-on-close error handling
  pNFS: Ensure we do clear the return-on-close layout stateid on fatal errors
  NFS: remove unused check for negative dentry
  NFSv3: use nfs_add_or_obtain() to create and reference inodes
  NFS: Refactor nfs_instantiate() for dentry referencing callers
  SUNRPC: Fix congestion window race with disconnect
  SUNRPC: Don't try to parse incomplete RPC messages
  SUNRPC: Rename xdr_buf_read_netobj to xdr_buf_read_mic
  SUNRPC: Fix buffer handling of GSS MIC without slack
  SUNRPC: RPC level errors should always set task->tk_rpc_status
  SUNRPC: Don't receive TCP data into a request buffer that has been reset
  ...
-rw-r--r--  fs/nfs/dir.c                         41
-rw-r--r--  fs/nfs/filelayout/filelayout.c        1
-rw-r--r--  fs/nfs/internal.h                     8
-rw-r--r--  fs/nfs/nfs3proc.c                    45
-rw-r--r--  fs/nfs/nfs4_fs.h                     11
-rw-r--r--  fs/nfs/nfs4proc.c                   315
-rw-r--r--  fs/nfs/nfs4state.c                   22
-rw-r--r--  fs/nfs/nfs4xdr.c                      2
-rw-r--r--  fs/nfs/pnfs.c                        71
-rw-r--r--  fs/nfs/pnfs.h                        17
-rw-r--r--  fs/nfs/super.c                        9
-rw-r--r--  include/linux/nfs_fs.h                3
-rw-r--r--  include/linux/sunrpc/sched.h          3
-rw-r--r--  include/linux/sunrpc/xdr.h            2
-rw-r--r--  include/linux/sunrpc/xprt.h           1
-rw-r--r--  include/linux/sunrpc/xprtrdma.h       4
-rw-r--r--  include/trace/events/rpcrdma.h       88
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c        2
-rw-r--r--  net/sunrpc/clnt.c                    26
-rw-r--r--  net/sunrpc/sched.c                   32
-rw-r--r--  net/sunrpc/xdr.c                     65
-rw-r--r--  net/sunrpc/xprt.c                    61
-rw-r--r--  net/sunrpc/xprtrdma/backchannel.c     4
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c      166
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c       71
-rw-r--r--  net/sunrpc/xprtrdma/transport.c      15
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c         263
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h      59
-rw-r--r--  net/sunrpc/xprtsock.c                 8
29 files changed, 835 insertions, 580 deletions
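Before the patch itself, a brief note on the dentry-referencing refactor ("NFS: Refactor nfs_instantiate() for dentry referencing callers" and "NFSv3: use nfs_add_or_obtain() to create and reference inodes" in the shortlog above): the new exported helper returns the dentry produced by d_splice_alias(), so callers that need to operate on the hashed alias (the NFSv3 create/mkdir/mknod paths) can do so, while nfs_instantiate() keeps its old int-returning contract. A condensed sketch of that contract, distilled from the fs/nfs/dir.c hunks below (not a standalone compilable unit):

/* Returns NULL or a referenced alias dentry on success (the result of
 * d_splice_alias()), or an ERR_PTR() on failure; dput() the result when done.
 */
struct dentry *nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
				 struct nfs_fattr *fattr,
				 struct nfs4_label *label);

/* nfs_instantiate() becomes a thin wrapper for callers that do not care
 * which dentry ends up referencing the new inode.
 */
int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
		    struct nfs_fattr *fattr, struct nfs4_label *label)
{
	struct dentry *d = nfs_add_or_obtain(dentry, fhandle, fattr, label);

	if (IS_ERR(d))
		return PTR_ERR(d);
	dput(d);	/* dput(NULL) is a no-op */
	return 0;
}

The NFSv3 callers shown later in fs/nfs/nfs3proc.c follow the same pattern: if the returned alias is non-NULL they switch to it before the post-create setattr/ACL work, then dput() it.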
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0adfd8840110..e180033e35cf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1669,10 +1669,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 
 #endif /* CONFIG_NFSV4 */
 
-/*
- * Code common to create, mkdir, and mknod.
- */
-int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+struct dentry *
+nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
 		struct nfs_fattr *fattr,
 		struct nfs4_label *label)
 {
@@ -1680,13 +1678,10 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 	struct inode *dir = d_inode(parent);
 	struct inode *inode;
 	struct dentry *d;
-	int error = -EACCES;
+	int error;
 
 	d_drop(dentry);
 
-	/* We may have been initialized further down */
-	if (d_really_is_positive(dentry))
-		goto out;
 	if (fhandle->size == 0) {
 		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
 		if (error)
@@ -1702,18 +1697,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 	}
 	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
 	d = d_splice_alias(inode, dentry);
-	if (IS_ERR(d)) {
-		error = PTR_ERR(d);
-		goto out_error;
-	}
-	dput(d);
 out:
 	dput(parent);
-	return 0;
+	return d;
 out_error:
 	nfs_mark_for_revalidate(dir);
-	dput(parent);
-	return error;
+	d = ERR_PTR(error);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
+
+/*
+ * Code common to create, mkdir, and mknod.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *label)
+{
+	struct dentry *d;
+
+	d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
+
+	/* Callers don't care */
+	dput(d);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_instantiate);
 
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3cb073c50fa6..c9b605f6c9cb 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.id = LAYOUT_NFSV4_1_FILES,
 	.name = "LAYOUT_NFSV4_1_FILES",
 	.owner = THIS_MODULE,
+	.flags = PNFS_LAYOUTGET_ON_OPEN,
 	.max_layoutget_response = 4096, /* 1 page or so... */
 	.alloc_layout_hdr = filelayout_alloc_layout_hdr,
 	.free_layout_hdr = filelayout_free_layout_hdr,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e64f810223be..447a3c17fa8e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops;
 
 struct nfs_string;
 
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- * their needs. People that do NFS over a slow network, might for
- * instance want to reduce it to something closer to 1 for improved
- * interactive response.
- */
-#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
-
 static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
 {
 	if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a3ad2d46fd42..9eb2f1a503ab 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void)
279 return data; 279 return data;
280} 280}
281 281
282static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) 282static struct dentry *
283nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
283{ 284{
284 int status; 285 int status;
285 286
286 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); 287 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
287 nfs_post_op_update_inode(dir, data->res.dir_attr); 288 nfs_post_op_update_inode(dir, data->res.dir_attr);
288 if (status == 0) 289 if (status != 0)
289 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); 290 return ERR_PTR(status);
290 return status; 291
292 return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL);
291} 293}
292 294
293static void nfs3_free_createdata(struct nfs3_createdata *data) 295static void nfs3_free_createdata(struct nfs3_createdata *data)
@@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
304{ 306{
305 struct posix_acl *default_acl, *acl; 307 struct posix_acl *default_acl, *acl;
306 struct nfs3_createdata *data; 308 struct nfs3_createdata *data;
309 struct dentry *d_alias;
307 int status = -ENOMEM; 310 int status = -ENOMEM;
308 311
309 dprintk("NFS call create %pd\n", dentry); 312 dprintk("NFS call create %pd\n", dentry);
@@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
330 goto out; 333 goto out;
331 334
332 for (;;) { 335 for (;;) {
333 status = nfs3_do_create(dir, dentry, data); 336 d_alias = nfs3_do_create(dir, dentry, data);
337 status = PTR_ERR_OR_ZERO(d_alias);
334 338
335 if (status != -ENOTSUPP) 339 if (status != -ENOTSUPP)
336 break; 340 break;
@@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
355 if (status != 0) 359 if (status != 0)
356 goto out_release_acls; 360 goto out_release_acls;
357 361
362 if (d_alias)
363 dentry = d_alias;
364
358 /* When we created the file with exclusive semantics, make 365 /* When we created the file with exclusive semantics, make
359 * sure we set the attributes afterwards. */ 366 * sure we set the attributes afterwards. */
360 if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { 367 if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
372 nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); 379 nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
373 dprintk("NFS reply setattr (post-create): %d\n", status); 380 dprintk("NFS reply setattr (post-create): %d\n", status);
374 if (status != 0) 381 if (status != 0)
375 goto out_release_acls; 382 goto out_dput;
376 } 383 }
377 384
378 status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); 385 status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
379 386
387out_dput:
388 dput(d_alias);
380out_release_acls: 389out_release_acls:
381 posix_acl_release(acl); 390 posix_acl_release(acl);
382 posix_acl_release(default_acl); 391 posix_acl_release(default_acl);
@@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
504 unsigned int len, struct iattr *sattr) 513 unsigned int len, struct iattr *sattr)
505{ 514{
506 struct nfs3_createdata *data; 515 struct nfs3_createdata *data;
516 struct dentry *d_alias;
507 int status = -ENOMEM; 517 int status = -ENOMEM;
508 518
509 if (len > NFS3_MAXPATHLEN) 519 if (len > NFS3_MAXPATHLEN)
@@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
522 data->arg.symlink.pathlen = len; 532 data->arg.symlink.pathlen = len;
523 data->arg.symlink.sattr = sattr; 533 data->arg.symlink.sattr = sattr;
524 534
525 status = nfs3_do_create(dir, dentry, data); 535 d_alias = nfs3_do_create(dir, dentry, data);
536 status = PTR_ERR_OR_ZERO(d_alias);
537
538 if (status == 0)
539 dput(d_alias);
526 540
527 nfs3_free_createdata(data); 541 nfs3_free_createdata(data);
528out: 542out:
@@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
535{ 549{
536 struct posix_acl *default_acl, *acl; 550 struct posix_acl *default_acl, *acl;
537 struct nfs3_createdata *data; 551 struct nfs3_createdata *data;
552 struct dentry *d_alias;
538 int status = -ENOMEM; 553 int status = -ENOMEM;
539 554
540 dprintk("NFS call mkdir %pd\n", dentry); 555 dprintk("NFS call mkdir %pd\n", dentry);
@@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
553 data->arg.mkdir.len = dentry->d_name.len; 568 data->arg.mkdir.len = dentry->d_name.len;
554 data->arg.mkdir.sattr = sattr; 569 data->arg.mkdir.sattr = sattr;
555 570
556 status = nfs3_do_create(dir, dentry, data); 571 d_alias = nfs3_do_create(dir, dentry, data);
572 status = PTR_ERR_OR_ZERO(d_alias);
573
557 if (status != 0) 574 if (status != 0)
558 goto out_release_acls; 575 goto out_release_acls;
559 576
577 if (d_alias)
578 dentry = d_alias;
579
560 status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); 580 status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
561 581
582 dput(d_alias);
562out_release_acls: 583out_release_acls:
563 posix_acl_release(acl); 584 posix_acl_release(acl);
564 posix_acl_release(default_acl); 585 posix_acl_release(default_acl);
@@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
660{ 681{
661 struct posix_acl *default_acl, *acl; 682 struct posix_acl *default_acl, *acl;
662 struct nfs3_createdata *data; 683 struct nfs3_createdata *data;
684 struct dentry *d_alias;
663 int status = -ENOMEM; 685 int status = -ENOMEM;
664 686
665 dprintk("NFS call mknod %pd %u:%u\n", dentry, 687 dprintk("NFS call mknod %pd %u:%u\n", dentry,
@@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
698 goto out; 720 goto out;
699 } 721 }
700 722
701 status = nfs3_do_create(dir, dentry, data); 723 d_alias = nfs3_do_create(dir, dentry, data);
724 status = PTR_ERR_OR_ZERO(d_alias);
702 if (status != 0) 725 if (status != 0)
703 goto out_release_acls; 726 goto out_release_acls;
704 727
728 if (d_alias)
729 dentry = d_alias;
730
705 status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); 731 status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
706 732
733 dput(d_alias);
707out_release_acls: 734out_release_acls:
708 posix_acl_release(acl); 735 posix_acl_release(acl);
709 posix_acl_release(default_acl); 736 posix_acl_release(default_acl);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3564da1ba8a1..16b2e5cc3e94 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
491extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, 491extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
492 const struct nfs_lock_context *, nfs4_stateid *, 492 const struct nfs_lock_context *, nfs4_stateid *,
493 const struct cred **); 493 const struct cred **);
494extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst,
495 struct nfs4_state *state);
496extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, 494extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
497 struct nfs4_state *state); 495 struct nfs4_state *state);
498 496
@@ -574,6 +572,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat
574 return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; 572 return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
575} 573}
576 574
575static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
576{
577 u32 seqid = be32_to_cpu(s1->seqid);
578
579 if (++seqid == 0)
580 ++seqid;
581 s1->seqid = cpu_to_be32(seqid);
582}
583
577static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) 584static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
578{ 585{
579 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; 586 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1406858bae6c..11eafcfc490b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1073,14 +1073,26 @@ static const struct rpc_call_ops nfs40_call_sync_ops = {
1073 .rpc_call_done = nfs40_call_sync_done, 1073 .rpc_call_done = nfs40_call_sync_done,
1074}; 1074};
1075 1075
1076static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup)
1077{
1078 int ret;
1079 struct rpc_task *task;
1080
1081 task = rpc_run_task(task_setup);
1082 if (IS_ERR(task))
1083 return PTR_ERR(task);
1084
1085 ret = task->tk_status;
1086 rpc_put_task(task);
1087 return ret;
1088}
1089
1076static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, 1090static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
1077 struct nfs_server *server, 1091 struct nfs_server *server,
1078 struct rpc_message *msg, 1092 struct rpc_message *msg,
1079 struct nfs4_sequence_args *args, 1093 struct nfs4_sequence_args *args,
1080 struct nfs4_sequence_res *res) 1094 struct nfs4_sequence_res *res)
1081{ 1095{
1082 int ret;
1083 struct rpc_task *task;
1084 struct nfs_client *clp = server->nfs_client; 1096 struct nfs_client *clp = server->nfs_client;
1085 struct nfs4_call_sync_data data = { 1097 struct nfs4_call_sync_data data = {
1086 .seq_server = server, 1098 .seq_server = server,
@@ -1094,14 +1106,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
1094 .callback_data = &data 1106 .callback_data = &data
1095 }; 1107 };
1096 1108
1097 task = rpc_run_task(&task_setup); 1109 return nfs4_call_sync_custom(&task_setup);
1098 if (IS_ERR(task))
1099 ret = PTR_ERR(task);
1100 else {
1101 ret = task->tk_status;
1102 rpc_put_task(task);
1103 }
1104 return ret;
1105} 1110}
1106 1111
1107int nfs4_call_sync(struct rpc_clnt *clnt, 1112int nfs4_call_sync(struct rpc_clnt *clnt,
@@ -3308,6 +3313,75 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task)
3308 return pnfs_wait_on_layoutreturn(inode, task); 3313 return pnfs_wait_on_layoutreturn(inode, task);
3309} 3314}
3310 3315
3316/*
3317 * Update the seqid of an open stateid
3318 */
3319static void nfs4_sync_open_stateid(nfs4_stateid *dst,
3320 struct nfs4_state *state)
3321{
3322 __be32 seqid_open;
3323 u32 dst_seqid;
3324 int seq;
3325
3326 for (;;) {
3327 if (!nfs4_valid_open_stateid(state))
3328 break;
3329 seq = read_seqbegin(&state->seqlock);
3330 if (!nfs4_state_match_open_stateid_other(state, dst)) {
3331 nfs4_stateid_copy(dst, &state->open_stateid);
3332 if (read_seqretry(&state->seqlock, seq))
3333 continue;
3334 break;
3335 }
3336 seqid_open = state->open_stateid.seqid;
3337 if (read_seqretry(&state->seqlock, seq))
3338 continue;
3339
3340 dst_seqid = be32_to_cpu(dst->seqid);
3341 if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0)
3342 dst->seqid = seqid_open;
3343 break;
3344 }
3345}
3346
3347/*
3348 * Update the seqid of an open stateid after receiving
3349 * NFS4ERR_OLD_STATEID
3350 */
3351static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
3352 struct nfs4_state *state)
3353{
3354 __be32 seqid_open;
3355 u32 dst_seqid;
3356 bool ret;
3357 int seq;
3358
3359 for (;;) {
3360 ret = false;
3361 if (!nfs4_valid_open_stateid(state))
3362 break;
3363 seq = read_seqbegin(&state->seqlock);
3364 if (!nfs4_state_match_open_stateid_other(state, dst)) {
3365 if (read_seqretry(&state->seqlock, seq))
3366 continue;
3367 break;
3368 }
3369 seqid_open = state->open_stateid.seqid;
3370 if (read_seqretry(&state->seqlock, seq))
3371 continue;
3372
3373 dst_seqid = be32_to_cpu(dst->seqid);
3374 if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0)
3375 dst->seqid = cpu_to_be32(dst_seqid + 1);
3376 else
3377 dst->seqid = seqid_open;
3378 ret = true;
3379 break;
3380 }
3381
3382 return ret;
3383}
3384
3311struct nfs4_closedata { 3385struct nfs4_closedata {
3312 struct inode *inode; 3386 struct inode *inode;
3313 struct nfs4_state *state; 3387 struct nfs4_state *state;
@@ -3358,32 +3432,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
3358 trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); 3432 trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
3359 3433
3360 /* Handle Layoutreturn errors */ 3434 /* Handle Layoutreturn errors */
3361 if (calldata->arg.lr_args && task->tk_status != 0) { 3435 if (pnfs_roc_done(task, calldata->inode,
3362 switch (calldata->res.lr_ret) { 3436 &calldata->arg.lr_args,
3363 default: 3437 &calldata->res.lr_res,
3364 calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; 3438 &calldata->res.lr_ret) == -EAGAIN)
3365 break; 3439 goto out_restart;
3366 case 0:
3367 calldata->arg.lr_args = NULL;
3368 calldata->res.lr_res = NULL;
3369 break;
3370 case -NFS4ERR_OLD_STATEID:
3371 if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid,
3372 &calldata->arg.lr_args->range,
3373 calldata->inode))
3374 goto lr_restart;
3375 /* Fallthrough */
3376 case -NFS4ERR_ADMIN_REVOKED:
3377 case -NFS4ERR_DELEG_REVOKED:
3378 case -NFS4ERR_EXPIRED:
3379 case -NFS4ERR_BAD_STATEID:
3380 case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
3381 case -NFS4ERR_WRONG_CRED:
3382 calldata->arg.lr_args = NULL;
3383 calldata->res.lr_res = NULL;
3384 goto lr_restart;
3385 }
3386 }
3387 3440
3388 /* hmm. we are done with the inode, and in the process of freeing 3441 /* hmm. we are done with the inode, and in the process of freeing
3389 * the state_owner. we keep this around to process errors 3442 * the state_owner. we keep this around to process errors
@@ -3403,7 +3456,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
3403 break; 3456 break;
3404 case -NFS4ERR_OLD_STATEID: 3457 case -NFS4ERR_OLD_STATEID:
3405 /* Did we race with OPEN? */ 3458 /* Did we race with OPEN? */
3406 if (nfs4_refresh_open_stateid(&calldata->arg.stateid, 3459 if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid,
3407 state)) 3460 state))
3408 goto out_restart; 3461 goto out_restart;
3409 goto out_release; 3462 goto out_release;
@@ -3415,7 +3468,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
3415 task->tk_msg.rpc_cred); 3468 task->tk_msg.rpc_cred);
3416 /* Fallthrough */ 3469 /* Fallthrough */
3417 case -NFS4ERR_BAD_STATEID: 3470 case -NFS4ERR_BAD_STATEID:
3418 break; 3471 if (calldata->arg.fmode == 0)
3472 break;
3473 /* Fallthrough */
3419 default: 3474 default:
3420 task->tk_status = nfs4_async_handle_exception(task, 3475 task->tk_status = nfs4_async_handle_exception(task,
3421 server, task->tk_status, &exception); 3476 server, task->tk_status, &exception);
@@ -3430,8 +3485,6 @@ out_release:
3430 nfs_refresh_inode(calldata->inode, &calldata->fattr); 3485 nfs_refresh_inode(calldata->inode, &calldata->fattr);
3431 dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); 3486 dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
3432 return; 3487 return;
3433lr_restart:
3434 calldata->res.lr_ret = 0;
3435out_restart: 3488out_restart:
3436 task->tk_status = 0; 3489 task->tk_status = 0;
3437 rpc_restart_call_prepare(task); 3490 rpc_restart_call_prepare(task);
@@ -3472,8 +3525,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
3472 } else if (is_rdwr) 3525 } else if (is_rdwr)
3473 calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; 3526 calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
3474 3527
3475 if (!nfs4_valid_open_stateid(state) || 3528 nfs4_sync_open_stateid(&calldata->arg.stateid, state);
3476 !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) 3529 if (!nfs4_valid_open_stateid(state))
3477 call_close = 0; 3530 call_close = 0;
3478 spin_unlock(&state->owner->so_lock); 3531 spin_unlock(&state->owner->so_lock);
3479 3532
@@ -6018,7 +6071,6 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
6018 .rpc_resp = res, 6071 .rpc_resp = res,
6019 .rpc_cred = cred, 6072 .rpc_cred = cred,
6020 }; 6073 };
6021 struct rpc_task *task;
6022 struct rpc_task_setup task_setup_data = { 6074 struct rpc_task_setup task_setup_data = {
6023 .rpc_client = clp->cl_rpcclient, 6075 .rpc_client = clp->cl_rpcclient,
6024 .rpc_message = &msg, 6076 .rpc_message = &msg,
@@ -6051,17 +6103,12 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
6051 dprintk("NFS call setclientid auth=%s, '%s'\n", 6103 dprintk("NFS call setclientid auth=%s, '%s'\n",
6052 clp->cl_rpcclient->cl_auth->au_ops->au_name, 6104 clp->cl_rpcclient->cl_auth->au_ops->au_name,
6053 clp->cl_owner_id); 6105 clp->cl_owner_id);
6054 task = rpc_run_task(&task_setup_data); 6106
6055 if (IS_ERR(task)) { 6107 status = nfs4_call_sync_custom(&task_setup_data);
6056 status = PTR_ERR(task);
6057 goto out;
6058 }
6059 status = task->tk_status;
6060 if (setclientid.sc_cred) { 6108 if (setclientid.sc_cred) {
6061 clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); 6109 clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
6062 put_rpccred(setclientid.sc_cred); 6110 put_rpccred(setclientid.sc_cred);
6063 } 6111 }
6064 rpc_put_task(task);
6065out: 6112out:
6066 trace_nfs4_setclientid(clp, status); 6113 trace_nfs4_setclientid(clp, status);
6067 dprintk("NFS reply setclientid: %d\n", status); 6114 dprintk("NFS reply setclientid: %d\n", status);
@@ -6129,32 +6176,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
6129 trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); 6176 trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
6130 6177
6131 /* Handle Layoutreturn errors */ 6178 /* Handle Layoutreturn errors */
6132 if (data->args.lr_args && task->tk_status != 0) { 6179 if (pnfs_roc_done(task, data->inode,
6133 switch(data->res.lr_ret) { 6180 &data->args.lr_args,
6134 default: 6181 &data->res.lr_res,
6135 data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; 6182 &data->res.lr_ret) == -EAGAIN)
6136 break; 6183 goto out_restart;
6137 case 0:
6138 data->args.lr_args = NULL;
6139 data->res.lr_res = NULL;
6140 break;
6141 case -NFS4ERR_OLD_STATEID:
6142 if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid,
6143 &data->args.lr_args->range,
6144 data->inode))
6145 goto lr_restart;
6146 /* Fallthrough */
6147 case -NFS4ERR_ADMIN_REVOKED:
6148 case -NFS4ERR_DELEG_REVOKED:
6149 case -NFS4ERR_EXPIRED:
6150 case -NFS4ERR_BAD_STATEID:
6151 case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
6152 case -NFS4ERR_WRONG_CRED:
6153 data->args.lr_args = NULL;
6154 data->res.lr_res = NULL;
6155 goto lr_restart;
6156 }
6157 }
6158 6184
6159 switch (task->tk_status) { 6185 switch (task->tk_status) {
6160 case 0: 6186 case 0:
@@ -6192,8 +6218,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
6192 } 6218 }
6193 data->rpc_status = task->tk_status; 6219 data->rpc_status = task->tk_status;
6194 return; 6220 return;
6195lr_restart:
6196 data->res.lr_ret = 0;
6197out_restart: 6221out_restart:
6198 task->tk_status = 0; 6222 task->tk_status = 0;
6199 rpc_restart_call_prepare(task); 6223 rpc_restart_call_prepare(task);
@@ -6386,6 +6410,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
6386 return err; 6410 return err;
6387} 6411}
6388 6412
6413/*
6414 * Update the seqid of a lock stateid after receiving
6415 * NFS4ERR_OLD_STATEID
6416 */
6417static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst,
6418 struct nfs4_lock_state *lsp)
6419{
6420 struct nfs4_state *state = lsp->ls_state;
6421 bool ret = false;
6422
6423 spin_lock(&state->state_lock);
6424 if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid))
6425 goto out;
6426 if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst))
6427 nfs4_stateid_seqid_inc(dst);
6428 else
6429 dst->seqid = lsp->ls_stateid.seqid;
6430 ret = true;
6431out:
6432 spin_unlock(&state->state_lock);
6433 return ret;
6434}
6435
6436static bool nfs4_sync_lock_stateid(nfs4_stateid *dst,
6437 struct nfs4_lock_state *lsp)
6438{
6439 struct nfs4_state *state = lsp->ls_state;
6440 bool ret;
6441
6442 spin_lock(&state->state_lock);
6443 ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid);
6444 nfs4_stateid_copy(dst, &lsp->ls_stateid);
6445 spin_unlock(&state->state_lock);
6446 return ret;
6447}
6448
6389struct nfs4_unlockdata { 6449struct nfs4_unlockdata {
6390 struct nfs_locku_args arg; 6450 struct nfs_locku_args arg;
6391 struct nfs_locku_res res; 6451 struct nfs_locku_res res;
@@ -6403,7 +6463,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
6403 struct nfs_seqid *seqid) 6463 struct nfs_seqid *seqid)
6404{ 6464{
6405 struct nfs4_unlockdata *p; 6465 struct nfs4_unlockdata *p;
6406 struct inode *inode = lsp->ls_state->inode; 6466 struct nfs4_state *state = lsp->ls_state;
6467 struct inode *inode = state->inode;
6407 6468
6408 p = kzalloc(sizeof(*p), GFP_NOFS); 6469 p = kzalloc(sizeof(*p), GFP_NOFS);
6409 if (p == NULL) 6470 if (p == NULL)
@@ -6419,6 +6480,9 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
6419 locks_init_lock(&p->fl); 6480 locks_init_lock(&p->fl);
6420 locks_copy_lock(&p->fl, fl); 6481 locks_copy_lock(&p->fl, fl);
6421 p->server = NFS_SERVER(inode); 6482 p->server = NFS_SERVER(inode);
6483 spin_lock(&state->state_lock);
6484 nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid);
6485 spin_unlock(&state->state_lock);
6422 return p; 6486 return p;
6423} 6487}
6424 6488
@@ -6457,10 +6521,14 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
6457 task->tk_msg.rpc_cred); 6521 task->tk_msg.rpc_cred);
6458 /* Fall through */ 6522 /* Fall through */
6459 case -NFS4ERR_BAD_STATEID: 6523 case -NFS4ERR_BAD_STATEID:
6460 case -NFS4ERR_OLD_STATEID:
6461 case -NFS4ERR_STALE_STATEID: 6524 case -NFS4ERR_STALE_STATEID:
6462 if (!nfs4_stateid_match(&calldata->arg.stateid, 6525 if (nfs4_sync_lock_stateid(&calldata->arg.stateid,
6463 &calldata->lsp->ls_stateid)) 6526 calldata->lsp))
6527 rpc_restart_call_prepare(task);
6528 break;
6529 case -NFS4ERR_OLD_STATEID:
6530 if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid,
6531 calldata->lsp))
6464 rpc_restart_call_prepare(task); 6532 rpc_restart_call_prepare(task);
6465 break; 6533 break;
6466 default: 6534 default:
@@ -6483,7 +6551,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
6483 6551
6484 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 6552 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
6485 goto out_wait; 6553 goto out_wait;
6486 nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
6487 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { 6554 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
6488 /* Note: exit _without_ running nfs4_locku_done */ 6555 /* Note: exit _without_ running nfs4_locku_done */
6489 goto out_no_action; 6556 goto out_no_action;
@@ -7645,6 +7712,8 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
7645static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) 7712static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
7646{ 7713{
7647 int status; 7714 int status;
7715 struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
7716 struct nfs_client *clp = NFS_SERVER(dir)->nfs_client;
7648 struct nfs4_secinfo_arg args = { 7717 struct nfs4_secinfo_arg args = {
7649 .dir_fh = NFS_FH(dir), 7718 .dir_fh = NFS_FH(dir),
7650 .name = name, 7719 .name = name,
@@ -7657,26 +7726,37 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
7657 .rpc_argp = &args, 7726 .rpc_argp = &args,
7658 .rpc_resp = &res, 7727 .rpc_resp = &res,
7659 }; 7728 };
7660 struct rpc_clnt *clnt = NFS_SERVER(dir)->client; 7729 struct nfs4_call_sync_data data = {
7730 .seq_server = NFS_SERVER(dir),
7731 .seq_args = &args.seq_args,
7732 .seq_res = &res.seq_res,
7733 };
7734 struct rpc_task_setup task_setup = {
7735 .rpc_client = clnt,
7736 .rpc_message = &msg,
7737 .callback_ops = clp->cl_mvops->call_sync_ops,
7738 .callback_data = &data,
7739 .flags = RPC_TASK_NO_ROUND_ROBIN,
7740 };
7661 const struct cred *cred = NULL; 7741 const struct cred *cred = NULL;
7662 7742
7663 if (use_integrity) { 7743 if (use_integrity) {
7664 clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; 7744 clnt = clp->cl_rpcclient;
7665 cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); 7745 task_setup.rpc_client = clnt;
7746
7747 cred = nfs4_get_clid_cred(clp);
7666 msg.rpc_cred = cred; 7748 msg.rpc_cred = cred;
7667 } 7749 }
7668 7750
7669 dprintk("NFS call secinfo %s\n", name->name); 7751 dprintk("NFS call secinfo %s\n", name->name);
7670 7752
7671 nfs4_state_protect(NFS_SERVER(dir)->nfs_client, 7753 nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
7672 NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); 7754 nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
7755 status = nfs4_call_sync_custom(&task_setup);
7673 7756
7674 status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
7675 &res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
7676 dprintk("NFS reply secinfo: %d\n", status); 7757 dprintk("NFS reply secinfo: %d\n", status);
7677 7758
7678 put_cred(cred); 7759 put_cred(cred);
7679
7680 return status; 7760 return status;
7681} 7761}
7682 7762
@@ -8344,7 +8424,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = {
8344 8424
8345int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) 8425int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
8346{ 8426{
8347 struct rpc_task *task;
8348 struct nfs4_get_lease_time_args args; 8427 struct nfs4_get_lease_time_args args;
8349 struct nfs4_get_lease_time_res res = { 8428 struct nfs4_get_lease_time_res res = {
8350 .lr_fsinfo = fsinfo, 8429 .lr_fsinfo = fsinfo,
@@ -8366,17 +8445,9 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
8366 .callback_data = &data, 8445 .callback_data = &data,
8367 .flags = RPC_TASK_TIMEOUT, 8446 .flags = RPC_TASK_TIMEOUT,
8368 }; 8447 };
8369 int status;
8370 8448
8371 nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); 8449 nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1);
8372 task = rpc_run_task(&task_setup); 8450 return nfs4_call_sync_custom(&task_setup);
8373
8374 if (IS_ERR(task))
8375 return PTR_ERR(task);
8376
8377 status = task->tk_status;
8378 rpc_put_task(task);
8379 return status;
8380} 8451}
8381 8452
8382#ifdef CONFIG_NFS_V4_1 8453#ifdef CONFIG_NFS_V4_1
@@ -8845,7 +8916,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
8845 const struct cred *cred) 8916 const struct cred *cred)
8846{ 8917{
8847 struct nfs4_reclaim_complete_data *calldata; 8918 struct nfs4_reclaim_complete_data *calldata;
8848 struct rpc_task *task;
8849 struct rpc_message msg = { 8919 struct rpc_message msg = {
8850 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], 8920 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
8851 .rpc_cred = cred, 8921 .rpc_cred = cred,
@@ -8854,7 +8924,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
8854 .rpc_client = clp->cl_rpcclient, 8924 .rpc_client = clp->cl_rpcclient,
8855 .rpc_message = &msg, 8925 .rpc_message = &msg,
8856 .callback_ops = &nfs4_reclaim_complete_call_ops, 8926 .callback_ops = &nfs4_reclaim_complete_call_ops,
8857 .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, 8927 .flags = RPC_TASK_NO_ROUND_ROBIN,
8858 }; 8928 };
8859 int status = -ENOMEM; 8929 int status = -ENOMEM;
8860 8930
@@ -8869,15 +8939,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
8869 msg.rpc_argp = &calldata->arg; 8939 msg.rpc_argp = &calldata->arg;
8870 msg.rpc_resp = &calldata->res; 8940 msg.rpc_resp = &calldata->res;
8871 task_setup_data.callback_data = calldata; 8941 task_setup_data.callback_data = calldata;
8872 task = rpc_run_task(&task_setup_data); 8942 status = nfs4_call_sync_custom(&task_setup_data);
8873 if (IS_ERR(task)) {
8874 status = PTR_ERR(task);
8875 goto out;
8876 }
8877 status = rpc_wait_for_completion_task(task);
8878 if (status == 0)
8879 status = task->tk_status;
8880 rpc_put_task(task);
8881out: 8943out:
8882 dprintk("<-- %s status=%d\n", __func__, status); 8944 dprintk("<-- %s status=%d\n", __func__, status);
8883 return status; 8945 return status;
@@ -9103,10 +9165,19 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
9103 if (!nfs41_sequence_process(task, &lrp->res.seq_res)) 9165 if (!nfs41_sequence_process(task, &lrp->res.seq_res))
9104 return; 9166 return;
9105 9167
9168 /*
9169 * Was there an RPC level error? Assume the call succeeded,
9170 * and that we need to release the layout
9171 */
9172 if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) {
9173 lrp->res.lrs_present = 0;
9174 return;
9175 }
9176
9106 server = NFS_SERVER(lrp->args.inode); 9177 server = NFS_SERVER(lrp->args.inode);
9107 switch (task->tk_status) { 9178 switch (task->tk_status) {
9108 case -NFS4ERR_OLD_STATEID: 9179 case -NFS4ERR_OLD_STATEID:
9109 if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid, 9180 if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid,
9110 &lrp->args.range, 9181 &lrp->args.range,
9111 lrp->args.inode)) 9182 lrp->args.inode))
9112 goto out_restart; 9183 goto out_restart;
@@ -9362,18 +9433,32 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
9362 .rpc_resp = &res, 9433 .rpc_resp = &res,
9363 }; 9434 };
9364 struct rpc_clnt *clnt = server->client; 9435 struct rpc_clnt *clnt = server->client;
9436 struct nfs4_call_sync_data data = {
9437 .seq_server = server,
9438 .seq_args = &args.seq_args,
9439 .seq_res = &res.seq_res,
9440 };
9441 struct rpc_task_setup task_setup = {
9442 .rpc_client = server->client,
9443 .rpc_message = &msg,
9444 .callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
9445 .callback_data = &data,
9446 .flags = RPC_TASK_NO_ROUND_ROBIN,
9447 };
9365 const struct cred *cred = NULL; 9448 const struct cred *cred = NULL;
9366 int status; 9449 int status;
9367 9450
9368 if (use_integrity) { 9451 if (use_integrity) {
9369 clnt = server->nfs_client->cl_rpcclient; 9452 clnt = server->nfs_client->cl_rpcclient;
9453 task_setup.rpc_client = clnt;
9454
9370 cred = nfs4_get_clid_cred(server->nfs_client); 9455 cred = nfs4_get_clid_cred(server->nfs_client);
9371 msg.rpc_cred = cred; 9456 msg.rpc_cred = cred;
9372 } 9457 }
9373 9458
9374 dprintk("--> %s\n", __func__); 9459 dprintk("--> %s\n", __func__);
9375 status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, 9460 nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
9376 &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); 9461 status = nfs4_call_sync_custom(&task_setup);
9377 dprintk("<-- %s status=%d\n", __func__, status); 9462 dprintk("<-- %s status=%d\n", __func__, status);
9378 9463
9379 put_cred(cred); 9464 put_cred(cred);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cad4e064b328..0c6d53dc3672 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1015,22 +1015,6 @@ out:
 	return ret;
 }
 
-bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
-{
-	bool ret;
-	int seq;
-
-	do {
-		ret = false;
-		seq = read_seqbegin(&state->seqlock);
-		if (nfs4_state_match_open_stateid_other(state, dst)) {
-			dst->seqid = state->open_stateid.seqid;
-			ret = true;
-		}
-	} while (read_seqretry(&state->seqlock, seq));
-	return ret;
-}
-
 bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 {
 	bool ret;
@@ -2095,8 +2079,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
 	}
 
 	status = nfs4_begin_drain_session(clp);
-	if (status != 0)
-		return status;
+	if (status != 0) {
+		result = status;
+		goto out;
+	}
 
 	status = nfs4_replace_transport(server, locations);
 	if (status != 0) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 46a8d636d151..ab07db0f07cd 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
 		} else
 			*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
-	if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+	if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
 		*p++ = cpu_to_be32(label->lfs);
 		*p++ = cpu_to_be32(label->pi);
 		*p++ = cpu_to_be32(label->len);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4525d5acae38..bb80034a7661 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
359} 359}
360 360
361/* 361/*
362 * Update the seqid of a layout stateid 362 * Update the seqid of a layout stateid after receiving
363 * NFS4ERR_OLD_STATEID
363 */ 364 */
364bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, 365bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
365 struct pnfs_layout_range *dst_range, 366 struct pnfs_layout_range *dst_range,
366 struct inode *inode) 367 struct inode *inode)
367{ 368{
@@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
377 378
378 spin_lock(&inode->i_lock); 379 spin_lock(&inode->i_lock);
379 lo = NFS_I(inode)->layout; 380 lo = NFS_I(inode)->layout;
380 if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { 381 if (lo && pnfs_layout_is_valid(lo) &&
382 nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
383 /* Is our call using the most recent seqid? If so, bump it */
384 if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
385 nfs4_stateid_seqid_inc(dst);
386 ret = true;
387 goto out;
388 }
389 /* Try to update the seqid to the most recent */
381 err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); 390 err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
382 if (err != -EBUSY) { 391 if (err != -EBUSY) {
383 dst->seqid = lo->plh_stateid.seqid; 392 dst->seqid = lo->plh_stateid.seqid;
@@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
385 ret = true; 394 ret = true;
386 } 395 }
387 } 396 }
397out:
388 spin_unlock(&inode->i_lock); 398 spin_unlock(&inode->i_lock);
389 pnfs_free_lseg_list(&head); 399 pnfs_free_lseg_list(&head);
390 return ret; 400 return ret;
@@ -1440,6 +1450,52 @@ out_noroc:
1440 return false; 1450 return false;
1441} 1451}
1442 1452
1453int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
1454 struct nfs4_layoutreturn_args **argpp,
1455 struct nfs4_layoutreturn_res **respp,
1456 int *ret)
1457{
1458 struct nfs4_layoutreturn_args *arg = *argpp;
1459 int retval = -EAGAIN;
1460
1461 if (!arg)
1462 return 0;
1463 /* Handle Layoutreturn errors */
1464 switch (*ret) {
1465 case 0:
1466 retval = 0;
1467 break;
1468 case -NFS4ERR_NOMATCHING_LAYOUT:
1469 /* Was there an RPC level error? If not, retry */
1470 if (task->tk_rpc_status == 0)
1471 break;
1472 /* If the call was not sent, let caller handle it */
1473 if (!RPC_WAS_SENT(task))
1474 return 0;
1475 /*
1476 * Otherwise, assume the call succeeded and
1477 * that we need to release the layout
1478 */
1479 *ret = 0;
1480 (*respp)->lrs_present = 0;
1481 retval = 0;
1482 break;
1483 case -NFS4ERR_DELAY:
1484 /* Let the caller handle the retry */
1485 *ret = -NFS4ERR_NOMATCHING_LAYOUT;
1486 return 0;
1487 case -NFS4ERR_OLD_STATEID:
1488 if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
1489 &arg->range, inode))
1490 break;
1491 *ret = -NFS4ERR_NOMATCHING_LAYOUT;
1492 return -EAGAIN;
1493 }
1494 *argpp = NULL;
1495 *respp = NULL;
1496 return retval;
1497}
1498
1443void pnfs_roc_release(struct nfs4_layoutreturn_args *args, 1499void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
1444 struct nfs4_layoutreturn_res *res, 1500 struct nfs4_layoutreturn_res *res,
1445 int ret) 1501 int ret)
@@ -1449,10 +1505,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
1449 const nfs4_stateid *res_stateid = NULL; 1505 const nfs4_stateid *res_stateid = NULL;
1450 struct nfs4_xdr_opaque_data *ld_private = args->ld_private; 1506 struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
1451 1507
1452 if (ret == 0) { 1508 switch (ret) {
1453 arg_stateid = &args->stateid; 1509 case -NFS4ERR_NOMATCHING_LAYOUT:
1510 break;
1511 case 0:
1454 if (res->lrs_present) 1512 if (res->lrs_present)
1455 res_stateid = &res->stateid; 1513 res_stateid = &res->stateid;
1514 /* Fallthrough */
1515 default:
1516 arg_stateid = &args->stateid;
1456 } 1517 }
1457 pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, 1518 pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
1458 res_stateid); 1519 res_stateid);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f15609c003d8..f8a38065c7e4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
261 bool is_recall); 261 bool is_recall);
262int pnfs_destroy_layouts_byclid(struct nfs_client *clp, 262int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
263 bool is_recall); 263 bool is_recall);
264bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, 264bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
265 struct pnfs_layout_range *dst_range, 265 struct pnfs_layout_range *dst_range,
266 struct inode *inode); 266 struct inode *inode);
267void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); 267void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino,
282 struct nfs4_layoutreturn_args *args, 282 struct nfs4_layoutreturn_args *args,
283 struct nfs4_layoutreturn_res *res, 283 struct nfs4_layoutreturn_res *res,
284 const struct cred *cred); 284 const struct cred *cred);
285int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
286 struct nfs4_layoutreturn_args **argpp,
287 struct nfs4_layoutreturn_res **respp,
288 int *ret);
285void pnfs_roc_release(struct nfs4_layoutreturn_args *args, 289void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
286 struct nfs4_layoutreturn_res *res, 290 struct nfs4_layoutreturn_res *res,
287 int ret); 291 int ret);
@@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino,
701 return false; 705 return false;
702} 706}
703 707
708static inline int
709pnfs_roc_done(struct rpc_task *task, struct inode *inode,
710 struct nfs4_layoutreturn_args **argpp,
711 struct nfs4_layoutreturn_res **respp,
712 int *ret)
713{
714 return 0;
715}
716
704static inline void 717static inline void
705pnfs_roc_release(struct nfs4_layoutreturn_args *args, 718pnfs_roc_release(struct nfs4_layoutreturn_args *args,
706 struct nfs4_layoutreturn_res *res, 719 struct nfs4_layoutreturn_res *res,
@@ -785,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
785{ 798{
786} 799}
787 800
788static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, 801static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
789 struct pnfs_layout_range *dst_range, 802 struct pnfs_layout_range *dst_range,
790 struct inode *inode) 803 struct inode *inode)
791{ 804{
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 19a76cfa8b1f..a84df7d63403 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2645,6 +2645,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
 }
 EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
 
+static void nfs_set_readahead(struct backing_dev_info *bdi,
+			      unsigned long iomax_pages)
+{
+	bdi->ra_pages = VM_READAHEAD_PAGES;
+	bdi->io_pages = iomax_pages;
+}
+
 struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 				   int flags, const char *dev_name,
 				   struct nfs_mount_info *mount_info,
@@ -2687,7 +2694,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 			mntroot = ERR_PTR(error);
 			goto error_splat_super;
 		}
-		s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+		nfs_set_readahead(s->s_bdi, server->rpages);
 		server->super = s;
 	}
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0a11712a80e3..570a60c2f4f4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -490,6 +490,9 @@ extern const struct file_operations nfs_dir_operations;
 extern const struct dentry_operations nfs_dentry_operations;
 
 extern void nfs_force_lookup_revalidate(struct inode *dir);
+extern struct dentry *nfs_add_or_obtain(struct dentry *dentry,
+			struct nfs_fh *fh, struct nfs_fattr *fattr,
+			struct nfs4_label *label);
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
 			struct nfs_fattr *fattr, struct nfs4_label *label);
 extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 27536b961552..a6ef35184ef1 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -242,9 +242,6 @@ void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *queue,
 void		rpc_sleep_on_priority(struct rpc_wait_queue *,
 					struct rpc_task *,
 					int priority);
-void		rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
-					struct rpc_wait_queue *queue,
-					struct rpc_task *task);
 void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
 					struct rpc_task *);
 void		rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *,
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 8a87d8bcb197..f33e5013bdfb 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -186,7 +186,7 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p)
 extern void xdr_shift_buf(struct xdr_buf *, size_t);
 extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
 extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
-extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int);
+extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int);
 extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
 extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
 
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 13e108bcc9eb..d783e15ba898 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -352,6 +352,7 @@ bool xprt_prepare_transmit(struct rpc_task *task);
 void		xprt_request_enqueue_transmit(struct rpc_task *task);
 void		xprt_request_enqueue_receive(struct rpc_task *task);
 void		xprt_request_wait_receive(struct rpc_task *task);
+void		xprt_request_dequeue_xprt(struct rpc_task *task);
 bool		xprt_request_need_retransmit(struct rpc_task *task);
 void		xprt_transmit(struct rpc_task *task);
 void		xprt_end_transmit(struct rpc_task *task);
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 86fc38ff0355..16c239e0d6dd 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -49,9 +49,9 @@
  * fully-chunked NFS message (read chunks are the largest). Note only
  * a single chunk type per message is supported currently.
  */
-#define RPCRDMA_MIN_SLOT_TABLE	(2U)
+#define RPCRDMA_MIN_SLOT_TABLE	(4U)
 #define RPCRDMA_DEF_SLOT_TABLE	(128U)
-#define RPCRDMA_MAX_SLOT_TABLE	(256U)
+#define RPCRDMA_MAX_SLOT_TABLE	(16384U)
 
 #define RPCRDMA_MIN_INLINE  (1024)	/* min inline thresh */
 #define RPCRDMA_DEF_INLINE  (4096)	/* default inline thresh */
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index f6a4eaa85a3e..a13830616107 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -451,20 +451,81 @@ TRACE_EVENT(xprtrdma_createmrs,
451 451
452 TP_STRUCT__entry( 452 TP_STRUCT__entry(
453 __field(const void *, r_xprt) 453 __field(const void *, r_xprt)
454 __string(addr, rpcrdma_addrstr(r_xprt))
455 __string(port, rpcrdma_portstr(r_xprt))
454 __field(unsigned int, count) 456 __field(unsigned int, count)
455 ), 457 ),
456 458
457 TP_fast_assign( 459 TP_fast_assign(
458 __entry->r_xprt = r_xprt; 460 __entry->r_xprt = r_xprt;
459 __entry->count = count; 461 __entry->count = count;
462 __assign_str(addr, rpcrdma_addrstr(r_xprt));
463 __assign_str(port, rpcrdma_portstr(r_xprt));
460 ), 464 ),
461 465
462 TP_printk("r_xprt=%p: created %u MRs", 466 TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs",
463 __entry->r_xprt, __entry->count 467 __get_str(addr), __get_str(port), __entry->r_xprt,
468 __entry->count
464 ) 469 )
465); 470);
466 471
467DEFINE_RXPRT_EVENT(xprtrdma_nomrs); 472TRACE_EVENT(xprtrdma_mr_get,
473 TP_PROTO(
474 const struct rpcrdma_req *req
475 ),
476
477 TP_ARGS(req),
478
479 TP_STRUCT__entry(
480 __field(const void *, req)
481 __field(unsigned int, task_id)
482 __field(unsigned int, client_id)
483 __field(u32, xid)
484 ),
485
486 TP_fast_assign(
487 const struct rpc_rqst *rqst = &req->rl_slot;
488
489 __entry->req = req;
490 __entry->task_id = rqst->rq_task->tk_pid;
491 __entry->client_id = rqst->rq_task->tk_client->cl_clid;
492 __entry->xid = be32_to_cpu(rqst->rq_xid);
493 ),
494
495 TP_printk("task:%u@%u xid=0x%08x req=%p",
496 __entry->task_id, __entry->client_id, __entry->xid,
497 __entry->req
498 )
499);
500
501TRACE_EVENT(xprtrdma_nomrs,
502 TP_PROTO(
503 const struct rpcrdma_req *req
504 ),
505
506 TP_ARGS(req),
507
508 TP_STRUCT__entry(
509 __field(const void *, req)
510 __field(unsigned int, task_id)
511 __field(unsigned int, client_id)
512 __field(u32, xid)
513 ),
514
515 TP_fast_assign(
516 const struct rpc_rqst *rqst = &req->rl_slot;
517
518 __entry->req = req;
519 __entry->task_id = rqst->rq_task->tk_pid;
520 __entry->client_id = rqst->rq_task->tk_client->cl_clid;
521 __entry->xid = be32_to_cpu(rqst->rq_xid);
522 ),
523
524 TP_printk("task:%u@%u xid=0x%08x req=%p",
525 __entry->task_id, __entry->client_id, __entry->xid,
526 __entry->req
527 )
528);
468 529
469DEFINE_RDCH_EVENT(read); 530DEFINE_RDCH_EVENT(read);
470DEFINE_WRCH_EVENT(write); 531DEFINE_WRCH_EVENT(write);
@@ -623,21 +684,21 @@ TRACE_EVENT(xprtrdma_post_send,
623 684
624TRACE_EVENT(xprtrdma_post_recv, 685TRACE_EVENT(xprtrdma_post_recv,
625 TP_PROTO( 686 TP_PROTO(
626 const struct ib_cqe *cqe 687 const struct rpcrdma_rep *rep
627 ), 688 ),
628 689
629 TP_ARGS(cqe), 690 TP_ARGS(rep),
630 691
631 TP_STRUCT__entry( 692 TP_STRUCT__entry(
632 __field(const void *, cqe) 693 __field(const void *, rep)
633 ), 694 ),
634 695
635 TP_fast_assign( 696 TP_fast_assign(
636 __entry->cqe = cqe; 697 __entry->rep = rep;
637 ), 698 ),
638 699
639 TP_printk("cqe=%p", 700 TP_printk("rep=%p",
640 __entry->cqe 701 __entry->rep
641 ) 702 )
642); 703);
643 704
@@ -715,14 +776,15 @@ TRACE_EVENT(xprtrdma_wc_receive,
715 TP_ARGS(wc), 776 TP_ARGS(wc),
716 777
717 TP_STRUCT__entry( 778 TP_STRUCT__entry(
718 __field(const void *, cqe) 779 __field(const void *, rep)
719 __field(u32, byte_len) 780 __field(u32, byte_len)
720 __field(unsigned int, status) 781 __field(unsigned int, status)
721 __field(u32, vendor_err) 782 __field(u32, vendor_err)
722 ), 783 ),
723 784
724 TP_fast_assign( 785 TP_fast_assign(
725 __entry->cqe = wc->wr_cqe; 786 __entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep,
787 rr_cqe);
726 __entry->status = wc->status; 788 __entry->status = wc->status;
727 if (wc->status) { 789 if (wc->status) {
728 __entry->byte_len = 0; 790 __entry->byte_len = 0;
@@ -733,8 +795,8 @@ TRACE_EVENT(xprtrdma_wc_receive,
733 } 795 }
734 ), 796 ),
735 797
736 TP_printk("cqe=%p %u bytes: %s (%u/0x%x)", 798 TP_printk("rep=%p %u bytes: %s (%u/0x%x)",
737 __entry->cqe, __entry->byte_len, 799 __entry->rep, __entry->byte_len,
738 rdma_show_wc_status(__entry->status), 800 rdma_show_wc_status(__entry->status),
739 __entry->status, __entry->vendor_err 801 __entry->status, __entry->vendor_err
740 ) 802 )
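
As a reading aid for the two trace-point changes above: the receive completion handler can no longer usefully report just the raw ib_cqe, so it recovers the enclosing rpcrdma_rep with container_of(). A minimal illustrative sketch follows (not the kernel's actual receive handler; the field name rr_cqe is taken from this diff):

static void example_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
	/* Each rpcrdma_rep embeds its ib_cqe as rr_cqe; the CQ hands
	 * back only the ib_cqe pointer, so container_of() walks back
	 * to the rep, exactly as the updated trace point does.
	 */
	struct rpcrdma_rep *rep = container_of(wc->wr_cqe,
					       struct rpcrdma_rep, rr_cqe);

	trace_xprtrdma_wc_receive(wc);		/* now prints rep=%p */
	if (wc->status != IB_WC_SUCCESS)
		return;				/* flushed Receive */
	/* ... pass rep along to the reply handler ... */
}
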
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 4ce42c62458e..d75fddca44c9 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1960,7 +1960,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
1960 1960
1961 if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) 1961 if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
1962 goto unwrap_failed; 1962 goto unwrap_failed;
1963 if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) 1963 if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
1964 goto unwrap_failed; 1964 goto unwrap_failed;
1965 maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); 1965 maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
1966 if (maj_stat == GSS_S_CONTEXT_EXPIRED) 1966 if (maj_stat == GSS_S_CONTEXT_EXPIRED)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index a07b516e503a..f7f78566be46 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1837,7 +1837,7 @@ call_allocate(struct rpc_task *task)
1837 return; 1837 return;
1838 } 1838 }
1839 1839
1840 rpc_exit(task, -ERESTARTSYS); 1840 rpc_call_rpcerror(task, -ERESTARTSYS);
1841} 1841}
1842 1842
1843static int 1843static int
@@ -1862,6 +1862,7 @@ rpc_xdr_encode(struct rpc_task *task)
1862 req->rq_rbuffer, 1862 req->rq_rbuffer,
1863 req->rq_rcvsize); 1863 req->rq_rcvsize);
1864 1864
1865 req->rq_reply_bytes_recvd = 0;
1865 req->rq_snd_buf.head[0].iov_len = 0; 1866 req->rq_snd_buf.head[0].iov_len = 0;
1866 xdr_init_encode(&xdr, &req->rq_snd_buf, 1867 xdr_init_encode(&xdr, &req->rq_snd_buf,
1867 req->rq_snd_buf.head[0].iov_base, req); 1868 req->rq_snd_buf.head[0].iov_base, req);
@@ -1881,6 +1882,8 @@ call_encode(struct rpc_task *task)
1881 if (!rpc_task_need_encode(task)) 1882 if (!rpc_task_need_encode(task))
1882 goto out; 1883 goto out;
1883 dprint_status(task); 1884 dprint_status(task);
1885 /* Dequeue task from the receive queue while we're encoding */
1886 xprt_request_dequeue_xprt(task);
1884 /* Encode here so that rpcsec_gss can use correct sequence number. */ 1887 /* Encode here so that rpcsec_gss can use correct sequence number. */
1885 rpc_xdr_encode(task); 1888 rpc_xdr_encode(task);
1886 /* Did the encode result in an error condition? */ 1889 /* Did the encode result in an error condition? */
@@ -2479,6 +2482,7 @@ call_decode(struct rpc_task *task)
2479 struct rpc_clnt *clnt = task->tk_client; 2482 struct rpc_clnt *clnt = task->tk_client;
2480 struct rpc_rqst *req = task->tk_rqstp; 2483 struct rpc_rqst *req = task->tk_rqstp;
2481 struct xdr_stream xdr; 2484 struct xdr_stream xdr;
2485 int err;
2482 2486
2483 dprint_status(task); 2487 dprint_status(task);
2484 2488
@@ -2501,6 +2505,15 @@ call_decode(struct rpc_task *task)
2501 * before it changed req->rq_reply_bytes_recvd. 2505 * before it changed req->rq_reply_bytes_recvd.
2502 */ 2506 */
2503 smp_rmb(); 2507 smp_rmb();
2508
2509 /*
2510 * Did we ever call xprt_complete_rqst()? If not, we should assume
2511 * the message is incomplete.
2512 */
2513 err = -EAGAIN;
2514 if (!req->rq_reply_bytes_recvd)
2515 goto out;
2516
2504 req->rq_rcv_buf.len = req->rq_private_buf.len; 2517 req->rq_rcv_buf.len = req->rq_private_buf.len;
2505 2518
2506 /* Check that the softirq receive buffer is valid */ 2519 /* Check that the softirq receive buffer is valid */
@@ -2509,7 +2522,9 @@ call_decode(struct rpc_task *task)
2509 2522
2510 xdr_init_decode(&xdr, &req->rq_rcv_buf, 2523 xdr_init_decode(&xdr, &req->rq_rcv_buf,
2511 req->rq_rcv_buf.head[0].iov_base, req); 2524 req->rq_rcv_buf.head[0].iov_base, req);
2512 switch (rpc_decode_header(task, &xdr)) { 2525 err = rpc_decode_header(task, &xdr);
2526out:
2527 switch (err) {
2513 case 0: 2528 case 0:
2514 task->tk_action = rpc_exit_task; 2529 task->tk_action = rpc_exit_task;
2515 task->tk_status = rpcauth_unwrap_resp(task, &xdr); 2530 task->tk_status = rpcauth_unwrap_resp(task, &xdr);
@@ -2518,9 +2533,6 @@ call_decode(struct rpc_task *task)
2518 return; 2533 return;
2519 case -EAGAIN: 2534 case -EAGAIN:
2520 task->tk_status = 0; 2535 task->tk_status = 0;
2521 xdr_free_bvec(&req->rq_rcv_buf);
2522 req->rq_reply_bytes_recvd = 0;
2523 req->rq_rcv_buf.len = 0;
2524 if (task->tk_client->cl_discrtry) 2536 if (task->tk_client->cl_discrtry)
2525 xprt_conditional_disconnect(req->rq_xprt, 2537 xprt_conditional_disconnect(req->rq_xprt,
2526 req->rq_connect_cookie); 2538 req->rq_connect_cookie);
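
The new rq_reply_bytes_recvd check above implements "Don't try to parse incomplete RPC messages" from the merge description. A simplified sketch of the resulting entry logic (illustrative only, not the exact call_decode() code):

static int call_decode_entry_sketch(struct rpc_rqst *req)
{
	/* If xprt_complete_rqst() never ran, no reply bytes were
	 * received; return -EAGAIN so the task waits or retransmits
	 * instead of decoding a stale receive buffer.
	 */
	if (!req->rq_reply_bytes_recvd)
		return -EAGAIN;

	req->rq_rcv_buf.len = req->rq_private_buf.len;
	return 0;	/* safe to call rpc_decode_header() */
}
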
@@ -2561,7 +2573,7 @@ rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr)
2561 return 0; 2573 return 0;
2562out_fail: 2574out_fail:
2563 trace_rpc_bad_callhdr(task); 2575 trace_rpc_bad_callhdr(task);
2564 rpc_exit(task, error); 2576 rpc_call_rpcerror(task, error);
2565 return error; 2577 return error;
2566} 2578}
2567 2579
@@ -2628,7 +2640,7 @@ out_garbage:
2628 return -EAGAIN; 2640 return -EAGAIN;
2629 } 2641 }
2630out_err: 2642out_err:
2631 rpc_exit(task, error); 2643 rpc_call_rpcerror(task, error);
2632 return error; 2644 return error;
2633 2645
2634out_unparsable: 2646out_unparsable:
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 1f275aba786f..360afe153193 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -541,33 +541,14 @@ rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq,
541 return NULL; 541 return NULL;
542} 542}
543 543
544static void
545rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
546 struct rpc_wait_queue *queue, struct rpc_task *task)
547{
548 rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL);
549}
550
551/* 544/*
552 * Wake up a queued task while the queue lock is being held 545 * Wake up a queued task while the queue lock is being held
553 */ 546 */
554static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) 547static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue,
555{ 548 struct rpc_task *task)
556 rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
557}
558
559/*
560 * Wake up a task on a specific queue
561 */
562void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
563 struct rpc_wait_queue *queue,
564 struct rpc_task *task)
565{ 549{
566 if (!RPC_IS_QUEUED(task)) 550 rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
567 return; 551 task, NULL, NULL);
568 spin_lock(&queue->lock);
569 rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
570 spin_unlock(&queue->lock);
571} 552}
572 553
573/* 554/*
@@ -930,8 +911,10 @@ static void __rpc_execute(struct rpc_task *task)
930 /* 911 /*
931 * Signalled tasks should exit rather than sleep. 912 * Signalled tasks should exit rather than sleep.
932 */ 913 */
933 if (RPC_SIGNALLED(task)) 914 if (RPC_SIGNALLED(task)) {
915 task->tk_rpc_status = -ERESTARTSYS;
934 rpc_exit(task, -ERESTARTSYS); 916 rpc_exit(task, -ERESTARTSYS);
917 }
935 918
936 /* 919 /*
937 * The queue->lock protects against races with 920 * The queue->lock protects against races with
@@ -967,6 +950,7 @@ static void __rpc_execute(struct rpc_task *task)
967 */ 950 */
968 dprintk("RPC: %5u got signal\n", task->tk_pid); 951 dprintk("RPC: %5u got signal\n", task->tk_pid);
969 set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); 952 set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
953 task->tk_rpc_status = -ERESTARTSYS;
970 rpc_exit(task, -ERESTARTSYS); 954 rpc_exit(task, -ERESTARTSYS);
971 } 955 }
972 dprintk("RPC: %5u sync task resuming\n", task->tk_pid); 956 dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 48c93b9e525e..14ba9e72a204 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -560,7 +560,7 @@ EXPORT_SYMBOL_GPL(xdr_init_encode);
560 * required at the end of encoding, or any other time when the xdr_buf 560 * required at the end of encoding, or any other time when the xdr_buf
561 * data might be read. 561 * data might be read.
562 */ 562 */
563void xdr_commit_encode(struct xdr_stream *xdr) 563inline void xdr_commit_encode(struct xdr_stream *xdr)
564{ 564{
565 int shift = xdr->scratch.iov_len; 565 int shift = xdr->scratch.iov_len;
566 void *page; 566 void *page;
@@ -1236,43 +1236,60 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
1236} 1236}
1237EXPORT_SYMBOL_GPL(xdr_encode_word); 1237EXPORT_SYMBOL_GPL(xdr_encode_word);
1238 1238
1239/* If the netobj starting offset bytes from the start of xdr_buf is contained 1239/**
1240 * entirely in the head or the tail, set object to point to it; otherwise 1240 * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
1241 * try to find space for it at the end of the tail, copy it there, and 1241 * @buf: pointer to buffer containing a mic
1242 * set obj to point to it. */ 1242 * @mic: on success, returns the address of the mic
1243int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) 1243 * @offset: the offset in buf where mic may be found
1244 *
1245 * This function may modify the xdr buf if the mic is found to be straddling
1246 * a boundary between head, pages, and tail. On success the mic can be read
1247 * from the address returned. There is no need to free the mic.
1248 *
1249 * Return: Success returns 0, otherwise an integer error.
1250 */
1251int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
1244{ 1252{
1245 struct xdr_buf subbuf; 1253 struct xdr_buf subbuf;
1254 unsigned int boundary;
1246 1255
1247 if (xdr_decode_word(buf, offset, &obj->len)) 1256 if (xdr_decode_word(buf, offset, &mic->len))
1248 return -EFAULT; 1257 return -EFAULT;
1249 if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len)) 1258 offset += 4;
1259
1260 /* Is the mic partially in the head? */
1261 boundary = buf->head[0].iov_len;
1262 if (offset < boundary && (offset + mic->len) > boundary)
1263 xdr_shift_buf(buf, boundary - offset);
1264
1265 /* Is the mic partially in the pages? */
1266 boundary += buf->page_len;
1267 if (offset < boundary && (offset + mic->len) > boundary)
1268 xdr_shrink_pagelen(buf, boundary - offset);
1269
1270 if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
1250 return -EFAULT; 1271 return -EFAULT;
1251 1272
1252 /* Is the obj contained entirely in the head? */ 1273 /* Is the mic contained entirely in the head? */
1253 obj->data = subbuf.head[0].iov_base; 1274 mic->data = subbuf.head[0].iov_base;
1254 if (subbuf.head[0].iov_len == obj->len) 1275 if (subbuf.head[0].iov_len == mic->len)
1255 return 0; 1276 return 0;
1256 /* ..or is the obj contained entirely in the tail? */ 1277 /* ..or is the mic contained entirely in the tail? */
1257 obj->data = subbuf.tail[0].iov_base; 1278 mic->data = subbuf.tail[0].iov_base;
1258 if (subbuf.tail[0].iov_len == obj->len) 1279 if (subbuf.tail[0].iov_len == mic->len)
1259 return 0; 1280 return 0;
1260 1281
1261 /* use end of tail as storage for obj: 1282 /* Find a contiguous area in @buf to hold all of @mic */
1262 * (We don't copy to the beginning because then we'd have 1283 if (mic->len > buf->buflen - buf->len)
1263 * to worry about doing a potentially overlapping copy.
1264 * This assumes the object is at most half the length of the
1265 * tail.) */
1266 if (obj->len > buf->buflen - buf->len)
1267 return -ENOMEM; 1284 return -ENOMEM;
1268 if (buf->tail[0].iov_len != 0) 1285 if (buf->tail[0].iov_len != 0)
1269 obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len; 1286 mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
1270 else 1287 else
1271 obj->data = buf->head[0].iov_base + buf->head[0].iov_len; 1288 mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
1272 __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len); 1289 __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
1273 return 0; 1290 return 0;
1274} 1291}
1275EXPORT_SYMBOL_GPL(xdr_buf_read_netobj); 1292EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
1276 1293
1277/* Returns 0 on success, or else a negative error code. */ 1294/* Returns 0 on success, or else a negative error code. */
1278static int 1295static int
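
The renamed helper now pulls a MIC that straddles the head/pages/tail boundaries into contiguous memory before handing back a pointer into the buffer. A hedged usage sketch, modeled on the gss_unwrap_resp_integ() hunk earlier in this diff (the wrapper function and its error codes are illustrative assumptions):

static int verify_reply_mic_sketch(struct gss_ctx *gss_ctx,
				   struct xdr_buf *rcv_buf,
				   struct xdr_buf *integ_buf,
				   unsigned int mic_offset)
{
	struct xdr_netobj mic;

	/* On success, mic.data points into rcv_buf; nothing to free. */
	if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
		return -EIO;

	if (gss_verify_mic(gss_ctx, integ_buf, &mic) != GSS_S_COMPLETE)
		return -EACCES;
	return 0;
}
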
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2e71f5455c6c..8a45b3ccc313 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -456,6 +456,12 @@ void xprt_release_rqst_cong(struct rpc_task *task)
456} 456}
457EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); 457EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
458 458
459static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt)
460{
461 if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state))
462 __xprt_lock_write_next_cong(xprt);
463}
464
459/* 465/*
460 * Clear the congestion window wait flag and wake up the next 466 * Clear the congestion window wait flag and wake up the next
461 * entry on xprt->sending 467 * entry on xprt->sending
@@ -671,6 +677,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
671 spin_lock(&xprt->transport_lock); 677 spin_lock(&xprt->transport_lock);
672 xprt_clear_connected(xprt); 678 xprt_clear_connected(xprt);
673 xprt_clear_write_space_locked(xprt); 679 xprt_clear_write_space_locked(xprt);
680 xprt_clear_congestion_window_wait_locked(xprt);
674 xprt_wake_pending_tasks(xprt, -ENOTCONN); 681 xprt_wake_pending_tasks(xprt, -ENOTCONN);
675 spin_unlock(&xprt->transport_lock); 682 spin_unlock(&xprt->transport_lock);
676} 683}
@@ -1324,6 +1331,36 @@ xprt_request_dequeue_transmit(struct rpc_task *task)
1324} 1331}
1325 1332
1326/** 1333/**
1334 * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue
1335 * @task: pointer to rpc_task
1336 *
1337 * Remove a task from the transmit and receive queues, and ensure that
1338 * it is not pinned by the receive work item.
1339 */
1340void
1341xprt_request_dequeue_xprt(struct rpc_task *task)
1342{
1343 struct rpc_rqst *req = task->tk_rqstp;
1344 struct rpc_xprt *xprt = req->rq_xprt;
1345
1346 if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
1347 test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
1348 xprt_is_pinned_rqst(req)) {
1349 spin_lock(&xprt->queue_lock);
1350 xprt_request_dequeue_transmit_locked(task);
1351 xprt_request_dequeue_receive_locked(task);
1352 while (xprt_is_pinned_rqst(req)) {
1353 set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
1354 spin_unlock(&xprt->queue_lock);
1355 xprt_wait_on_pinned_rqst(req);
1356 spin_lock(&xprt->queue_lock);
1357 clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
1358 }
1359 spin_unlock(&xprt->queue_lock);
1360 }
1361}
1362
1363/**
1327 * xprt_request_prepare - prepare an encoded request for transport 1364 * xprt_request_prepare - prepare an encoded request for transport
1328 * @req: pointer to rpc_rqst 1365 * @req: pointer to rpc_rqst
1329 * 1366 *
@@ -1747,28 +1784,6 @@ void xprt_retry_reserve(struct rpc_task *task)
1747 xprt_do_reserve(xprt, task); 1784 xprt_do_reserve(xprt, task);
1748} 1785}
1749 1786
1750static void
1751xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req)
1752{
1753 struct rpc_xprt *xprt = req->rq_xprt;
1754
1755 if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
1756 test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
1757 xprt_is_pinned_rqst(req)) {
1758 spin_lock(&xprt->queue_lock);
1759 xprt_request_dequeue_transmit_locked(task);
1760 xprt_request_dequeue_receive_locked(task);
1761 while (xprt_is_pinned_rqst(req)) {
1762 set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
1763 spin_unlock(&xprt->queue_lock);
1764 xprt_wait_on_pinned_rqst(req);
1765 spin_lock(&xprt->queue_lock);
1766 clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
1767 }
1768 spin_unlock(&xprt->queue_lock);
1769 }
1770}
1771
1772/** 1787/**
1773 * xprt_release - release an RPC request slot 1788 * xprt_release - release an RPC request slot
1774 * @task: task which is finished with the slot 1789 * @task: task which is finished with the slot
@@ -1788,7 +1803,7 @@ void xprt_release(struct rpc_task *task)
1788 } 1803 }
1789 1804
1790 xprt = req->rq_xprt; 1805 xprt = req->rq_xprt;
1791 xprt_request_dequeue_all(task, req); 1806 xprt_request_dequeue_xprt(task);
1792 spin_lock(&xprt->transport_lock); 1807 spin_lock(&xprt->transport_lock);
1793 xprt->ops->release_xprt(xprt, task); 1808 xprt->ops->release_xprt(xprt, task);
1794 if (xprt->ops->release_request) 1809 if (xprt->ops->release_request)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 59e624b1d7a0..50e075fcdd8f 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -54,9 +54,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
54 54
55unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt) 55unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
56{ 56{
57 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 57 return RPCRDMA_BACKWARD_WRS >> 1;
58
59 return r_xprt->rx_buf.rb_bc_srv_max_requests;
60} 58}
61 59
62static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) 60static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 0b6dad7580a1..30065a28628c 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -7,67 +7,37 @@
7/* Lightweight memory registration using Fast Registration Work 7/* Lightweight memory registration using Fast Registration Work
8 * Requests (FRWR). 8 * Requests (FRWR).
9 * 9 *
10 * FRWR features ordered asynchronous registration and deregistration 10 * FRWR features ordered asynchronous registration and invalidation
11 * of arbitrarily sized memory regions. This is the fastest and safest 11 * of arbitrarily-sized memory regions. This is the fastest and safest
12 * but most complex memory registration mode. 12 * but most complex memory registration mode.
13 */ 13 */
14 14
15/* Normal operation 15/* Normal operation
16 * 16 *
17 * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG 17 * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
18 * Work Request (frwr_map). When the RDMA operation is finished, this 18 * Work Request (frwr_map). When the RDMA operation is finished, this
19 * Memory Region is invalidated using a LOCAL_INV Work Request 19 * Memory Region is invalidated using a LOCAL_INV Work Request
20 * (frwr_unmap_sync). 20 * (frwr_unmap_async and frwr_unmap_sync).
21 * 21 *
22 * Typically these Work Requests are not signaled, and neither are RDMA 22 * Typically FAST_REG Work Requests are not signaled, and neither are
23 * SEND Work Requests (with the exception of signaling occasionally to 23 * RDMA Send Work Requests (with the exception of signaling occasionally
24 * prevent provider work queue overflows). This greatly reduces HCA 24 * to prevent provider work queue overflows). This greatly reduces HCA
25 * interrupt workload. 25 * interrupt workload.
26 *
27 * As an optimization, frwr_unmap marks MRs INVALID before the
28 * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
29 * rb_mrs immediately so that no work (like managing a linked list
30 * under a spinlock) is needed in the completion upcall.
31 *
32 * But this means that frwr_map() can occasionally encounter an MR
33 * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
34 * ordering prevents a subsequent FAST_REG WR from executing against
35 * that MR while it is still being invalidated.
36 */ 26 */
37 27
38/* Transport recovery 28/* Transport recovery
39 * 29 *
40 * ->op_map and the transport connect worker cannot run at the same 30 * frwr_map and frwr_unmap_* cannot run at the same time the transport
41 * time, but ->op_unmap can fire while the transport connect worker 31 * connect worker is running. The connect worker holds the transport
42 * is running. Thus MR recovery is handled in ->op_map, to guarantee 32 * send lock, just as ->send_request does. This prevents frwr_map and
43 * that recovered MRs are owned by a sending RPC, and not one where 33 * the connect worker from running concurrently. When a connection is
44 * ->op_unmap could fire at the same time transport reconnect is 34 * closed, the Receive completion queue is drained before the allowing
45 * being done. 35 * the connect worker to get control. This prevents frwr_unmap and the
46 * 36 * connect worker from running concurrently.
47 * When the underlying transport disconnects, MRs are left in one of 37 *
48 * four states: 38 * When the underlying transport disconnects, MRs that are in flight
49 * 39 * are flushed and are likely unusable. Thus all flushed MRs are
50 * INVALID: The MR was not in use before the QP entered ERROR state. 40 * destroyed. New MRs are created on demand.
51 *
52 * VALID: The MR was registered before the QP entered ERROR state.
53 *
54 * FLUSHED_FR: The MR was being registered when the QP entered ERROR
55 * state, and the pending WR was flushed.
56 *
57 * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR
58 * state, and the pending WR was flushed.
59 *
60 * When frwr_map encounters FLUSHED and VALID MRs, they are recovered
61 * with ib_dereg_mr and then are re-initialized. Because MR recovery
62 * allocates fresh resources, it is deferred to a workqueue, and the
63 * recovered MRs are placed back on the rb_mrs list when recovery is
64 * complete. frwr_map allocates another MR for the current RPC while
65 * the broken MR is reset.
66 *
67 * To ensure that frwr_map doesn't encounter an MR that is marked
68 * INVALID but that is about to be flushed due to a previous transport
69 * disconnect, the transport connect worker attempts to drain all
70 * pending send queue WRs before the transport is reconnected.
71 */ 41 */
72 42
73#include <linux/sunrpc/rpc_rdma.h> 43#include <linux/sunrpc/rpc_rdma.h>
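
To summarize the rewritten comment, here is a compressed view of one MR's lifetime under the simplified scheme (the function names are those appearing in this diff; the sequencing is a sketch, not literal kernel code):

/*
 *   frwr_map()              - post FAST_REG; the MR now backs one
 *                             Read or Write chunk of a single RPC
 *   ... RDMA transfer completes ...
 *   frwr_unmap_async() /
 *   frwr_unmap_sync()       - post LOCAL_INV for each registered MR
 *   frwr_wc_localinv_*()    - on completion:
 *                               success -> rpcrdma_mr_put() (reused)
 *                               flushed -> rpcrdma_mr_recycle() (destroyed)
 *
 * After a disconnect, in-flight MRs are destroyed outright and new ones
 * are allocated on demand, which is why the old INVALID/VALID/FLUSHED_*
 * state tracking could be removed.
 */
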
@@ -118,15 +88,8 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
118 kfree(mr); 88 kfree(mr);
119} 89}
120 90
121/* MRs are dynamically allocated, so simply clean up and release the MR. 91static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
122 * A replacement MR will subsequently be allocated on demand.
123 */
124static void
125frwr_mr_recycle_worker(struct work_struct *work)
126{ 92{
127 struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
128 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
129
130 trace_xprtrdma_mr_recycle(mr); 93 trace_xprtrdma_mr_recycle(mr);
131 94
132 if (mr->mr_dir != DMA_NONE) { 95 if (mr->mr_dir != DMA_NONE) {
@@ -136,14 +99,40 @@ frwr_mr_recycle_worker(struct work_struct *work)
136 mr->mr_dir = DMA_NONE; 99 mr->mr_dir = DMA_NONE;
137 } 100 }
138 101
139 spin_lock(&r_xprt->rx_buf.rb_mrlock); 102 spin_lock(&r_xprt->rx_buf.rb_lock);
140 list_del(&mr->mr_all); 103 list_del(&mr->mr_all);
141 r_xprt->rx_stats.mrs_recycled++; 104 r_xprt->rx_stats.mrs_recycled++;
142 spin_unlock(&r_xprt->rx_buf.rb_mrlock); 105 spin_unlock(&r_xprt->rx_buf.rb_lock);
143 106
144 frwr_release_mr(mr); 107 frwr_release_mr(mr);
145} 108}
146 109
110/* MRs are dynamically allocated, so simply clean up and release the MR.
111 * A replacement MR will subsequently be allocated on demand.
112 */
113static void
114frwr_mr_recycle_worker(struct work_struct *work)
115{
116 struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
117 mr_recycle);
118
119 frwr_mr_recycle(mr->mr_xprt, mr);
120}
121
122/* frwr_recycle - Discard MRs
123 * @req: request to reset
124 *
125 * Used after a reconnect. These MRs could be in flight, we can't
126 * tell. Safe thing to do is release them.
127 */
128void frwr_recycle(struct rpcrdma_req *req)
129{
130 struct rpcrdma_mr *mr;
131
132 while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
133 frwr_mr_recycle(mr->mr_xprt, mr);
134}
135
147/* frwr_reset - Place MRs back on the free list 136/* frwr_reset - Place MRs back on the free list
148 * @req: request to reset 137 * @req: request to reset
149 * 138 *
@@ -156,12 +145,10 @@ frwr_mr_recycle_worker(struct work_struct *work)
156 */ 145 */
157void frwr_reset(struct rpcrdma_req *req) 146void frwr_reset(struct rpcrdma_req *req)
158{ 147{
159 while (!list_empty(&req->rl_registered)) { 148 struct rpcrdma_mr *mr;
160 struct rpcrdma_mr *mr;
161 149
162 mr = rpcrdma_mr_pop(&req->rl_registered); 150 while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
163 rpcrdma_mr_unmap_and_put(mr); 151 rpcrdma_mr_put(mr);
164 }
165} 152}
166 153
167/** 154/**
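
frwr_recycle() (added above) and frwr_reset() look similar but cover different situations; a tiny hypothetical helper makes the distinction explicit (illustrative only):

static void retire_req_mrs_sketch(struct rpcrdma_req *req,
				  bool after_reconnect)
{
	if (after_reconnect)
		frwr_recycle(req);	/* MRs may still be in flight: destroy */
	else
		frwr_reset(req);	/* MRs are known idle: back to free list */
}
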
@@ -179,11 +166,14 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
179 struct ib_mr *frmr; 166 struct ib_mr *frmr;
180 int rc; 167 int rc;
181 168
169 /* NB: ib_alloc_mr and device drivers typically allocate
170 * memory with GFP_KERNEL.
171 */
182 frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); 172 frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
183 if (IS_ERR(frmr)) 173 if (IS_ERR(frmr))
184 goto out_mr_err; 174 goto out_mr_err;
185 175
186 sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL); 176 sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
187 if (!sg) 177 if (!sg)
188 goto out_list_err; 178 goto out_list_err;
189 179
@@ -203,8 +193,6 @@ out_mr_err:
203 return rc; 193 return rc;
204 194
205out_list_err: 195out_list_err:
206 dprintk("RPC: %s: sg allocation failure\n",
207 __func__);
208 ib_dereg_mr(frmr); 196 ib_dereg_mr(frmr);
209 return -ENOMEM; 197 return -ENOMEM;
210} 198}
@@ -290,8 +278,8 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
290 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; 278 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
291 ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ 279 ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
292 280
293 ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / 281 ia->ri_max_segs =
294 ia->ri_max_frwr_depth); 282 DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
295 /* Reply chunks require segments for head and tail buffers */ 283 /* Reply chunks require segments for head and tail buffers */
296 ia->ri_max_segs += 2; 284 ia->ri_max_segs += 2;
297 if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) 285 if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
@@ -323,31 +311,25 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
323 * @nsegs: number of segments remaining 311 * @nsegs: number of segments remaining
324 * @writing: true when RDMA Write will be used 312 * @writing: true when RDMA Write will be used
325 * @xid: XID of RPC using the registered memory 313 * @xid: XID of RPC using the registered memory
326 * @out: initialized MR 314 * @mr: MR to fill in
327 * 315 *
328 * Prepare a REG_MR Work Request to register a memory region 316 * Prepare a REG_MR Work Request to register a memory region
329 * for remote access via RDMA READ or RDMA WRITE. 317 * for remote access via RDMA READ or RDMA WRITE.
330 * 318 *
331 * Returns the next segment or a negative errno pointer. 319 * Returns the next segment or a negative errno pointer.
332 * On success, the prepared MR is planted in @out. 320 * On success, @mr is filled in.
333 */ 321 */
334struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, 322struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
335 struct rpcrdma_mr_seg *seg, 323 struct rpcrdma_mr_seg *seg,
336 int nsegs, bool writing, __be32 xid, 324 int nsegs, bool writing, __be32 xid,
337 struct rpcrdma_mr **out) 325 struct rpcrdma_mr *mr)
338{ 326{
339 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 327 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
340 bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
341 struct rpcrdma_mr *mr;
342 struct ib_mr *ibmr;
343 struct ib_reg_wr *reg_wr; 328 struct ib_reg_wr *reg_wr;
329 struct ib_mr *ibmr;
344 int i, n; 330 int i, n;
345 u8 key; 331 u8 key;
346 332
347 mr = rpcrdma_mr_get(r_xprt);
348 if (!mr)
349 goto out_getmr_err;
350
351 if (nsegs > ia->ri_max_frwr_depth) 333 if (nsegs > ia->ri_max_frwr_depth)
352 nsegs = ia->ri_max_frwr_depth; 334 nsegs = ia->ri_max_frwr_depth;
353 for (i = 0; i < nsegs;) { 335 for (i = 0; i < nsegs;) {
@@ -362,7 +344,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
362 344
363 ++seg; 345 ++seg;
364 ++i; 346 ++i;
365 if (holes_ok) 347 if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
366 continue; 348 continue;
367 if ((i < nsegs && offset_in_page(seg->mr_offset)) || 349 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
368 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 350 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -397,22 +379,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
397 mr->mr_offset = ibmr->iova; 379 mr->mr_offset = ibmr->iova;
398 trace_xprtrdma_mr_map(mr); 380 trace_xprtrdma_mr_map(mr);
399 381
400 *out = mr;
401 return seg; 382 return seg;
402 383
403out_getmr_err:
404 xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
405 return ERR_PTR(-EAGAIN);
406
407out_dmamap_err: 384out_dmamap_err:
408 mr->mr_dir = DMA_NONE; 385 mr->mr_dir = DMA_NONE;
409 trace_xprtrdma_frwr_sgerr(mr, i); 386 trace_xprtrdma_frwr_sgerr(mr, i);
410 rpcrdma_mr_put(mr);
411 return ERR_PTR(-EIO); 387 return ERR_PTR(-EIO);
412 388
413out_mapmr_err: 389out_mapmr_err:
414 trace_xprtrdma_frwr_maperr(mr, n); 390 trace_xprtrdma_frwr_maperr(mr, n);
415 rpcrdma_mr_recycle(mr);
416 return ERR_PTR(-EIO); 391 return ERR_PTR(-EIO);
417} 392}
418 393
@@ -485,7 +460,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
485 if (mr->mr_handle == rep->rr_inv_rkey) { 460 if (mr->mr_handle == rep->rr_inv_rkey) {
486 list_del_init(&mr->mr_list); 461 list_del_init(&mr->mr_list);
487 trace_xprtrdma_mr_remoteinv(mr); 462 trace_xprtrdma_mr_remoteinv(mr);
488 rpcrdma_mr_unmap_and_put(mr); 463 rpcrdma_mr_put(mr);
489 break; /* only one invalidated MR per RPC */ 464 break; /* only one invalidated MR per RPC */
490 } 465 }
491} 466}
@@ -495,7 +470,7 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
495 if (wc->status != IB_WC_SUCCESS) 470 if (wc->status != IB_WC_SUCCESS)
496 rpcrdma_mr_recycle(mr); 471 rpcrdma_mr_recycle(mr);
497 else 472 else
498 rpcrdma_mr_unmap_and_put(mr); 473 rpcrdma_mr_put(mr);
499} 474}
500 475
501/** 476/**
@@ -532,8 +507,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
532 507
533 /* WARNING: Only wr_cqe and status are reliable at this point */ 508 /* WARNING: Only wr_cqe and status are reliable at this point */
534 trace_xprtrdma_wc_li_wake(wc, frwr); 509 trace_xprtrdma_wc_li_wake(wc, frwr);
535 complete(&frwr->fr_linv_done);
536 __frwr_release_mr(wc, mr); 510 __frwr_release_mr(wc, mr);
511 complete(&frwr->fr_linv_done);
537} 512}
538 513
539/** 514/**
@@ -562,8 +537,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
562 */ 537 */
563 frwr = NULL; 538 frwr = NULL;
564 prev = &first; 539 prev = &first;
565 while (!list_empty(&req->rl_registered)) { 540 while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
566 mr = rpcrdma_mr_pop(&req->rl_registered);
567 541
568 trace_xprtrdma_mr_localinv(mr); 542 trace_xprtrdma_mr_localinv(mr);
569 r_xprt->rx_stats.local_inv_needed++; 543 r_xprt->rx_stats.local_inv_needed++;
@@ -632,11 +606,15 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
632 struct rpcrdma_frwr *frwr = 606 struct rpcrdma_frwr *frwr =
633 container_of(cqe, struct rpcrdma_frwr, fr_cqe); 607 container_of(cqe, struct rpcrdma_frwr, fr_cqe);
634 struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); 608 struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
609 struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
635 610
636 /* WARNING: Only wr_cqe and status are reliable at this point */ 611 /* WARNING: Only wr_cqe and status are reliable at this point */
637 trace_xprtrdma_wc_li_done(wc, frwr); 612 trace_xprtrdma_wc_li_done(wc, frwr);
638 rpcrdma_complete_rqst(frwr->fr_req->rl_reply);
639 __frwr_release_mr(wc, mr); 613 __frwr_release_mr(wc, mr);
614
615 /* Ensure @rep is generated before __frwr_release_mr */
616 smp_rmb();
617 rpcrdma_complete_rqst(rep);
640} 618}
641 619
642/** 620/**
@@ -662,15 +640,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
662 */ 640 */
663 frwr = NULL; 641 frwr = NULL;
664 prev = &first; 642 prev = &first;
665 while (!list_empty(&req->rl_registered)) { 643 while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
666 mr = rpcrdma_mr_pop(&req->rl_registered);
667 644
668 trace_xprtrdma_mr_localinv(mr); 645 trace_xprtrdma_mr_localinv(mr);
669 r_xprt->rx_stats.local_inv_needed++; 646 r_xprt->rx_stats.local_inv_needed++;
670 647
671 frwr = &mr->frwr; 648 frwr = &mr->frwr;
672 frwr->fr_cqe.done = frwr_wc_localinv; 649 frwr->fr_cqe.done = frwr_wc_localinv;
673 frwr->fr_req = req;
674 last = &frwr->fr_invwr; 650 last = &frwr->fr_invwr;
675 last->next = NULL; 651 last->next = NULL;
676 last->wr_cqe = &frwr->fr_cqe; 652 last->wr_cqe = &frwr->fr_cqe;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 4345e6912392..b86b5fd62d9f 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -342,6 +342,32 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
342 return 0; 342 return 0;
343} 343}
344 344
345static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
346 struct rpcrdma_req *req,
347 struct rpcrdma_mr_seg *seg,
348 int nsegs, bool writing,
349 struct rpcrdma_mr **mr)
350{
351 *mr = rpcrdma_mr_pop(&req->rl_free_mrs);
352 if (!*mr) {
353 *mr = rpcrdma_mr_get(r_xprt);
354 if (!*mr)
355 goto out_getmr_err;
356 trace_xprtrdma_mr_get(req);
357 (*mr)->mr_req = req;
358 }
359
360 rpcrdma_mr_push(*mr, &req->rl_registered);
361 return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
362
363out_getmr_err:
364 trace_xprtrdma_nomrs(req);
365 xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
366 if (r_xprt->rx_ep.rep_connected != -ENODEV)
367 schedule_work(&r_xprt->rx_buf.rb_refresh_worker);
368 return ERR_PTR(-EAGAIN);
369}
370
345/* Register and XDR encode the Read list. Supports encoding a list of read 371/* Register and XDR encode the Read list. Supports encoding a list of read
346 * segments that belong to a single read chunk. 372 * segments that belong to a single read chunk.
347 * 373 *
@@ -356,9 +382,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
356 * 382 *
357 * Only a single @pos value is currently supported. 383 * Only a single @pos value is currently supported.
358 */ 384 */
359static noinline int 385static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
360rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 386 struct rpcrdma_req *req,
361 struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) 387 struct rpc_rqst *rqst,
388 enum rpcrdma_chunktype rtype)
362{ 389{
363 struct xdr_stream *xdr = &req->rl_stream; 390 struct xdr_stream *xdr = &req->rl_stream;
364 struct rpcrdma_mr_seg *seg; 391 struct rpcrdma_mr_seg *seg;
@@ -379,10 +406,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
379 return nsegs; 406 return nsegs;
380 407
381 do { 408 do {
382 seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); 409 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
383 if (IS_ERR(seg)) 410 if (IS_ERR(seg))
384 return PTR_ERR(seg); 411 return PTR_ERR(seg);
385 rpcrdma_mr_push(mr, &req->rl_registered);
386 412
387 if (encode_read_segment(xdr, mr, pos) < 0) 413 if (encode_read_segment(xdr, mr, pos) < 0)
388 return -EMSGSIZE; 414 return -EMSGSIZE;
@@ -411,9 +437,10 @@ done:
411 * 437 *
412 * Only a single Write chunk is currently supported. 438 * Only a single Write chunk is currently supported.
413 */ 439 */
414static noinline int 440static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
415rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 441 struct rpcrdma_req *req,
416 struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 442 struct rpc_rqst *rqst,
443 enum rpcrdma_chunktype wtype)
417{ 444{
418 struct xdr_stream *xdr = &req->rl_stream; 445 struct xdr_stream *xdr = &req->rl_stream;
419 struct rpcrdma_mr_seg *seg; 446 struct rpcrdma_mr_seg *seg;
@@ -440,10 +467,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
440 467
441 nchunks = 0; 468 nchunks = 0;
442 do { 469 do {
443 seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); 470 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
444 if (IS_ERR(seg)) 471 if (IS_ERR(seg))
445 return PTR_ERR(seg); 472 return PTR_ERR(seg);
446 rpcrdma_mr_push(mr, &req->rl_registered);
447 473
448 if (encode_rdma_segment(xdr, mr) < 0) 474 if (encode_rdma_segment(xdr, mr) < 0)
449 return -EMSGSIZE; 475 return -EMSGSIZE;
@@ -474,9 +500,10 @@ done:
474 * Returns zero on success, or a negative errno if a failure occurred. 500 * Returns zero on success, or a negative errno if a failure occurred.
475 * @xdr is advanced to the next position in the stream. 501 * @xdr is advanced to the next position in the stream.
476 */ 502 */
477static noinline int 503static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
478rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 504 struct rpcrdma_req *req,
479 struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 505 struct rpc_rqst *rqst,
506 enum rpcrdma_chunktype wtype)
480{ 507{
481 struct xdr_stream *xdr = &req->rl_stream; 508 struct xdr_stream *xdr = &req->rl_stream;
482 struct rpcrdma_mr_seg *seg; 509 struct rpcrdma_mr_seg *seg;
@@ -501,10 +528,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
501 528
502 nchunks = 0; 529 nchunks = 0;
503 do { 530 do {
504 seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); 531 seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
505 if (IS_ERR(seg)) 532 if (IS_ERR(seg))
506 return PTR_ERR(seg); 533 return PTR_ERR(seg);
507 rpcrdma_mr_push(mr, &req->rl_registered);
508 534
509 if (encode_rdma_segment(xdr, mr) < 0) 535 if (encode_rdma_segment(xdr, mr) < 0)
510 return -EMSGSIZE; 536 return -EMSGSIZE;
@@ -841,12 +867,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
841 * chunks. Very likely the connection has been replaced, 867 * chunks. Very likely the connection has been replaced,
842 * so these registrations are invalid and unusable. 868 * so these registrations are invalid and unusable.
843 */ 869 */
844 while (unlikely(!list_empty(&req->rl_registered))) { 870 frwr_recycle(req);
845 struct rpcrdma_mr *mr;
846
847 mr = rpcrdma_mr_pop(&req->rl_registered);
848 rpcrdma_mr_recycle(mr);
849 }
850 871
851 /* This implementation supports the following combinations 872 /* This implementation supports the following combinations
852 * of chunk lists in one RPC-over-RDMA Call message: 873 * of chunk lists in one RPC-over-RDMA Call message:
@@ -1240,8 +1261,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
1240 struct rpc_rqst *rqst = rep->rr_rqst; 1261 struct rpc_rqst *rqst = rep->rr_rqst;
1241 int status; 1262 int status;
1242 1263
1243 xprt->reestablish_timeout = 0;
1244
1245 switch (rep->rr_proc) { 1264 switch (rep->rr_proc) {
1246 case rdma_msg: 1265 case rdma_msg:
1247 status = rpcrdma_decode_msg(r_xprt, rep, rqst); 1266 status = rpcrdma_decode_msg(r_xprt, rep, rqst);
@@ -1300,6 +1319,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1300 u32 credits; 1319 u32 credits;
1301 __be32 *p; 1320 __be32 *p;
1302 1321
1322 /* Any data means we had a useful conversation, so
1323 * then we don't need to delay the next reconnect.
1324 */
1325 if (xprt->reestablish_timeout)
1326 xprt->reestablish_timeout = 0;
1327
1303 /* Fixed transport header fields */ 1328 /* Fixed transport header fields */
1304 xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, 1329 xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
1305 rep->rr_hdrbuf.head[0].iov_base, NULL); 1330 rep->rr_hdrbuf.head[0].iov_base, NULL);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2ec349ed4770..160558b4135e 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -423,8 +423,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt)
423 423
424 if (ep->rep_connected == -ENODEV) 424 if (ep->rep_connected == -ENODEV)
425 return; 425 return;
426 if (ep->rep_connected > 0)
427 xprt->reestablish_timeout = 0;
428 rpcrdma_ep_disconnect(ep, ia); 426 rpcrdma_ep_disconnect(ep, ia);
429 427
430 /* Prepare @xprt for the next connection by reinitializing 428 /* Prepare @xprt for the next connection by reinitializing
@@ -434,6 +432,7 @@ void xprt_rdma_close(struct rpc_xprt *xprt)
434 xprt->cwnd = RPC_CWNDSHIFT; 432 xprt->cwnd = RPC_CWNDSHIFT;
435 433
436out: 434out:
435 xprt->reestablish_timeout = 0;
437 ++xprt->connect_cookie; 436 ++xprt->connect_cookie;
438 xprt_disconnect_done(xprt); 437 xprt_disconnect_done(xprt);
439} 438}
@@ -494,9 +493,9 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
494 * @reconnect_timeout: reconnect timeout after server disconnects 493 * @reconnect_timeout: reconnect timeout after server disconnects
495 * 494 *
496 */ 495 */
497static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt, 496static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt,
498 unsigned long connect_timeout, 497 unsigned long connect_timeout,
499 unsigned long reconnect_timeout) 498 unsigned long reconnect_timeout)
500{ 499{
501 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 500 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
502 501
@@ -571,6 +570,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
571 return; 570 return;
572 571
573out_sleep: 572out_sleep:
573 set_bit(XPRT_CONGESTED, &xprt->state);
574 rpc_sleep_on(&xprt->backlog, task, NULL); 574 rpc_sleep_on(&xprt->backlog, task, NULL);
575 task->tk_status = -EAGAIN; 575 task->tk_status = -EAGAIN;
576} 576}
@@ -589,7 +589,8 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
589 589
590 memset(rqst, 0, sizeof(*rqst)); 590 memset(rqst, 0, sizeof(*rqst));
591 rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); 591 rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
592 rpc_wake_up_next(&xprt->backlog); 592 if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
593 clear_bit(XPRT_CONGESTED, &xprt->state);
593} 594}
594 595
595static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, 596static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
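
The XPRT_CONGESTED handling added above closes a window in which a task could sleep on the backlog with nobody left to wake it. The essential pattern, reduced to a sketch (illustrative, not the full slot allocation paths):

static void backlog_sleep_sketch(struct rpc_xprt *xprt, struct rpc_task *task)
{
	/* Mark the transport congested *before* sleeping ... */
	set_bit(XPRT_CONGESTED, &xprt->state);
	rpc_sleep_on(&xprt->backlog, task, NULL);
	task->tk_status = -EAGAIN;
}

static void backlog_wake_sketch(struct rpc_xprt *xprt)
{
	/* ... and clear it only when there was no waiter to wake. */
	if (!rpc_wake_up_next(&xprt->backlog))
		clear_bit(XPRT_CONGESTED, &xprt->state);
}
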
@@ -803,7 +804,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
803 .send_request = xprt_rdma_send_request, 804 .send_request = xprt_rdma_send_request,
804 .close = xprt_rdma_close, 805 .close = xprt_rdma_close,
805 .destroy = xprt_rdma_destroy, 806 .destroy = xprt_rdma_destroy,
806 .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout, 807 .set_connect_timeout = xprt_rdma_set_connect_timeout,
807 .print_stats = xprt_rdma_print_stats, 808 .print_stats = xprt_rdma_print_stats,
808 .enable_swap = xprt_rdma_enable_swap, 809 .enable_swap = xprt_rdma_enable_swap,
809 .disable_swap = xprt_rdma_disable_swap, 810 .disable_swap = xprt_rdma_disable_swap,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b10aa16557f0..3a907537e2cf 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,6 +53,7 @@
53#include <linux/slab.h> 53#include <linux/slab.h>
54#include <linux/sunrpc/addr.h> 54#include <linux/sunrpc/addr.h>
55#include <linux/sunrpc/svc_rdma.h> 55#include <linux/sunrpc/svc_rdma.h>
56#include <linux/log2.h>
56 57
57#include <asm-generic/barrier.h> 58#include <asm-generic/barrier.h>
58#include <asm/bitops.h> 59#include <asm/bitops.h>
@@ -74,8 +75,10 @@
74 * internal functions 75 * internal functions
75 */ 76 */
76static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); 77static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
78static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf);
77static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); 79static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
78static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); 80static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
81static void rpcrdma_mr_free(struct rpcrdma_mr *mr);
79static struct rpcrdma_regbuf * 82static struct rpcrdma_regbuf *
80rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, 83rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
81 gfp_t flags); 84 gfp_t flags);
@@ -405,9 +408,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
405 struct rpcrdma_ep *ep = &r_xprt->rx_ep; 408 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
406 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 409 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
407 struct rpcrdma_req *req; 410 struct rpcrdma_req *req;
408 struct rpcrdma_rep *rep;
409 411
410 cancel_delayed_work_sync(&buf->rb_refresh_worker); 412 cancel_work_sync(&buf->rb_refresh_worker);
411 413
412 /* This is similar to rpcrdma_ep_destroy, but: 414 /* This is similar to rpcrdma_ep_destroy, but:
413 * - Don't cancel the connect worker. 415 * - Don't cancel the connect worker.
@@ -429,8 +431,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
429 /* The ULP is responsible for ensuring all DMA 431 /* The ULP is responsible for ensuring all DMA
430 * mappings and MRs are gone. 432 * mappings and MRs are gone.
431 */ 433 */
432 list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) 434 rpcrdma_reps_destroy(buf);
433 rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
434 list_for_each_entry(req, &buf->rb_allreqs, rl_all) { 435 list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
435 rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); 436 rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf);
436 rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); 437 rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
@@ -604,10 +605,10 @@ void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
604 * Unlike a normal reconnection, a fresh PD and a new set 605 * Unlike a normal reconnection, a fresh PD and a new set
605 * of MRs and buffers is needed. 606 * of MRs and buffers is needed.
606 */ 607 */
607static int 608static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
608rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, 609 struct ib_qp_init_attr *qp_init_attr)
609 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
610{ 610{
611 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
611 int rc, err; 612 int rc, err;
612 613
613 trace_xprtrdma_reinsert(r_xprt); 614 trace_xprtrdma_reinsert(r_xprt);
@@ -624,7 +625,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
624 } 625 }
625 626
626 rc = -ENETUNREACH; 627 rc = -ENETUNREACH;
627 err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); 628 err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
628 if (err) { 629 if (err) {
629 pr_err("rpcrdma: rdma_create_qp returned %d\n", err); 630 pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
630 goto out3; 631 goto out3;
@@ -641,16 +642,16 @@ out1:
641 return rc; 642 return rc;
642} 643}
643 644
644static int 645static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
645rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, 646 struct ib_qp_init_attr *qp_init_attr)
646 struct rpcrdma_ia *ia)
647{ 647{
648 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
648 struct rdma_cm_id *id, *old; 649 struct rdma_cm_id *id, *old;
649 int err, rc; 650 int err, rc;
650 651
651 trace_xprtrdma_reconnect(r_xprt); 652 trace_xprtrdma_reconnect(r_xprt);
652 653
653 rpcrdma_ep_disconnect(ep, ia); 654 rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
654 655
655 rc = -EHOSTUNREACH; 656 rc = -EHOSTUNREACH;
656 id = rpcrdma_create_id(r_xprt, ia); 657 id = rpcrdma_create_id(r_xprt, ia);
@@ -672,7 +673,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
672 goto out_destroy; 673 goto out_destroy;
673 } 674 }
674 675
675 err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); 676 err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
676 if (err) 677 if (err)
677 goto out_destroy; 678 goto out_destroy;
678 679
@@ -697,25 +698,27 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
697 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, 698 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
698 rx_ia); 699 rx_ia);
699 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 700 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
701 struct ib_qp_init_attr qp_init_attr;
700 int rc; 702 int rc;
701 703
702retry: 704retry:
705 memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
703 switch (ep->rep_connected) { 706 switch (ep->rep_connected) {
704 case 0: 707 case 0:
705 dprintk("RPC: %s: connecting...\n", __func__); 708 dprintk("RPC: %s: connecting...\n", __func__);
706 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); 709 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
707 if (rc) { 710 if (rc) {
708 rc = -ENETUNREACH; 711 rc = -ENETUNREACH;
709 goto out_noupdate; 712 goto out_noupdate;
710 } 713 }
711 break; 714 break;
712 case -ENODEV: 715 case -ENODEV:
713 rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); 716 rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
714 if (rc) 717 if (rc)
715 goto out_noupdate; 718 goto out_noupdate;
716 break; 719 break;
717 default: 720 default:
718 rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); 721 rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
719 if (rc) 722 if (rc)
720 goto out; 723 goto out;
721 } 724 }
@@ -729,6 +732,8 @@ retry:
729 if (rc) 732 if (rc)
730 goto out; 733 goto out;
731 734
735 if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
736 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
732 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); 737 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
733 if (ep->rep_connected <= 0) { 738 if (ep->rep_connected <= 0) {
734 if (ep->rep_connected == -EAGAIN) 739 if (ep->rep_connected == -EAGAIN)
@@ -942,14 +947,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
942 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 947 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
943 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 948 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
944 unsigned int count; 949 unsigned int count;
945 LIST_HEAD(free);
946 LIST_HEAD(all);
947 950
948 for (count = 0; count < ia->ri_max_segs; count++) { 951 for (count = 0; count < ia->ri_max_segs; count++) {
949 struct rpcrdma_mr *mr; 952 struct rpcrdma_mr *mr;
950 int rc; 953 int rc;
951 954
952 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 955 mr = kzalloc(sizeof(*mr), GFP_NOFS);
953 if (!mr) 956 if (!mr)
954 break; 957 break;
955 958
@@ -961,15 +964,13 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
961 964
962 mr->mr_xprt = r_xprt; 965 mr->mr_xprt = r_xprt;
963 966
964 list_add(&mr->mr_list, &free); 967 spin_lock(&buf->rb_lock);
965 list_add(&mr->mr_all, &all); 968 list_add(&mr->mr_list, &buf->rb_mrs);
969 list_add(&mr->mr_all, &buf->rb_all_mrs);
970 spin_unlock(&buf->rb_lock);
966 } 971 }
967 972
968 spin_lock(&buf->rb_mrlock);
969 list_splice(&free, &buf->rb_mrs);
970 list_splice(&all, &buf->rb_all);
971 r_xprt->rx_stats.mrs_allocated += count; 973 r_xprt->rx_stats.mrs_allocated += count;
972 spin_unlock(&buf->rb_mrlock);
973 trace_xprtrdma_createmrs(r_xprt, count); 974 trace_xprtrdma_createmrs(r_xprt, count);
974} 975}
975 976
@@ -977,7 +978,7 @@ static void
977rpcrdma_mr_refresh_worker(struct work_struct *work) 978rpcrdma_mr_refresh_worker(struct work_struct *work)
978{ 979{
979 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, 980 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
980 rb_refresh_worker.work); 981 rb_refresh_worker);
981 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, 982 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
982 rx_buf); 983 rx_buf);
983 984
@@ -999,12 +1000,18 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
999 struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; 1000 struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
1000 struct rpcrdma_regbuf *rb; 1001 struct rpcrdma_regbuf *rb;
1001 struct rpcrdma_req *req; 1002 struct rpcrdma_req *req;
1003 size_t maxhdrsize;
1002 1004
1003 req = kzalloc(sizeof(*req), flags); 1005 req = kzalloc(sizeof(*req), flags);
1004 if (req == NULL) 1006 if (req == NULL)
1005 goto out1; 1007 goto out1;
1006 1008
1007 rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); 1009 /* Compute maximum header buffer size in bytes */
1010 maxhdrsize = rpcrdma_fixed_maxsz + 3 +
1011 r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz;
1012 maxhdrsize *= sizeof(__be32);
1013 rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
1014 DMA_TO_DEVICE, flags);
1008 if (!rb) 1015 if (!rb)
1009 goto out2; 1016 goto out2;
1010 req->rl_rdmabuf = rb; 1017 req->rl_rdmabuf = rb;
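
A worked example of the header-buffer sizing above. The constant values are assumptions for illustration (see the rpcrdma_*_maxsz enums in rpc_rdma.c for the authoritative ones):

/*
 * Assuming rpcrdma_fixed_maxsz = 4, rpcrdma_readchunk_maxsz = 6,
 * and ri_max_segs = 8:
 *
 *   maxhdrsize = 4 + 3 + 8 * 6       = 55 XDR words
 *              = 55 * sizeof(__be32) = 220 bytes
 *   __roundup_pow_of_two(220)        = 256-byte rl_rdmabuf
 *
 * i.e. the Call header buffer is now sized to the transport's actual
 * worst case rather than the fixed RPCRDMA_HDRBUF_SIZE.
 */
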
@@ -1018,6 +1025,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
1018 if (!req->rl_recvbuf) 1025 if (!req->rl_recvbuf)
1019 goto out4; 1026 goto out4;
1020 1027
1028 INIT_LIST_HEAD(&req->rl_free_mrs);
1021 INIT_LIST_HEAD(&req->rl_registered); 1029 INIT_LIST_HEAD(&req->rl_registered);
1022 spin_lock(&buffer->rb_lock); 1030 spin_lock(&buffer->rb_lock);
1023 list_add(&req->rl_all, &buffer->rb_allreqs); 1031 list_add(&req->rl_all, &buffer->rb_allreqs);
@@ -1065,6 +1073,40 @@ out:
1065 return NULL; 1073 return NULL;
1066} 1074}
1067 1075
1076static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
1077{
1078 rpcrdma_regbuf_free(rep->rr_rdmabuf);
1079 kfree(rep);
1080}
1081
1082static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
1083{
1084 struct llist_node *node;
1085
1086 /* Calls to llist_del_first are required to be serialized */
1087 node = llist_del_first(&buf->rb_free_reps);
1088 if (!node)
1089 return NULL;
1090 return llist_entry(node, struct rpcrdma_rep, rr_node);
1091}
1092
1093static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
1094 struct rpcrdma_rep *rep)
1095{
1096 if (!rep->rr_temp)
1097 llist_add(&rep->rr_node, &buf->rb_free_reps);
1098 else
1099 rpcrdma_rep_destroy(rep);
1100}
1101
1102static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
1103{
1104 struct rpcrdma_rep *rep;
1105
1106 while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
1107 rpcrdma_rep_destroy(rep);
1108}
1109
1068/** 1110/**
1069 * rpcrdma_buffer_create - Create initial set of req/rep objects 1111 * rpcrdma_buffer_create - Create initial set of req/rep objects
1070 * @r_xprt: transport instance to (re)initialize 1112 * @r_xprt: transport instance to (re)initialize
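
The rep cache above replaces the old rb_recv_bufs list with a lock-free llist. A minimal sketch of the producer/consumer pair (illustrative; the constraint that llist_del_first() callers be serialized comes from the comment in the hunk):

static void rep_cache_put_sketch(struct rpcrdma_buffer *buf,
				 struct rpcrdma_rep *rep)
{
	/* Safe from any context, no lock needed. */
	llist_add(&rep->rr_node, &buf->rb_free_reps);
}

static struct rpcrdma_rep *rep_cache_get_sketch(struct rpcrdma_buffer *buf)
{
	/* Callers of llist_del_first() must be serialized. */
	struct llist_node *node = llist_del_first(&buf->rb_free_reps);

	return node ? llist_entry(node, struct rpcrdma_rep, rr_node) : NULL;
}
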
@@ -1078,12 +1120,10 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1078 1120
1079 buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests; 1121 buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
1080 buf->rb_bc_srv_max_requests = 0; 1122 buf->rb_bc_srv_max_requests = 0;
1081 spin_lock_init(&buf->rb_mrlock);
1082 spin_lock_init(&buf->rb_lock); 1123 spin_lock_init(&buf->rb_lock);
1083 INIT_LIST_HEAD(&buf->rb_mrs); 1124 INIT_LIST_HEAD(&buf->rb_mrs);
1084 INIT_LIST_HEAD(&buf->rb_all); 1125 INIT_LIST_HEAD(&buf->rb_all_mrs);
1085 INIT_DELAYED_WORK(&buf->rb_refresh_worker, 1126 INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker);
1086 rpcrdma_mr_refresh_worker);
1087 1127
1088 rpcrdma_mrs_create(r_xprt); 1128 rpcrdma_mrs_create(r_xprt);
1089 1129
@@ -1102,7 +1142,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1102 } 1142 }
1103 1143
1104 buf->rb_credits = 1; 1144 buf->rb_credits = 1;
1105 INIT_LIST_HEAD(&buf->rb_recv_bufs); 1145 init_llist_head(&buf->rb_free_reps);
1106 1146
1107 rc = rpcrdma_sendctxs_create(r_xprt); 1147 rc = rpcrdma_sendctxs_create(r_xprt);
1108 if (rc) 1148 if (rc)
@@ -1114,12 +1154,6 @@ out:
1114 return rc; 1154 return rc;
1115} 1155}
1116 1156
1117static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
1118{
1119 rpcrdma_regbuf_free(rep->rr_rdmabuf);
1120 kfree(rep);
1121}
1122
1123/** 1157/**
1124 * rpcrdma_req_destroy - Destroy an rpcrdma_req object 1158 * rpcrdma_req_destroy - Destroy an rpcrdma_req object
1125 * @req: unused object to be destroyed 1159 * @req: unused object to be destroyed
@@ -1127,11 +1161,13 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
1127 * This function assumes that the caller prevents concurrent device 1161 * This function assumes that the caller prevents concurrent device
1128 * unload and transport tear-down. 1162 * unload and transport tear-down.
1129 */ 1163 */
1130void 1164void rpcrdma_req_destroy(struct rpcrdma_req *req)
1131rpcrdma_req_destroy(struct rpcrdma_req *req)
1132{ 1165{
1133 list_del(&req->rl_all); 1166 list_del(&req->rl_all);
1134 1167
1168 while (!list_empty(&req->rl_free_mrs))
1169 rpcrdma_mr_free(rpcrdma_mr_pop(&req->rl_free_mrs));
1170
1135 rpcrdma_regbuf_free(req->rl_recvbuf); 1171 rpcrdma_regbuf_free(req->rl_recvbuf);
1136 rpcrdma_regbuf_free(req->rl_sendbuf); 1172 rpcrdma_regbuf_free(req->rl_sendbuf);
1137 rpcrdma_regbuf_free(req->rl_rdmabuf); 1173 rpcrdma_regbuf_free(req->rl_rdmabuf);
@@ -1147,25 +1183,19 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
1147 unsigned int count; 1183 unsigned int count;
1148 1184
1149 count = 0; 1185 count = 0;
1150 spin_lock(&buf->rb_mrlock); 1186 spin_lock(&buf->rb_lock);
1151 while (!list_empty(&buf->rb_all)) { 1187 while ((mr = list_first_entry_or_null(&buf->rb_all_mrs,
1152 mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); 1188 struct rpcrdma_mr,
1189 mr_all)) != NULL) {
1153 list_del(&mr->mr_all); 1190 list_del(&mr->mr_all);
1154 1191 spin_unlock(&buf->rb_lock);
1155 spin_unlock(&buf->rb_mrlock);
1156
1157 /* Ensure MW is not on any rl_registered list */
1158 if (!list_empty(&mr->mr_list))
1159 list_del(&mr->mr_list);
1160 1192
1161 frwr_release_mr(mr); 1193 frwr_release_mr(mr);
1162 count++; 1194 count++;
1163 spin_lock(&buf->rb_mrlock); 1195 spin_lock(&buf->rb_lock);
1164 } 1196 }
1165 spin_unlock(&buf->rb_mrlock); 1197 spin_unlock(&buf->rb_lock);
1166 r_xprt->rx_stats.mrs_allocated = 0; 1198 r_xprt->rx_stats.mrs_allocated = 0;
1167
1168 dprintk("RPC: %s: released %u MRs\n", __func__, count);
1169} 1199}
1170 1200
1171/** 1201/**
@@ -1179,18 +1209,10 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
1179void 1209void
1180rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1210rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1181{ 1211{
1182 cancel_delayed_work_sync(&buf->rb_refresh_worker); 1212 cancel_work_sync(&buf->rb_refresh_worker);
1183 1213
1184 rpcrdma_sendctxs_destroy(buf); 1214 rpcrdma_sendctxs_destroy(buf);
1185 1215 rpcrdma_reps_destroy(buf);
1186 while (!list_empty(&buf->rb_recv_bufs)) {
1187 struct rpcrdma_rep *rep;
1188
1189 rep = list_first_entry(&buf->rb_recv_bufs,
1190 struct rpcrdma_rep, rr_list);
1191 list_del(&rep->rr_list);
1192 rpcrdma_rep_destroy(rep);
1193 }
1194 1216
1195 while (!list_empty(&buf->rb_send_bufs)) { 1217 while (!list_empty(&buf->rb_send_bufs)) {
1196 struct rpcrdma_req *req; 1218 struct rpcrdma_req *req;
@@ -1215,54 +1237,20 @@ struct rpcrdma_mr *
1215rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) 1237rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
1216{ 1238{
1217 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1239 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1218 struct rpcrdma_mr *mr = NULL; 1240 struct rpcrdma_mr *mr;
1219
1220 spin_lock(&buf->rb_mrlock);
1221 if (!list_empty(&buf->rb_mrs))
1222 mr = rpcrdma_mr_pop(&buf->rb_mrs);
1223 spin_unlock(&buf->rb_mrlock);
1224 1241
1225 if (!mr) 1242 spin_lock(&buf->rb_lock);
1226 goto out_nomrs; 1243 mr = rpcrdma_mr_pop(&buf->rb_mrs);
1244 spin_unlock(&buf->rb_lock);
1227 return mr; 1245 return mr;
1228
1229out_nomrs:
1230 trace_xprtrdma_nomrs(r_xprt);
1231 if (r_xprt->rx_ep.rep_connected != -ENODEV)
1232 schedule_delayed_work(&buf->rb_refresh_worker, 0);
1233
1234 /* Allow the reply handler and refresh worker to run */
1235 cond_resched();
1236
1237 return NULL;
1238}
1239
1240static void
1241__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
1242{
1243 spin_lock(&buf->rb_mrlock);
1244 rpcrdma_mr_push(mr, &buf->rb_mrs);
1245 spin_unlock(&buf->rb_mrlock);
1246}
1247
1248/**
1249 * rpcrdma_mr_put - Release an rpcrdma_mr object
1250 * @mr: object to release
1251 *
1252 */
1253void
1254rpcrdma_mr_put(struct rpcrdma_mr *mr)
1255{
1256 __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
1257} 1246}
1258 1247
1259/** 1248/**
1260 * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it 1249 * rpcrdma_mr_put - DMA unmap an MR and release it
1261 * @mr: object to release 1250 * @mr: MR to release
1262 * 1251 *
1263 */ 1252 */
1264void 1253void rpcrdma_mr_put(struct rpcrdma_mr *mr)
1265rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
1266{ 1254{
1267 struct rpcrdma_xprt *r_xprt = mr->mr_xprt; 1255 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1268 1256
@@ -1272,7 +1260,19 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
1272 mr->mr_sg, mr->mr_nents, mr->mr_dir); 1260 mr->mr_sg, mr->mr_nents, mr->mr_dir);
1273 mr->mr_dir = DMA_NONE; 1261 mr->mr_dir = DMA_NONE;
1274 } 1262 }
1275 __rpcrdma_mr_put(&r_xprt->rx_buf, mr); 1263
1264 rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
1265}
1266
1267static void rpcrdma_mr_free(struct rpcrdma_mr *mr)
1268{
1269 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1270 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1271
1272 mr->mr_req = NULL;
1273 spin_lock(&buf->rb_lock);
1274 rpcrdma_mr_push(mr, &buf->rb_mrs);
1275 spin_unlock(&buf->rb_lock);
1276} 1276}
1277 1277
1278/** 1278/**
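Taken together, rpcrdma_mr_put() and rpcrdma_mr_free() above split MR recycling into a fast per-request path (the DMA-unmapped MR is parked on its req's rl_free_mrs list without taking rb_lock) and a slower transport-wide path (the MR returns to rb_mrs under rb_lock when the req is torn down). A hedged sketch of how a caller can now obtain an MR cheaply; the helper name is hypothetical and the patch's real call sites live elsewhere in this series:

    static struct rpcrdma_mr *example_mr_get(struct rpcrdma_xprt *r_xprt,
                                             struct rpcrdma_req *req)
    {
            struct rpcrdma_mr *mr;

            /* Fast path: reuse an MR this req released earlier. */
            mr = rpcrdma_mr_pop(&req->rl_free_mrs);
            if (mr)
                    return mr;

            /* Slow path: take one from the transport-wide pool. */
            return rpcrdma_mr_get(r_xprt);
    }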
@@ -1303,39 +1303,24 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1303 */ 1303 */
1304void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) 1304void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
1305{ 1305{
1306 struct rpcrdma_rep *rep = req->rl_reply; 1306 if (req->rl_reply)
1307 1307 rpcrdma_rep_put(buffers, req->rl_reply);
1308 req->rl_reply = NULL; 1308 req->rl_reply = NULL;
1309 1309
1310 spin_lock(&buffers->rb_lock); 1310 spin_lock(&buffers->rb_lock);
1311 list_add(&req->rl_list, &buffers->rb_send_bufs); 1311 list_add(&req->rl_list, &buffers->rb_send_bufs);
1312 if (rep) {
1313 if (!rep->rr_temp) {
1314 list_add(&rep->rr_list, &buffers->rb_recv_bufs);
1315 rep = NULL;
1316 }
1317 }
1318 spin_unlock(&buffers->rb_lock); 1312 spin_unlock(&buffers->rb_lock);
1319 if (rep)
1320 rpcrdma_rep_destroy(rep);
1321} 1313}
1322 1314
1323/* 1315/**
1324 * Put reply buffers back into pool when not attached to 1316 * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
1325 * request. This happens in error conditions. 1317 * @rep: rep to release
1318 *
1319 * Used after error conditions.
1326 */ 1320 */
1327void 1321void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1328rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1329{ 1322{
1330 struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; 1323 rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
1331
1332 if (!rep->rr_temp) {
1333 spin_lock(&buffers->rb_lock);
1334 list_add(&rep->rr_list, &buffers->rb_recv_bufs);
1335 spin_unlock(&buffers->rb_lock);
1336 } else {
1337 rpcrdma_rep_destroy(rep);
1338 }
1339} 1324}
1340 1325
1341/* Returns a pointer to a rpcrdma_regbuf object, or NULL. 1326/* Returns a pointer to a rpcrdma_regbuf object, or NULL.
@@ -1483,7 +1468,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
1483 count = 0; 1468 count = 0;
1484 1469
1485 needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); 1470 needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
1486 if (ep->rep_receive_count > needed) 1471 if (likely(ep->rep_receive_count > needed))
1487 goto out; 1472 goto out;
1488 needed -= ep->rep_receive_count; 1473 needed -= ep->rep_receive_count;
1489 if (!temp) 1474 if (!temp)
@@ -1491,22 +1476,10 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
1491 1476
1492 /* fast path: all needed reps can be found on the free list */ 1477 /* fast path: all needed reps can be found on the free list */
1493 wr = NULL; 1478 wr = NULL;
1494 spin_lock(&buf->rb_lock);
1495 while (needed) { 1479 while (needed) {
1496 rep = list_first_entry_or_null(&buf->rb_recv_bufs, 1480 rep = rpcrdma_rep_get_locked(buf);
1497 struct rpcrdma_rep, rr_list);
1498 if (!rep) 1481 if (!rep)
1499 break; 1482 rep = rpcrdma_rep_create(r_xprt, temp);
1500
1501 list_del(&rep->rr_list);
1502 rep->rr_recv_wr.next = wr;
1503 wr = &rep->rr_recv_wr;
1504 --needed;
1505 }
1506 spin_unlock(&buf->rb_lock);
1507
1508 while (needed) {
1509 rep = rpcrdma_rep_create(r_xprt, temp);
1510 if (!rep) 1483 if (!rep)
1511 break; 1484 break;
1512 1485
@@ -1523,7 +1496,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
1523 if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) 1496 if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
1524 goto release_wrs; 1497 goto release_wrs;
1525 1498
1526 trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); 1499 trace_xprtrdma_post_recv(rep);
1527 ++count; 1500 ++count;
1528 } 1501 }
1529 1502
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 92ce09fcea74..65e6b0eb862e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -47,6 +47,7 @@
47#include <linux/atomic.h> /* atomic_t, etc */ 47#include <linux/atomic.h> /* atomic_t, etc */
48#include <linux/kref.h> /* struct kref */ 48#include <linux/kref.h> /* struct kref */
49#include <linux/workqueue.h> /* struct work_struct */ 49#include <linux/workqueue.h> /* struct work_struct */
50#include <linux/llist.h>
50 51
51#include <rdma/rdma_cm.h> /* RDMA connection api */ 52#include <rdma/rdma_cm.h> /* RDMA connection api */
52#include <rdma/ib_verbs.h> /* RDMA verbs api */ 53#include <rdma/ib_verbs.h> /* RDMA verbs api */
@@ -117,9 +118,6 @@ struct rpcrdma_ep {
117#endif 118#endif
118 119
119/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV 120/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
120 *
121 * The below structure appears at the front of a large region of kmalloc'd
122 * memory, which always starts on a good alignment boundary.
123 */ 121 */
124 122
125struct rpcrdma_regbuf { 123struct rpcrdma_regbuf {
@@ -158,25 +156,22 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
158 156
159/* To ensure a transport can always make forward progress, 157/* To ensure a transport can always make forward progress,
160 * the number of RDMA segments allowed in header chunk lists 158 * the number of RDMA segments allowed in header chunk lists
161 * is capped at 8. This prevents less-capable devices and 159 * is capped at 16. This prevents less-capable devices from
162 * memory registrations from overrunning the Send buffer 160 * overrunning the Send buffer while building chunk lists.
163 * while building chunk lists.
164 * 161 *
165 * Elements of the Read list take up more room than the 162 * Elements of the Read list take up more room than the
166 * Write list or Reply chunk. 8 read segments means the Read 163 * Write list or Reply chunk. 16 read segments means the
167 * list (or Write list or Reply chunk) cannot consume more 164 * chunk lists cannot consume more than
168 * than
169 *
170 * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
171 * 165 *
172 * And the fixed part of the header is another 24 bytes. 166 * ((16 + 2) * read segment size) + 1 XDR words,
173 * 167 *
174 * The smallest inline threshold is 1024 bytes, ensuring that 168 * or about 400 bytes. The fixed part of the header is
175 * at least 750 bytes are available for RPC messages. 169 * another 24 bytes. Thus when the inline threshold is
170 * 1024 bytes, at least 600 bytes are available for RPC
171 * message bodies.
176 */ 172 */
177enum { 173enum {
178 RPCRDMA_MAX_HDR_SEGS = 8, 174 RPCRDMA_MAX_HDR_SEGS = 16,
179 RPCRDMA_HDRBUF_SIZE = 256,
180}; 175};
181 176
182/* 177/*
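For reference, the figures behind the rewritten comment, using the read-segment size of 6 XDR words implied by the old text's "((8 + 2) * read segment size) + 1 XDR words, or 244 bytes" (that is, (10 * 6 + 1) * 4 = 244):

    (16 + 2) * 6 + 1 = 109 XDR words = 436 bytes of chunk lists ("about 400 bytes")
    1024-byte inline threshold - ~400 bytes of chunks - 24 bytes of fixed header
                               ~= 600 bytes left for the RPC message body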
@@ -206,7 +201,7 @@ struct rpcrdma_rep {
206 struct rpc_rqst *rr_rqst; 201 struct rpc_rqst *rr_rqst;
207 struct xdr_buf rr_hdrbuf; 202 struct xdr_buf rr_hdrbuf;
208 struct xdr_stream rr_stream; 203 struct xdr_stream rr_stream;
209 struct list_head rr_list; 204 struct llist_node rr_node;
210 struct ib_recv_wr rr_recv_wr; 205 struct ib_recv_wr rr_recv_wr;
211}; 206};
212 207
@@ -240,20 +235,20 @@ struct rpcrdma_sendctx {
240 * An external memory region is any buffer or page that is registered 235 * An external memory region is any buffer or page that is registered
241 * on the fly (ie, not pre-registered). 236 * on the fly (ie, not pre-registered).
242 */ 237 */
243struct rpcrdma_req;
244struct rpcrdma_frwr { 238struct rpcrdma_frwr {
245 struct ib_mr *fr_mr; 239 struct ib_mr *fr_mr;
246 struct ib_cqe fr_cqe; 240 struct ib_cqe fr_cqe;
247 struct completion fr_linv_done; 241 struct completion fr_linv_done;
248 struct rpcrdma_req *fr_req;
249 union { 242 union {
250 struct ib_reg_wr fr_regwr; 243 struct ib_reg_wr fr_regwr;
251 struct ib_send_wr fr_invwr; 244 struct ib_send_wr fr_invwr;
252 }; 245 };
253}; 246};
254 247
248struct rpcrdma_req;
255struct rpcrdma_mr { 249struct rpcrdma_mr {
256 struct list_head mr_list; 250 struct list_head mr_list;
251 struct rpcrdma_req *mr_req;
257 struct scatterlist *mr_sg; 252 struct scatterlist *mr_sg;
258 int mr_nents; 253 int mr_nents;
259 enum dma_data_direction mr_dir; 254 enum dma_data_direction mr_dir;
@@ -331,7 +326,8 @@ struct rpcrdma_req {
331 struct list_head rl_all; 326 struct list_head rl_all;
332 struct kref rl_kref; 327 struct kref rl_kref;
333 328
334 struct list_head rl_registered; /* registered segments */ 329 struct list_head rl_free_mrs;
330 struct list_head rl_registered;
335 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; 331 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
336}; 332};
337 333
@@ -344,7 +340,7 @@ rpcr_to_rdmar(const struct rpc_rqst *rqst)
344static inline void 340static inline void
345rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) 341rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
346{ 342{
347 list_add_tail(&mr->mr_list, list); 343 list_add(&mr->mr_list, list);
348} 344}
349 345
350static inline struct rpcrdma_mr * 346static inline struct rpcrdma_mr *
@@ -352,8 +348,9 @@ rpcrdma_mr_pop(struct list_head *list)
352{ 348{
353 struct rpcrdma_mr *mr; 349 struct rpcrdma_mr *mr;
354 350
355 mr = list_first_entry(list, struct rpcrdma_mr, mr_list); 351 mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list);
356 list_del_init(&mr->mr_list); 352 if (mr)
353 list_del_init(&mr->mr_list);
357 return mr; 354 return mr;
358} 355}
359 356
@@ -364,19 +361,19 @@ rpcrdma_mr_pop(struct list_head *list)
364 * One of these is associated with a transport instance 361 * One of these is associated with a transport instance
365 */ 362 */
366struct rpcrdma_buffer { 363struct rpcrdma_buffer {
367 spinlock_t rb_mrlock; /* protect rb_mrs list */ 364 spinlock_t rb_lock;
365 struct list_head rb_send_bufs;
368 struct list_head rb_mrs; 366 struct list_head rb_mrs;
369 struct list_head rb_all;
370 367
371 unsigned long rb_sc_head; 368 unsigned long rb_sc_head;
372 unsigned long rb_sc_tail; 369 unsigned long rb_sc_tail;
373 unsigned long rb_sc_last; 370 unsigned long rb_sc_last;
374 struct rpcrdma_sendctx **rb_sc_ctxs; 371 struct rpcrdma_sendctx **rb_sc_ctxs;
375 372
376 spinlock_t rb_lock; /* protect buf lists */
377 struct list_head rb_send_bufs;
378 struct list_head rb_recv_bufs;
379 struct list_head rb_allreqs; 373 struct list_head rb_allreqs;
374 struct list_head rb_all_mrs;
375
376 struct llist_head rb_free_reps;
380 377
381 u32 rb_max_requests; 378 u32 rb_max_requests;
382 u32 rb_credits; /* most recent credit grant */ 379 u32 rb_credits; /* most recent credit grant */
@@ -384,7 +381,7 @@ struct rpcrdma_buffer {
384 u32 rb_bc_srv_max_requests; 381 u32 rb_bc_srv_max_requests;
385 u32 rb_bc_max_requests; 382 u32 rb_bc_max_requests;
386 383
387 struct delayed_work rb_refresh_worker; 384 struct work_struct rb_refresh_worker;
388}; 385};
389 386
390/* 387/*
@@ -490,7 +487,6 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
490 487
491struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); 488struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
492void rpcrdma_mr_put(struct rpcrdma_mr *mr); 489void rpcrdma_mr_put(struct rpcrdma_mr *mr);
493void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
494 490
495static inline void 491static inline void
496rpcrdma_mr_recycle(struct rpcrdma_mr *mr) 492rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
@@ -546,6 +542,7 @@ rpcrdma_data_dir(bool writing)
546/* Memory registration calls xprtrdma/frwr_ops.c 542/* Memory registration calls xprtrdma/frwr_ops.c
547 */ 543 */
548bool frwr_is_supported(struct ib_device *device); 544bool frwr_is_supported(struct ib_device *device);
545void frwr_recycle(struct rpcrdma_req *req);
549void frwr_reset(struct rpcrdma_req *req); 546void frwr_reset(struct rpcrdma_req *req);
550int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); 547int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
551int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); 548int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
@@ -554,7 +551,7 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt);
554struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, 551struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
555 struct rpcrdma_mr_seg *seg, 552 struct rpcrdma_mr_seg *seg,
556 int nsegs, bool writing, __be32 xid, 553 int nsegs, bool writing, __be32 xid,
557 struct rpcrdma_mr **mr); 554 struct rpcrdma_mr *mr);
558int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); 555int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
559void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); 556void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
560void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); 557void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e2176c167a57..9ac88722fa83 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -562,10 +562,14 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
562 printk(KERN_WARNING "Callback slot table overflowed\n"); 562 printk(KERN_WARNING "Callback slot table overflowed\n");
563 return -ESHUTDOWN; 563 return -ESHUTDOWN;
564 } 564 }
565 if (transport->recv.copied && !req->rq_private_buf.len)
566 return -ESHUTDOWN;
565 567
566 ret = xs_read_stream_request(transport, msg, flags, req); 568 ret = xs_read_stream_request(transport, msg, flags, req);
567 if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) 569 if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
568 xprt_complete_bc_request(req, transport->recv.copied); 570 xprt_complete_bc_request(req, transport->recv.copied);
571 else
572 req->rq_private_buf.len = transport->recv.copied;
569 573
570 return ret; 574 return ret;
571} 575}
@@ -587,7 +591,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
587 /* Look up and lock the request corresponding to the given XID */ 591 /* Look up and lock the request corresponding to the given XID */
588 spin_lock(&xprt->queue_lock); 592 spin_lock(&xprt->queue_lock);
589 req = xprt_lookup_rqst(xprt, transport->recv.xid); 593 req = xprt_lookup_rqst(xprt, transport->recv.xid);
590 if (!req) { 594 if (!req || (transport->recv.copied && !req->rq_private_buf.len)) {
591 msg->msg_flags |= MSG_TRUNC; 595 msg->msg_flags |= MSG_TRUNC;
592 goto out; 596 goto out;
593 } 597 }
@@ -599,6 +603,8 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
599 spin_lock(&xprt->queue_lock); 603 spin_lock(&xprt->queue_lock);
600 if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) 604 if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
601 xprt_complete_rqst(req->rq_task, transport->recv.copied); 605 xprt_complete_rqst(req->rq_task, transport->recv.copied);
606 else
607 req->rq_private_buf.len = transport->recv.copied;
602 xprt_unpin_rqst(req); 608 xprt_unpin_rqst(req);
603out: 609out:
604 spin_unlock(&xprt->queue_lock); 610 spin_unlock(&xprt->queue_lock);
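The xprtsock.c hunks above cooperate so the stream reader never copies reply data into a request buffer whose receive state it can no longer trust: an incomplete read records its progress in rq_private_buf.len, and a later read that finds transport->recv.copied non-zero while rq_private_buf.len is zero treats the message as truncated rather than receiving into the buffer. A rough sketch of that check, with a hypothetical helper name and assuming the xprtsock.c context above:

    /* Hypothetical helper: true when a receive was in progress but the
     * request's receive buffer no longer reflects that progress.
     */
    static bool xs_stream_buf_mismatch(const struct sock_xprt *transport,
                                       const struct rpc_rqst *req)
    {
            return transport->recv.copied && !req->rq_private_buf.len;
    }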