aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-02-11 20:14:54 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-11 20:14:54 -0500
commit6f83e5bd3e96228ee0caff0b103addb5f4e95459 (patch)
treebf27fcc754ea8c5287088df2803edbfb3c70f716
parent73b4f63aebd6d57db4ca1d31fa6f8516651207b0 (diff)
parentc627d31ba0696cbd829437af2be2f2dee3546b1e (diff)
Merge tag 'nfs-for-3.20-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust: "Highlights include: Features: - Removing the forced serialisation of open()/close() calls in NFSv4.x (x>0) makes for a significant performance improvement in metadata intensive workloads. - Full support for the pNFS "flexible files" layout type - Further RPC/RDMA client improvements from Chuck Bugfixes: - Stable fix: NFSv4.1 backchannel calls blocking operations with !TASK_RUNNING - Stable fix: pnfs_generic_pg_init_read/write can be called with lseg == NULL - Stable fix: Fix an Oopsable condition when nsm_mon_unmon is called as part of the namespace cleanup, - Stable fix: Ensure we reference the inode for return-on-close in delegreturn - Use SO_REUSEPORT to ensure that NFSv3 TCP connections can rebind to the same source address/port combination during a disconnect/reconnect event. This is a requirement imposed by most NFSv3 server duplicate reply cache implementations. Optimisations: - Ask for no NFSv4.1 delegations on OPEN if using O_DIRECT Other: - Add Anna Schumaker as co-maintainer for the NFS client" * tag 'nfs-for-3.20-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (119 commits) SUNRPC: Cleanup to remove xs_tcp_close() pnfs: delete an unintended goto pnfs/flexfiles: Do not dprintk after the free SUNRPC: Fix stupid typo in xs_sock_set_reuseport SUNRPC: Define xs_tcp_fin_timeout only if CONFIG_SUNRPC_DEBUG SUNRPC: Handle connection reset more efficiently. 
SUNRPC: Remove the redundant XPRT_CONNECTION_CLOSE flag SUNRPC: Make xs_tcp_close() do a socket shutdown rather than a sock_release SUNRPC: Ensure xs_tcp_shutdown() requests a full close of the connection SUNRPC: Cleanup to remove remaining uses of XPRT_CONNECTION_ABORT SUNRPC: Remove TCP socket linger code SUNRPC: Remove TCP client connection reset hack SUNRPC: TCP/UDP always close the old socket before reconnecting SUNRPC: Add helpers to prevent socket create from racing SUNRPC: Ensure xs_reset_transport() resets the close connection flags SUNRPC: Do not clear the source port in xs_reset_transport SUNRPC: Handle EADDRINUSE on connect SUNRPC: Set SO_REUSEPORT socket option for TCP connections NFSv4.1: Fix pnfs_put_lseg races NFSv4.1: pnfs_send_layoutreturn should use GFP_NOFS ...
-rw-r--r--Documentation/filesystems/nfs/pnfs.txt13
-rw-r--r--MAINTAINERS1
-rw-r--r--fs/lockd/mon.c13
-rw-r--r--fs/nfs/Kconfig5
-rw-r--r--fs/nfs/Makefile3
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/callback.c8
-rw-r--r--fs/nfs/delegation.c20
-rw-r--r--fs/nfs/direct.c112
-rw-r--r--fs/nfs/filelayout/filelayout.c315
-rw-r--r--fs/nfs/filelayout/filelayout.h40
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c469
-rw-r--r--fs/nfs/flexfilelayout/Makefile5
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c1574
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h155
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c552
-rw-r--r--fs/nfs/idmap.c3
-rw-r--r--fs/nfs/inode.c7
-rw-r--r--fs/nfs/internal.h53
-rw-r--r--fs/nfs/nfs2xdr.c10
-rw-r--r--fs/nfs/nfs3_fs.h2
-rw-r--r--fs/nfs/nfs3client.c41
-rw-r--r--fs/nfs/nfs3proc.c9
-rw-r--r--fs/nfs/nfs3super.c2
-rw-r--r--fs/nfs/nfs3xdr.c3
-rw-r--r--fs/nfs/nfs4_fs.h9
-rw-r--r--fs/nfs/nfs4client.c7
-rw-r--r--fs/nfs/nfs4proc.c315
-rw-r--r--fs/nfs/nfs4state.c31
-rw-r--r--fs/nfs/nfs4super.c3
-rw-r--r--fs/nfs/nfs4xdr.c113
-rw-r--r--fs/nfs/nfsroot.c4
-rw-r--r--fs/nfs/objlayout/objio_osd.c5
-rw-r--r--fs/nfs/pagelist.c294
-rw-r--r--fs/nfs/pnfs.c471
-rw-r--r--fs/nfs/pnfs.h135
-rw-r--r--fs/nfs/pnfs_nfs.c840
-rw-r--r--fs/nfs/read.c33
-rw-r--r--fs/nfs/super.c9
-rw-r--r--fs/nfs/write.c52
-rw-r--r--include/linux/nfs4.h1
-rw-r--r--include/linux/nfs_fs_sb.h9
-rw-r--r--include/linux/nfs_idmap.h2
-rw-r--r--include/linux/nfs_page.h22
-rw-r--r--include/linux/nfs_xdr.h19
-rw-r--r--include/linux/sunrpc/clnt.h3
-rw-r--r--include/linux/sunrpc/metrics.h4
-rw-r--r--include/linux/sunrpc/rpc_rdma.h14
-rw-r--r--include/linux/sunrpc/svc_rdma.h2
-rw-r--r--include/linux/sunrpc/xprt.h6
-rw-r--r--net/sunrpc/clnt.c15
-rw-r--r--net/sunrpc/rpcb_clnt.c8
-rw-r--r--net/sunrpc/sched.c7
-rw-r--r--net/sunrpc/stats.c26
-rw-r--r--net/sunrpc/xprt.c38
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c108
-rw-r--r--net/sunrpc/xprtrdma/transport.c182
-rw-r--r--net/sunrpc/xprtrdma/verbs.c411
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h112
-rw-r--r--net/sunrpc/xprtsock.c238
60 files changed, 5236 insertions, 1729 deletions
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index adc81a35fe2d..44a9f2493a88 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -57,15 +57,16 @@ bit is set, preventing any new lsegs from being added.
57layout drivers 57layout drivers
58-------------- 58--------------
59 59
60PNFS utilizes what is called layout drivers. The STD defines 3 basic 60PNFS utilizes what is called layout drivers. The STD defines 4 basic
61layout types: "files" "objects" and "blocks". For each of these types 61layout types: "files", "objects", "blocks", and "flexfiles". For each
62there is a layout-driver with a common function-vectors table which 62of these types there is a layout-driver with a common function-vectors
63are called by the nfs-client pnfs-core to implement the different layout 63table which are called by the nfs-client pnfs-core to implement the
64types. 64different layout types.
65 65
66Files-layout-driver code is in: fs/nfs/nfs4filelayout.c && nfs4filelayoutdev.c 66Files-layout-driver code is in: fs/nfs/filelayout/.. directory
67Objects-layout-driver code is in: fs/nfs/objlayout/.. directory 67Objects-layout-driver code is in: fs/nfs/objlayout/.. directory
68Blocks-layout-driver code is in: fs/nfs/blocklayout/.. directory 68Blocks-layout-driver code is in: fs/nfs/blocklayout/.. directory
69Flexfiles-layout-driver code is in: fs/nfs/flexfilelayout/.. directory
69 70
70objects-layout setup 71objects-layout setup
71-------------------- 72--------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index cc66549dd761..37c10098a627 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6777,6 +6777,7 @@ F: Documentation/devicetree/bindings/net/nfc/
6777 6777
6778NFS, SUNRPC, AND LOCKD CLIENTS 6778NFS, SUNRPC, AND LOCKD CLIENTS
6779M: Trond Myklebust <trond.myklebust@primarydata.com> 6779M: Trond Myklebust <trond.myklebust@primarydata.com>
6780M: Anna Schumaker <anna.schumaker@netapp.com>
6780L: linux-nfs@vger.kernel.org 6781L: linux-nfs@vger.kernel.org
6781W: http://client.linux-nfs.org 6782W: http://client.linux-nfs.org
6782T: git git://git.linux-nfs.org/projects/trondmy/linux-nfs.git 6783T: git git://git.linux-nfs.org/projects/trondmy/linux-nfs.git
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1cc6ec51e6b1..47a32b6d9b90 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
65 return (struct sockaddr *)&nsm->sm_addr; 65 return (struct sockaddr *)&nsm->sm_addr;
66} 66}
67 67
68static struct rpc_clnt *nsm_create(struct net *net) 68static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
69{ 69{
70 struct sockaddr_in sin = { 70 struct sockaddr_in sin = {
71 .sin_family = AF_INET, 71 .sin_family = AF_INET,
@@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
77 .address = (struct sockaddr *)&sin, 77 .address = (struct sockaddr *)&sin,
78 .addrsize = sizeof(sin), 78 .addrsize = sizeof(sin),
79 .servername = "rpc.statd", 79 .servername = "rpc.statd",
80 .nodename = nodename,
80 .program = &nsm_program, 81 .program = &nsm_program,
81 .version = NSM_VERSION, 82 .version = NSM_VERSION,
82 .authflavor = RPC_AUTH_NULL, 83 .authflavor = RPC_AUTH_NULL,
@@ -102,7 +103,7 @@ out:
102 return clnt; 103 return clnt;
103} 104}
104 105
105static struct rpc_clnt *nsm_client_get(struct net *net) 106static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
106{ 107{
107 struct rpc_clnt *clnt, *new; 108 struct rpc_clnt *clnt, *new;
108 struct lockd_net *ln = net_generic(net, lockd_net_id); 109 struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net)
111 if (clnt != NULL) 112 if (clnt != NULL)
112 goto out; 113 goto out;
113 114
114 clnt = new = nsm_create(net); 115 clnt = new = nsm_create(net, nodename);
115 if (IS_ERR(clnt)) 116 if (IS_ERR(clnt))
116 goto out; 117 goto out;
117 118
@@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host)
190 struct nsm_res res; 191 struct nsm_res res;
191 int status; 192 int status;
192 struct rpc_clnt *clnt; 193 struct rpc_clnt *clnt;
194 const char *nodename = NULL;
193 195
194 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 196 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
195 197
196 if (nsm->sm_monitored) 198 if (nsm->sm_monitored)
197 return 0; 199 return 0;
198 200
201 if (host->h_rpcclnt)
202 nodename = host->h_rpcclnt->cl_nodename;
203
199 /* 204 /*
200 * Choose whether to record the caller_name or IP address of 205 * Choose whether to record the caller_name or IP address of
201 * this peer in the local rpc.statd's database. 206 * this peer in the local rpc.statd's database.
202 */ 207 */
203 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 208 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
204 209
205 clnt = nsm_client_get(host->net); 210 clnt = nsm_client_get(host->net, nodename);
206 if (IS_ERR(clnt)) { 211 if (IS_ERR(clnt)) {
207 status = PTR_ERR(clnt); 212 status = PTR_ERR(clnt);
208 dprintk("lockd: failed to create NSM upcall transport, " 213 dprintk("lockd: failed to create NSM upcall transport, "
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 3dece03f2fc8..c7abc10279af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT
128 depends on NFS_V4_1 && SCSI_OSD_ULD 128 depends on NFS_V4_1 && SCSI_OSD_ULD
129 default NFS_V4 129 default NFS_V4
130 130
131config PNFS_FLEXFILE_LAYOUT
132 tristate
133 depends on NFS_V4_1 && NFS_V3
134 default m
135
131config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN 136config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
132 string "NFSv4.1 Implementation ID Domain" 137 string "NFSv4.1 Implementation ID Domain"
133 depends on NFS_V4_1 138 depends on NFS_V4_1
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 04cb830fa09f..1e987acf20c9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -27,9 +27,10 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
27 dns_resolve.o nfs4trace.o 27 dns_resolve.o nfs4trace.o
28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o 28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o 31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
32 32
33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ 33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ 35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
36obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 77fec6a55f57..1cac3c175d18 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -860,12 +860,14 @@ static const struct nfs_pageio_ops bl_pg_read_ops = {
860 .pg_init = bl_pg_init_read, 860 .pg_init = bl_pg_init_read,
861 .pg_test = bl_pg_test_read, 861 .pg_test = bl_pg_test_read,
862 .pg_doio = pnfs_generic_pg_readpages, 862 .pg_doio = pnfs_generic_pg_readpages,
863 .pg_cleanup = pnfs_generic_pg_cleanup,
863}; 864};
864 865
865static const struct nfs_pageio_ops bl_pg_write_ops = { 866static const struct nfs_pageio_ops bl_pg_write_ops = {
866 .pg_init = bl_pg_init_write, 867 .pg_init = bl_pg_init_write,
867 .pg_test = bl_pg_test_write, 868 .pg_test = bl_pg_test_write,
868 .pg_doio = pnfs_generic_pg_writepages, 869 .pg_doio = pnfs_generic_pg_writepages,
870 .pg_cleanup = pnfs_generic_pg_cleanup,
869}; 871};
870 872
871static struct pnfs_layoutdriver_type blocklayout_type = { 873static struct pnfs_layoutdriver_type blocklayout_type = {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index b8fb3a4ef649..351be9205bf8 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp)
128 if (try_to_freeze()) 128 if (try_to_freeze())
129 continue; 129 continue;
130 130
131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
132 spin_lock_bh(&serv->sv_cb_lock); 132 spin_lock_bh(&serv->sv_cb_lock);
133 if (!list_empty(&serv->sv_cb_list)) { 133 if (!list_empty(&serv->sv_cb_list)) {
134 req = list_first_entry(&serv->sv_cb_list, 134 req = list_first_entry(&serv->sv_cb_list,
135 struct rpc_rqst, rq_bc_list); 135 struct rpc_rqst, rq_bc_list);
136 list_del(&req->rq_bc_list); 136 list_del(&req->rq_bc_list);
137 spin_unlock_bh(&serv->sv_cb_lock); 137 spin_unlock_bh(&serv->sv_cb_lock);
138 finish_wait(&serv->sv_cb_waitq, &wq);
138 dprintk("Invoking bc_svc_process()\n"); 139 dprintk("Invoking bc_svc_process()\n");
139 error = bc_svc_process(serv, req, rqstp); 140 error = bc_svc_process(serv, req, rqstp);
140 dprintk("bc_svc_process() returned w/ error code= %d\n", 141 dprintk("bc_svc_process() returned w/ error code= %d\n",
141 error); 142 error);
142 } else { 143 } else {
143 spin_unlock_bh(&serv->sv_cb_lock); 144 spin_unlock_bh(&serv->sv_cb_lock);
144 schedule(); 145 /* schedule_timeout to game the hung task watchdog */
146 schedule_timeout(60 * HZ);
147 finish_wait(&serv->sv_cb_waitq, &wq);
145 } 148 }
146 finish_wait(&serv->sv_cb_waitq, &wq);
147 } 149 }
148 return 0; 150 return 0;
149} 151}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 8cdb2b28a104..da5433230bb1 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -306,6 +306,17 @@ nfs_inode_detach_delegation(struct inode *inode)
306 return nfs_detach_delegation(nfsi, delegation, server); 306 return nfs_detach_delegation(nfsi, delegation, server);
307} 307}
308 308
309static void
310nfs_update_inplace_delegation(struct nfs_delegation *delegation,
311 const struct nfs_delegation *update)
312{
313 if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
314 delegation->stateid.seqid = update->stateid.seqid;
315 smp_wmb();
316 delegation->type = update->type;
317 }
318}
319
309/** 320/**
310 * nfs_inode_set_delegation - set up a delegation on an inode 321 * nfs_inode_set_delegation - set up a delegation on an inode
311 * @inode: inode to which delegation applies 322 * @inode: inode to which delegation applies
@@ -339,9 +350,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
339 old_delegation = rcu_dereference_protected(nfsi->delegation, 350 old_delegation = rcu_dereference_protected(nfsi->delegation,
340 lockdep_is_held(&clp->cl_lock)); 351 lockdep_is_held(&clp->cl_lock));
341 if (old_delegation != NULL) { 352 if (old_delegation != NULL) {
342 if (nfs4_stateid_match(&delegation->stateid, 353 /* Is this an update of the existing delegation? */
343 &old_delegation->stateid) && 354 if (nfs4_stateid_match_other(&old_delegation->stateid,
344 delegation->type == old_delegation->type) { 355 &delegation->stateid)) {
356 nfs_update_inplace_delegation(old_delegation,
357 delegation);
358 nfsi->delegation_state = old_delegation->type;
345 goto out; 359 goto out;
346 } 360 }
347 /* 361 /*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 294692ff83b1..7077521acdf4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -66,6 +66,10 @@ static struct kmem_cache *nfs_direct_cachep;
66/* 66/*
67 * This represents a set of asynchronous requests that we're waiting on 67 * This represents a set of asynchronous requests that we're waiting on
68 */ 68 */
69struct nfs_direct_mirror {
70 ssize_t count;
71};
72
69struct nfs_direct_req { 73struct nfs_direct_req {
70 struct kref kref; /* release manager */ 74 struct kref kref; /* release manager */
71 75
@@ -78,8 +82,13 @@ struct nfs_direct_req {
78 /* completion state */ 82 /* completion state */
79 atomic_t io_count; /* i/os we're waiting for */ 83 atomic_t io_count; /* i/os we're waiting for */
80 spinlock_t lock; /* protect completion state */ 84 spinlock_t lock; /* protect completion state */
85
86 struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
87 int mirror_count;
88
81 ssize_t count, /* bytes actually processed */ 89 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */ 90 bytes_left, /* bytes left to be sent */
91 io_start, /* start of IO */
83 error; /* any reported error */ 92 error; /* any reported error */
84 struct completion completion; /* wait for i/o completion */ 93 struct completion completion; /* wait for i/o completion */
85 94
@@ -108,26 +117,56 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
108 return atomic_dec_and_test(&dreq->io_count); 117 return atomic_dec_and_test(&dreq->io_count);
109} 118}
110 119
120void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
121{
122 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
123}
124EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
125
126static void
127nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
128{
129 int i;
130 ssize_t count;
131
132 WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
133
134 count = dreq->mirrors[hdr->pgio_mirror_idx].count;
135 if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
136 count = hdr->io_start + hdr->good_bytes - dreq->io_start;
137 dreq->mirrors[hdr->pgio_mirror_idx].count = count;
138 }
139
140 /* update the dreq->count by finding the minimum agreed count from all
141 * mirrors */
142 count = dreq->mirrors[0].count;
143
144 for (i = 1; i < dreq->mirror_count; i++)
145 count = min(count, dreq->mirrors[i].count);
146
147 dreq->count = count;
148}
149
111/* 150/*
112 * nfs_direct_select_verf - select the right verifier 151 * nfs_direct_select_verf - select the right verifier
113 * @dreq - direct request possibly spanning multiple servers 152 * @dreq - direct request possibly spanning multiple servers
114 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs 153 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
115 * @ds_idx - index of data server in data server list, only valid if ds_clp set 154 * @commit_idx - commit bucket index for the DS
116 * 155 *
117 * returns the correct verifier to use given the role of the server 156 * returns the correct verifier to use given the role of the server
118 */ 157 */
119static struct nfs_writeverf * 158static struct nfs_writeverf *
120nfs_direct_select_verf(struct nfs_direct_req *dreq, 159nfs_direct_select_verf(struct nfs_direct_req *dreq,
121 struct nfs_client *ds_clp, 160 struct nfs_client *ds_clp,
122 int ds_idx) 161 int commit_idx)
123{ 162{
124 struct nfs_writeverf *verfp = &dreq->verf; 163 struct nfs_writeverf *verfp = &dreq->verf;
125 164
126#ifdef CONFIG_NFS_V4_1 165#ifdef CONFIG_NFS_V4_1
127 if (ds_clp) { 166 if (ds_clp) {
128 /* pNFS is in use, use the DS verf */ 167 /* pNFS is in use, use the DS verf */
129 if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) 168 if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
130 verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; 169 verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
131 else 170 else
132 WARN_ON_ONCE(1); 171 WARN_ON_ONCE(1);
133 } 172 }
@@ -148,8 +187,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
148{ 187{
149 struct nfs_writeverf *verfp; 188 struct nfs_writeverf *verfp;
150 189
151 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 190 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
152 hdr->ds_idx);
153 WARN_ON_ONCE(verfp->committed >= 0); 191 WARN_ON_ONCE(verfp->committed >= 0);
154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 192 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
155 WARN_ON_ONCE(verfp->committed < 0); 193 WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +207,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
169{ 207{
170 struct nfs_writeverf *verfp; 208 struct nfs_writeverf *verfp;
171 209
172 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 210 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
173 hdr->ds_idx);
174 if (verfp->committed < 0) { 211 if (verfp->committed < 0) {
175 nfs_direct_set_hdr_verf(dreq, hdr); 212 nfs_direct_set_hdr_verf(dreq, hdr);
176 return 0; 213 return 0;
@@ -193,7 +230,11 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
193 230
194 verfp = nfs_direct_select_verf(dreq, data->ds_clp, 231 verfp = nfs_direct_select_verf(dreq, data->ds_clp,
195 data->ds_commit_index); 232 data->ds_commit_index);
196 WARN_ON_ONCE(verfp->committed < 0); 233
234 /* verifier not set so always fail */
235 if (verfp->committed < 0)
236 return 1;
237
197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 238 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
198} 239}
199 240
@@ -249,6 +290,18 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
249 cinfo->completion_ops = &nfs_direct_commit_completion_ops; 290 cinfo->completion_ops = &nfs_direct_commit_completion_ops;
250} 291}
251 292
293static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
294 struct nfs_pageio_descriptor *pgio,
295 struct nfs_page *req)
296{
297 int mirror_count = 1;
298
299 if (pgio->pg_ops->pg_get_mirror_count)
300 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
301
302 dreq->mirror_count = mirror_count;
303}
304
252static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 305static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
253{ 306{
254 struct nfs_direct_req *dreq; 307 struct nfs_direct_req *dreq;
@@ -263,6 +316,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
263 INIT_LIST_HEAD(&dreq->mds_cinfo.list); 316 INIT_LIST_HEAD(&dreq->mds_cinfo.list);
264 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ 317 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
265 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); 318 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
319 dreq->mirror_count = 1;
266 spin_lock_init(&dreq->lock); 320 spin_lock_init(&dreq->lock);
267 321
268 return dreq; 322 return dreq;
@@ -369,7 +423,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
369 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) 423 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
370 dreq->error = hdr->error; 424 dreq->error = hdr->error;
371 else 425 else
372 dreq->count += hdr->good_bytes; 426 nfs_direct_good_bytes(dreq, hdr);
427
373 spin_unlock(&dreq->lock); 428 spin_unlock(&dreq->lock);
374 429
375 while (!list_empty(&hdr->pages)) { 430 while (!list_empty(&hdr->pages)) {
@@ -547,6 +602,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
547 602
548 dreq->inode = inode; 603 dreq->inode = inode;
549 dreq->bytes_left = count; 604 dreq->bytes_left = count;
605 dreq->io_start = pos;
550 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 606 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
551 l_ctx = nfs_get_lock_context(dreq->ctx); 607 l_ctx = nfs_get_lock_context(dreq->ctx);
552 if (IS_ERR(l_ctx)) { 608 if (IS_ERR(l_ctx)) {
@@ -579,6 +635,20 @@ out:
579 return result; 635 return result;
580} 636}
581 637
638static void
639nfs_direct_write_scan_commit_list(struct inode *inode,
640 struct list_head *list,
641 struct nfs_commit_info *cinfo)
642{
643 spin_lock(cinfo->lock);
644#ifdef CONFIG_NFS_V4_1
645 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
646 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
647#endif
648 nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
649 spin_unlock(cinfo->lock);
650}
651
582static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 652static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
583{ 653{
584 struct nfs_pageio_descriptor desc; 654 struct nfs_pageio_descriptor desc;
@@ -586,20 +656,23 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
586 LIST_HEAD(reqs); 656 LIST_HEAD(reqs);
587 struct nfs_commit_info cinfo; 657 struct nfs_commit_info cinfo;
588 LIST_HEAD(failed); 658 LIST_HEAD(failed);
659 int i;
589 660
590 nfs_init_cinfo_from_dreq(&cinfo, dreq); 661 nfs_init_cinfo_from_dreq(&cinfo, dreq);
591 pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); 662 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
592 spin_lock(cinfo.lock);
593 nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
594 spin_unlock(cinfo.lock);
595 663
596 dreq->count = 0; 664 dreq->count = 0;
665 for (i = 0; i < dreq->mirror_count; i++)
666 dreq->mirrors[i].count = 0;
597 get_dreq(dreq); 667 get_dreq(dreq);
598 668
599 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, 669 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
600 &nfs_direct_write_completion_ops); 670 &nfs_direct_write_completion_ops);
601 desc.pg_dreq = dreq; 671 desc.pg_dreq = dreq;
602 672
673 req = nfs_list_entry(reqs.next);
674 nfs_direct_setup_mirroring(dreq, &desc, req);
675
603 list_for_each_entry_safe(req, tmp, &reqs, wb_list) { 676 list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
604 if (!nfs_pageio_add_request(&desc, req)) { 677 if (!nfs_pageio_add_request(&desc, req)) {
605 nfs_list_remove_request(req); 678 nfs_list_remove_request(req);
@@ -646,7 +719,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
646 nfs_list_remove_request(req); 719 nfs_list_remove_request(req);
647 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { 720 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
648 /* Note the rewrite will go through mds */ 721 /* Note the rewrite will go through mds */
649 nfs_mark_request_commit(req, NULL, &cinfo); 722 nfs_mark_request_commit(req, NULL, &cinfo, 0);
650 } else 723 } else
651 nfs_release_request(req); 724 nfs_release_request(req);
652 nfs_unlock_and_release_request(req); 725 nfs_unlock_and_release_request(req);
@@ -721,7 +794,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
721 dreq->error = hdr->error; 794 dreq->error = hdr->error;
722 } 795 }
723 if (dreq->error == 0) { 796 if (dreq->error == 0) {
724 dreq->count += hdr->good_bytes; 797 nfs_direct_good_bytes(dreq, hdr);
725 if (nfs_write_need_commit(hdr)) { 798 if (nfs_write_need_commit(hdr)) {
726 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) 799 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
727 request_commit = true; 800 request_commit = true;
@@ -745,7 +818,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
745 nfs_list_remove_request(req); 818 nfs_list_remove_request(req);
746 if (request_commit) { 819 if (request_commit) {
747 kref_get(&req->wb_kref); 820 kref_get(&req->wb_kref);
748 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 821 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
822 hdr->ds_commit_idx);
749 } 823 }
750 nfs_unlock_and_release_request(req); 824 nfs_unlock_and_release_request(req);
751 } 825 }
@@ -826,6 +900,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
826 result = PTR_ERR(req); 900 result = PTR_ERR(req);
827 break; 901 break;
828 } 902 }
903
904 nfs_direct_setup_mirroring(dreq, &desc, req);
905
829 nfs_lock_request(req); 906 nfs_lock_request(req);
830 req->wb_index = pos >> PAGE_SHIFT; 907 req->wb_index = pos >> PAGE_SHIFT;
831 req->wb_offset = pos & ~PAGE_MASK; 908 req->wb_offset = pos & ~PAGE_MASK;
@@ -934,6 +1011,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
934 1011
935 dreq->inode = inode; 1012 dreq->inode = inode;
936 dreq->bytes_left = count; 1013 dreq->bytes_left = count;
1014 dreq->io_start = pos;
937 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1015 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
938 l_ctx = nfs_get_lock_context(dreq->ctx); 1016 l_ctx = nfs_get_lock_context(dreq->ctx);
939 if (IS_ERR(l_ctx)) { 1017 if (IS_ERR(l_ctx)) {
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7afb52f6a25a..3c9769441f36 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -118,13 +118,6 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
118 } 118 }
119} 119}
120 120
121static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
122{
123 if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
124 return;
125 pnfs_return_layout(inode);
126}
127
128static int filelayout_async_handle_error(struct rpc_task *task, 121static int filelayout_async_handle_error(struct rpc_task *task,
129 struct nfs4_state *state, 122 struct nfs4_state *state,
130 struct nfs_client *clp, 123 struct nfs_client *clp,
@@ -207,7 +200,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
207 dprintk("%s DS connection error %d\n", __func__, 200 dprintk("%s DS connection error %d\n", __func__,
208 task->tk_status); 201 task->tk_status);
209 nfs4_mark_deviceid_unavailable(devid); 202 nfs4_mark_deviceid_unavailable(devid);
210 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 203 pnfs_error_mark_layout_for_return(inode, lseg);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 204 rpc_wake_up(&tbl->slot_tbl_waitq);
212 /* fall through */ 205 /* fall through */
213 default: 206 default:
@@ -339,16 +332,6 @@ static void filelayout_read_count_stats(struct rpc_task *task, void *data)
339 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 332 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
340} 333}
341 334
342static void filelayout_read_release(void *data)
343{
344 struct nfs_pgio_header *hdr = data;
345 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
346
347 filelayout_fenceme(lo->plh_inode, lo);
348 nfs_put_client(hdr->ds_clp);
349 hdr->mds_ops->rpc_release(data);
350}
351
352static int filelayout_write_done_cb(struct rpc_task *task, 335static int filelayout_write_done_cb(struct rpc_task *task,
353 struct nfs_pgio_header *hdr) 336 struct nfs_pgio_header *hdr)
354{ 337{
@@ -371,17 +354,6 @@ static int filelayout_write_done_cb(struct rpc_task *task,
371 return 0; 354 return 0;
372} 355}
373 356
374/* Fake up some data that will cause nfs_commit_release to retry the writes. */
375static void prepare_to_resend_writes(struct nfs_commit_data *data)
376{
377 struct nfs_page *first = nfs_list_entry(data->pages.next);
378
379 data->task.tk_status = 0;
380 memcpy(&data->verf.verifier, &first->wb_verf,
381 sizeof(data->verf.verifier));
382 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
383}
384
385static int filelayout_commit_done_cb(struct rpc_task *task, 357static int filelayout_commit_done_cb(struct rpc_task *task,
386 struct nfs_commit_data *data) 358 struct nfs_commit_data *data)
387{ 359{
@@ -393,7 +365,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
393 365
394 switch (err) { 366 switch (err) {
395 case -NFS4ERR_RESET_TO_MDS: 367 case -NFS4ERR_RESET_TO_MDS:
396 prepare_to_resend_writes(data); 368 pnfs_generic_prepare_to_resend_writes(data);
397 return -EAGAIN; 369 return -EAGAIN;
398 case -EAGAIN: 370 case -EAGAIN:
399 rpc_restart_call_prepare(task); 371 rpc_restart_call_prepare(task);
@@ -451,16 +423,6 @@ static void filelayout_write_count_stats(struct rpc_task *task, void *data)
451 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 423 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
452} 424}
453 425
454static void filelayout_write_release(void *data)
455{
456 struct nfs_pgio_header *hdr = data;
457 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
458
459 filelayout_fenceme(lo->plh_inode, lo);
460 nfs_put_client(hdr->ds_clp);
461 hdr->mds_ops->rpc_release(data);
462}
463
464static void filelayout_commit_prepare(struct rpc_task *task, void *data) 426static void filelayout_commit_prepare(struct rpc_task *task, void *data)
465{ 427{
466 struct nfs_commit_data *wdata = data; 428 struct nfs_commit_data *wdata = data;
@@ -471,14 +433,6 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
471 task); 433 task);
472} 434}
473 435
474static void filelayout_write_commit_done(struct rpc_task *task, void *data)
475{
476 struct nfs_commit_data *wdata = data;
477
478 /* Note this may cause RPC to be resent */
479 wdata->mds_ops->rpc_call_done(task, data);
480}
481
482static void filelayout_commit_count_stats(struct rpc_task *task, void *data) 436static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
483{ 437{
484 struct nfs_commit_data *cdata = data; 438 struct nfs_commit_data *cdata = data;
@@ -486,35 +440,25 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
486 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); 440 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
487} 441}
488 442
489static void filelayout_commit_release(void *calldata)
490{
491 struct nfs_commit_data *data = calldata;
492
493 data->completion_ops->completion(data);
494 pnfs_put_lseg(data->lseg);
495 nfs_put_client(data->ds_clp);
496 nfs_commitdata_release(data);
497}
498
499static const struct rpc_call_ops filelayout_read_call_ops = { 443static const struct rpc_call_ops filelayout_read_call_ops = {
500 .rpc_call_prepare = filelayout_read_prepare, 444 .rpc_call_prepare = filelayout_read_prepare,
501 .rpc_call_done = filelayout_read_call_done, 445 .rpc_call_done = filelayout_read_call_done,
502 .rpc_count_stats = filelayout_read_count_stats, 446 .rpc_count_stats = filelayout_read_count_stats,
503 .rpc_release = filelayout_read_release, 447 .rpc_release = pnfs_generic_rw_release,
504}; 448};
505 449
506static const struct rpc_call_ops filelayout_write_call_ops = { 450static const struct rpc_call_ops filelayout_write_call_ops = {
507 .rpc_call_prepare = filelayout_write_prepare, 451 .rpc_call_prepare = filelayout_write_prepare,
508 .rpc_call_done = filelayout_write_call_done, 452 .rpc_call_done = filelayout_write_call_done,
509 .rpc_count_stats = filelayout_write_count_stats, 453 .rpc_count_stats = filelayout_write_count_stats,
510 .rpc_release = filelayout_write_release, 454 .rpc_release = pnfs_generic_rw_release,
511}; 455};
512 456
513static const struct rpc_call_ops filelayout_commit_call_ops = { 457static const struct rpc_call_ops filelayout_commit_call_ops = {
514 .rpc_call_prepare = filelayout_commit_prepare, 458 .rpc_call_prepare = filelayout_commit_prepare,
515 .rpc_call_done = filelayout_write_commit_done, 459 .rpc_call_done = pnfs_generic_write_commit_done,
516 .rpc_count_stats = filelayout_commit_count_stats, 460 .rpc_count_stats = filelayout_commit_count_stats,
517 .rpc_release = filelayout_commit_release, 461 .rpc_release = pnfs_generic_commit_release,
518}; 462};
519 463
520static enum pnfs_try_status 464static enum pnfs_try_status
@@ -548,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
548 /* No multipath support. Use first DS */ 492 /* No multipath support. Use first DS */
549 atomic_inc(&ds->ds_clp->cl_count); 493 atomic_inc(&ds->ds_clp->cl_count);
550 hdr->ds_clp = ds->ds_clp; 494 hdr->ds_clp = ds->ds_clp;
551 hdr->ds_idx = idx; 495 hdr->ds_commit_idx = idx;
552 fh = nfs4_fl_select_ds_fh(lseg, j); 496 fh = nfs4_fl_select_ds_fh(lseg, j);
553 if (fh) 497 if (fh)
554 hdr->args.fh = fh; 498 hdr->args.fh = fh;
@@ -557,8 +501,9 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
557 hdr->mds_offset = offset; 501 hdr->mds_offset = offset;
558 502
559 /* Perform an asynchronous read to ds */ 503 /* Perform an asynchronous read to ds */
560 nfs_initiate_pgio(ds_clnt, hdr, 504 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
561 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 505 NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
506 0, RPC_TASK_SOFTCONN);
562 return PNFS_ATTEMPTED; 507 return PNFS_ATTEMPTED;
563} 508}
564 509
@@ -591,16 +536,16 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
591 hdr->pgio_done_cb = filelayout_write_done_cb; 536 hdr->pgio_done_cb = filelayout_write_done_cb;
592 atomic_inc(&ds->ds_clp->cl_count); 537 atomic_inc(&ds->ds_clp->cl_count);
593 hdr->ds_clp = ds->ds_clp; 538 hdr->ds_clp = ds->ds_clp;
594 hdr->ds_idx = idx; 539 hdr->ds_commit_idx = idx;
595 fh = nfs4_fl_select_ds_fh(lseg, j); 540 fh = nfs4_fl_select_ds_fh(lseg, j);
596 if (fh) 541 if (fh)
597 hdr->args.fh = fh; 542 hdr->args.fh = fh;
598 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); 543 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
599 544
600 /* Perform an asynchronous write */ 545 /* Perform an asynchronous write */
601 nfs_initiate_pgio(ds_clnt, hdr, 546 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
602 &filelayout_write_call_ops, sync, 547 NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
603 RPC_TASK_SOFTCONN); 548 sync, RPC_TASK_SOFTCONN);
604 return PNFS_ATTEMPTED; 549 return PNFS_ATTEMPTED;
605} 550}
606 551
@@ -988,12 +933,14 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = {
988 .pg_init = filelayout_pg_init_read, 933 .pg_init = filelayout_pg_init_read,
989 .pg_test = filelayout_pg_test, 934 .pg_test = filelayout_pg_test,
990 .pg_doio = pnfs_generic_pg_readpages, 935 .pg_doio = pnfs_generic_pg_readpages,
936 .pg_cleanup = pnfs_generic_pg_cleanup,
991}; 937};
992 938
993static const struct nfs_pageio_ops filelayout_pg_write_ops = { 939static const struct nfs_pageio_ops filelayout_pg_write_ops = {
994 .pg_init = filelayout_pg_init_write, 940 .pg_init = filelayout_pg_init_write,
995 .pg_test = filelayout_pg_test, 941 .pg_test = filelayout_pg_test,
996 .pg_doio = pnfs_generic_pg_writepages, 942 .pg_doio = pnfs_generic_pg_writepages,
943 .pg_cleanup = pnfs_generic_pg_cleanup,
997}; 944};
998 945
999static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) 946static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
@@ -1004,37 +951,11 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1004 return j; 951 return j;
1005} 952}
1006 953
1007/* The generic layer is about to remove the req from the commit list.
1008 * If this will make the bucket empty, it will need to put the lseg reference.
1009 * Note this is must be called holding the inode (/cinfo) lock
1010 */
1011static void
1012filelayout_clear_request_commit(struct nfs_page *req,
1013 struct nfs_commit_info *cinfo)
1014{
1015 struct pnfs_layout_segment *freeme = NULL;
1016
1017 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1018 goto out;
1019 cinfo->ds->nwritten--;
1020 if (list_is_singular(&req->wb_list)) {
1021 struct pnfs_commit_bucket *bucket;
1022
1023 bucket = list_first_entry(&req->wb_list,
1024 struct pnfs_commit_bucket,
1025 written);
1026 freeme = bucket->wlseg;
1027 bucket->wlseg = NULL;
1028 }
1029out:
1030 nfs_request_remove_commit_list(req, cinfo);
1031 pnfs_put_lseg_locked(freeme);
1032}
1033
1034static void 954static void
1035filelayout_mark_request_commit(struct nfs_page *req, 955filelayout_mark_request_commit(struct nfs_page *req,
1036 struct pnfs_layout_segment *lseg, 956 struct pnfs_layout_segment *lseg,
1037 struct nfs_commit_info *cinfo) 957 struct nfs_commit_info *cinfo,
958 u32 ds_commit_idx)
1038 959
1039{ 960{
1040 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 961 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
@@ -1064,7 +985,7 @@ filelayout_mark_request_commit(struct nfs_page *req,
1064 * is normally transferred to the COMMIT call and released 985 * is normally transferred to the COMMIT call and released
1065 * there. It could also be released if the last req is pulled 986 * there. It could also be released if the last req is pulled
1066 * off due to a rewrite, in which case it will be done in 987 * off due to a rewrite, in which case it will be done in
1067 * filelayout_clear_request_commit 988 * pnfs_generic_clear_request_commit
1068 */ 989 */
1069 buckets[i].wlseg = pnfs_get_lseg(lseg); 990 buckets[i].wlseg = pnfs_get_lseg(lseg);
1070 } 991 }
@@ -1138,101 +1059,15 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1138 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); 1059 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1139 if (fh) 1060 if (fh)
1140 data->args.fh = fh; 1061 data->args.fh = fh;
1141 return nfs_initiate_commit(ds_clnt, data, 1062 return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
1142 &filelayout_commit_call_ops, how, 1063 &filelayout_commit_call_ops, how,
1143 RPC_TASK_SOFTCONN); 1064 RPC_TASK_SOFTCONN);
1144out_err: 1065out_err:
1145 prepare_to_resend_writes(data); 1066 pnfs_generic_prepare_to_resend_writes(data);
1146 filelayout_commit_release(data); 1067 pnfs_generic_commit_release(data);
1147 return -EAGAIN; 1068 return -EAGAIN;
1148} 1069}
1149 1070
1150static int
1151transfer_commit_list(struct list_head *src, struct list_head *dst,
1152 struct nfs_commit_info *cinfo, int max)
1153{
1154 struct nfs_page *req, *tmp;
1155 int ret = 0;
1156
1157 list_for_each_entry_safe(req, tmp, src, wb_list) {
1158 if (!nfs_lock_request(req))
1159 continue;
1160 kref_get(&req->wb_kref);
1161 if (cond_resched_lock(cinfo->lock))
1162 list_safe_reset_next(req, tmp, wb_list);
1163 nfs_request_remove_commit_list(req, cinfo);
1164 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1165 nfs_list_add_request(req, dst);
1166 ret++;
1167 if ((ret == max) && !cinfo->dreq)
1168 break;
1169 }
1170 return ret;
1171}
1172
1173/* Note called with cinfo->lock held. */
1174static int
1175filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1176 struct nfs_commit_info *cinfo,
1177 int max)
1178{
1179 struct list_head *src = &bucket->written;
1180 struct list_head *dst = &bucket->committing;
1181 int ret;
1182
1183 ret = transfer_commit_list(src, dst, cinfo, max);
1184 if (ret) {
1185 cinfo->ds->nwritten -= ret;
1186 cinfo->ds->ncommitting += ret;
1187 bucket->clseg = bucket->wlseg;
1188 if (list_empty(src))
1189 bucket->wlseg = NULL;
1190 else
1191 pnfs_get_lseg(bucket->clseg);
1192 }
1193 return ret;
1194}
1195
1196/* Move reqs from written to committing lists, returning count of number moved.
1197 * Note called with cinfo->lock held.
1198 */
1199static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
1200 int max)
1201{
1202 int i, rv = 0, cnt;
1203
1204 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
1205 cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
1206 cinfo, max);
1207 max -= cnt;
1208 rv += cnt;
1209 }
1210 return rv;
1211}
1212
1213/* Pull everything off the committing lists and dump into @dst */
1214static void filelayout_recover_commit_reqs(struct list_head *dst,
1215 struct nfs_commit_info *cinfo)
1216{
1217 struct pnfs_commit_bucket *b;
1218 struct pnfs_layout_segment *freeme;
1219 int i;
1220
1221restart:
1222 spin_lock(cinfo->lock);
1223 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1224 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1225 freeme = b->wlseg;
1226 b->wlseg = NULL;
1227 spin_unlock(cinfo->lock);
1228 pnfs_put_lseg(freeme);
1229 goto restart;
1230 }
1231 }
1232 cinfo->ds->nwritten = 0;
1233 spin_unlock(cinfo->lock);
1234}
1235
1236/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest 1071/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
1237 * for @page 1072 * for @page
1238 * @cinfo - commit info for current inode 1073 * @cinfo - commit info for current inode
@@ -1263,108 +1098,14 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1263 return NULL; 1098 return NULL;
1264} 1099}
1265 1100
1266static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1267{
1268 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1269 struct pnfs_commit_bucket *bucket;
1270 struct pnfs_layout_segment *freeme;
1271 int i;
1272
1273 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1274 bucket = &fl_cinfo->buckets[i];
1275 if (list_empty(&bucket->committing))
1276 continue;
1277 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1278 spin_lock(cinfo->lock);
1279 freeme = bucket->clseg;
1280 bucket->clseg = NULL;
1281 spin_unlock(cinfo->lock);
1282 pnfs_put_lseg(freeme);
1283 }
1284}
1285
1286static unsigned int
1287alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1288{
1289 struct pnfs_ds_commit_info *fl_cinfo;
1290 struct pnfs_commit_bucket *bucket;
1291 struct nfs_commit_data *data;
1292 int i;
1293 unsigned int nreq = 0;
1294
1295 fl_cinfo = cinfo->ds;
1296 bucket = fl_cinfo->buckets;
1297 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
1298 if (list_empty(&bucket->committing))
1299 continue;
1300 data = nfs_commitdata_alloc();
1301 if (!data)
1302 break;
1303 data->ds_commit_index = i;
1304 spin_lock(cinfo->lock);
1305 data->lseg = bucket->clseg;
1306 bucket->clseg = NULL;
1307 spin_unlock(cinfo->lock);
1308 list_add(&data->pages, list);
1309 nreq++;
1310 }
1311
1312 /* Clean up on error */
1313 filelayout_retry_commit(cinfo, i);
1314 /* Caller will clean up entries put on list */
1315 return nreq;
1316}
1317
1318/* This follows nfs_commit_list pretty closely */
1319static int 1101static int
1320filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, 1102filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1321 int how, struct nfs_commit_info *cinfo) 1103 int how, struct nfs_commit_info *cinfo)
1322{ 1104{
1323 struct nfs_commit_data *data, *tmp; 1105 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1324 LIST_HEAD(list); 1106 filelayout_initiate_commit);
1325 unsigned int nreq = 0;
1326
1327 if (!list_empty(mds_pages)) {
1328 data = nfs_commitdata_alloc();
1329 if (data != NULL) {
1330 data->lseg = NULL;
1331 list_add(&data->pages, &list);
1332 nreq++;
1333 } else {
1334 nfs_retry_commit(mds_pages, NULL, cinfo);
1335 filelayout_retry_commit(cinfo, 0);
1336 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1337 return -ENOMEM;
1338 }
1339 }
1340
1341 nreq += alloc_ds_commits(cinfo, &list);
1342
1343 if (nreq == 0) {
1344 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1345 goto out;
1346 }
1347
1348 atomic_add(nreq, &cinfo->mds->rpcs_out);
1349
1350 list_for_each_entry_safe(data, tmp, &list, pages) {
1351 list_del_init(&data->pages);
1352 if (!data->lseg) {
1353 nfs_init_commit(data, mds_pages, NULL, cinfo);
1354 nfs_initiate_commit(NFS_CLIENT(inode), data,
1355 data->mds_ops, how, 0);
1356 } else {
1357 struct pnfs_commit_bucket *buckets;
1358
1359 buckets = cinfo->ds->buckets;
1360 nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
1361 filelayout_initiate_commit(data, how);
1362 }
1363 }
1364out:
1365 cinfo->ds->ncommitting = 0;
1366 return PNFS_ATTEMPTED;
1367} 1107}
1108
1368static struct nfs4_deviceid_node * 1109static struct nfs4_deviceid_node *
1369filelayout_alloc_deviceid_node(struct nfs_server *server, 1110filelayout_alloc_deviceid_node(struct nfs_server *server,
1370 struct pnfs_device *pdev, gfp_t gfp_flags) 1111 struct pnfs_device *pdev, gfp_t gfp_flags)
@@ -1421,9 +1162,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1421 .pg_write_ops = &filelayout_pg_write_ops, 1162 .pg_write_ops = &filelayout_pg_write_ops,
1422 .get_ds_info = &filelayout_get_ds_info, 1163 .get_ds_info = &filelayout_get_ds_info,
1423 .mark_request_commit = filelayout_mark_request_commit, 1164 .mark_request_commit = filelayout_mark_request_commit,
1424 .clear_request_commit = filelayout_clear_request_commit, 1165 .clear_request_commit = pnfs_generic_clear_request_commit,
1425 .scan_commit_lists = filelayout_scan_commit_lists, 1166 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1426 .recover_commit_reqs = filelayout_recover_commit_reqs, 1167 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1427 .search_commit_reqs = filelayout_search_commit_reqs, 1168 .search_commit_reqs = filelayout_search_commit_reqs,
1428 .commit_pagelist = filelayout_commit_pagelist, 1169 .commit_pagelist = filelayout_commit_pagelist,
1429 .read_pagelist = filelayout_read_pagelist, 1170 .read_pagelist = filelayout_read_pagelist,
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index 7c9f800c49d7..2896cb833a11 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -33,13 +33,6 @@
33#include "../pnfs.h" 33#include "../pnfs.h"
34 34
35/* 35/*
36 * Default data server connection timeout and retrans vaules.
37 * Set by module paramters dataserver_timeo and dataserver_retrans.
38 */
39#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
40#define NFS4_DEF_DS_RETRANS 5
41
42/*
43 * Field testing shows we need to support up to 4096 stripe indices. 36 * Field testing shows we need to support up to 4096 stripe indices.
44 * We store each index as a u8 (u32 on the wire) to keep the memory footprint 37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
45 * reasonable. This in turn means we support a maximum of 256 38 * reasonable. This in turn means we support a maximum of 256
@@ -48,32 +41,11 @@
48#define NFS4_PNFS_MAX_STRIPE_CNT 4096 41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
49#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ 42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
50 43
51/* error codes for internal use */
52#define NFS4ERR_RESET_TO_MDS 12001
53
54enum stripetype4 { 44enum stripetype4 {
55 STRIPE_SPARSE = 1, 45 STRIPE_SPARSE = 1,
56 STRIPE_DENSE = 2 46 STRIPE_DENSE = 2
57}; 47};
58 48
59/* Individual ip address */
60struct nfs4_pnfs_ds_addr {
61 struct sockaddr_storage da_addr;
62 size_t da_addrlen;
63 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
64 char *da_remotestr; /* human readable addr+port */
65};
66
67struct nfs4_pnfs_ds {
68 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
69 char *ds_remotestr; /* comma sep list of addrs */
70 struct list_head ds_addrs;
71 struct nfs_client *ds_clp;
72 atomic_t ds_count;
73 unsigned long ds_state;
74#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
75};
76
77struct nfs4_file_layout_dsaddr { 49struct nfs4_file_layout_dsaddr {
78 struct nfs4_deviceid_node id_node; 50 struct nfs4_deviceid_node id_node;
79 u32 stripe_count; 51 u32 stripe_count;
@@ -119,17 +91,6 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
119 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; 91 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
120} 92}
121 93
122static inline void
123filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
124{
125 u32 *p = (u32 *)&node->deviceid;
126
127 printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
128 p[0], p[1], p[2], p[3]);
129
130 set_bit(NFS_DEVICEID_INVALID, &node->flags);
131}
132
133static inline bool 94static inline bool
134filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 95filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
135{ 96{
@@ -142,7 +103,6 @@ filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
142extern struct nfs_fh * 103extern struct nfs_fh *
143nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 104nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
144 105
145extern void print_ds(struct nfs4_pnfs_ds *ds);
146u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 106u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 107u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 108struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index bfecac781f19..4f372e224603 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -31,7 +31,6 @@
31#include <linux/nfs_fs.h> 31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/sunrpc/addr.h>
35 34
36#include "../internal.h" 35#include "../internal.h"
37#include "../nfs4session.h" 36#include "../nfs4session.h"
@@ -42,183 +41,6 @@
42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; 41static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; 42static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
44 43
45/*
46 * Data server cache
47 *
48 * Data servers can be mapped to different device ids.
49 * nfs4_pnfs_ds reference counting
50 * - set to 1 on allocation
51 * - incremented when a device id maps a data server already in the cache.
52 * - decremented when deviceid is removed from the cache.
53 */
54static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55static LIST_HEAD(nfs4_data_server_cache);
56
57/* Debug routines */
58void
59print_ds(struct nfs4_pnfs_ds *ds)
60{
61 if (ds == NULL) {
62 printk("%s NULL device\n", __func__);
63 return;
64 }
65 printk(" ds %s\n"
66 " ref count %d\n"
67 " client %p\n"
68 " cl_exchange_flags %x\n",
69 ds->ds_remotestr,
70 atomic_read(&ds->ds_count), ds->ds_clp,
71 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
72}
73
74static bool
75same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76{
77 struct sockaddr_in *a, *b;
78 struct sockaddr_in6 *a6, *b6;
79
80 if (addr1->sa_family != addr2->sa_family)
81 return false;
82
83 switch (addr1->sa_family) {
84 case AF_INET:
85 a = (struct sockaddr_in *)addr1;
86 b = (struct sockaddr_in *)addr2;
87
88 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
89 a->sin_port == b->sin_port)
90 return true;
91 break;
92
93 case AF_INET6:
94 a6 = (struct sockaddr_in6 *)addr1;
95 b6 = (struct sockaddr_in6 *)addr2;
96
97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false;
102
103 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
104 a6->sin6_port == b6->sin6_port)
105 return true;
106 break;
107
108 default:
109 dprintk("%s: unhandled address family: %u\n",
110 __func__, addr1->sa_family);
111 return false;
112 }
113
114 return false;
115}
116
117static bool
118_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
119 const struct list_head *dsaddrs2)
120{
121 struct nfs4_pnfs_ds_addr *da1, *da2;
122
123 /* step through both lists, comparing as we go */
124 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
125 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
126 da1 != NULL && da2 != NULL;
127 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
128 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
129 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
130 (struct sockaddr *)&da2->da_addr))
131 return false;
132 }
133 if (da1 == NULL && da2 == NULL)
134 return true;
135
136 return false;
137}
138
139/*
140 * Lookup DS by addresses. nfs4_ds_cache_lock is held
141 */
142static struct nfs4_pnfs_ds *
143_data_server_lookup_locked(const struct list_head *dsaddrs)
144{
145 struct nfs4_pnfs_ds *ds;
146
147 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
148 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
149 return ds;
150 return NULL;
151}
152
153/*
154 * Create an rpc connection to the nfs4_pnfs_ds data server
155 * Currently only supports IPv4 and IPv6 addresses
156 */
157static int
158nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
159{
160 struct nfs_client *clp = ERR_PTR(-EIO);
161 struct nfs4_pnfs_ds_addr *da;
162 int status = 0;
163
164 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
165 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
166
167 list_for_each_entry(da, &ds->ds_addrs, da_node) {
168 dprintk("%s: DS %s: trying address %s\n",
169 __func__, ds->ds_remotestr, da->da_remotestr);
170
171 clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 (struct sockaddr *)&da->da_addr,
173 da->da_addrlen, IPPROTO_TCP,
174 dataserver_timeo, dataserver_retrans);
175 if (!IS_ERR(clp))
176 break;
177 }
178
179 if (IS_ERR(clp)) {
180 status = PTR_ERR(clp);
181 goto out;
182 }
183
184 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
185 if (status)
186 goto out_put;
187
188 smp_wmb();
189 ds->ds_clp = clp;
190 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
191out:
192 return status;
193out_put:
194 nfs_put_client(clp);
195 goto out;
196}
197
198static void
199destroy_ds(struct nfs4_pnfs_ds *ds)
200{
201 struct nfs4_pnfs_ds_addr *da;
202
203 dprintk("--> %s\n", __func__);
204 ifdebug(FACILITY)
205 print_ds(ds);
206
207 nfs_put_client(ds->ds_clp);
208
209 while (!list_empty(&ds->ds_addrs)) {
210 da = list_first_entry(&ds->ds_addrs,
211 struct nfs4_pnfs_ds_addr,
212 da_node);
213 list_del_init(&da->da_node);
214 kfree(da->da_remotestr);
215 kfree(da);
216 }
217
218 kfree(ds->ds_remotestr);
219 kfree(ds);
220}
221
222void 44void
223nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 45nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
224{ 46{
@@ -229,259 +51,13 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
229 51
230 for (i = 0; i < dsaddr->ds_num; i++) { 52 for (i = 0; i < dsaddr->ds_num; i++) {
231 ds = dsaddr->ds_list[i]; 53 ds = dsaddr->ds_list[i];
232 if (ds != NULL) { 54 if (ds != NULL)
233 if (atomic_dec_and_lock(&ds->ds_count, 55 nfs4_pnfs_ds_put(ds);
234 &nfs4_ds_cache_lock)) {
235 list_del_init(&ds->ds_node);
236 spin_unlock(&nfs4_ds_cache_lock);
237 destroy_ds(ds);
238 }
239 }
240 } 56 }
241 kfree(dsaddr->stripe_indices); 57 kfree(dsaddr->stripe_indices);
242 kfree(dsaddr); 58 kfree(dsaddr);
243} 59}
244 60
245/*
246 * Create a string with a human readable address and port to avoid
247 * complicated setup around many dprinks.
248 */
249static char *
250nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
251{
252 struct nfs4_pnfs_ds_addr *da;
253 char *remotestr;
254 size_t len;
255 char *p;
256
257 len = 3; /* '{', '}' and eol */
258 list_for_each_entry(da, dsaddrs, da_node) {
259 len += strlen(da->da_remotestr) + 1; /* string plus comma */
260 }
261
262 remotestr = kzalloc(len, gfp_flags);
263 if (!remotestr)
264 return NULL;
265
266 p = remotestr;
267 *(p++) = '{';
268 len--;
269 list_for_each_entry(da, dsaddrs, da_node) {
270 size_t ll = strlen(da->da_remotestr);
271
272 if (ll > len)
273 goto out_err;
274
275 memcpy(p, da->da_remotestr, ll);
276 p += ll;
277 len -= ll;
278
279 if (len < 1)
280 goto out_err;
281 (*p++) = ',';
282 len--;
283 }
284 if (len < 2)
285 goto out_err;
286 *(p++) = '}';
287 *p = '\0';
288 return remotestr;
289out_err:
290 kfree(remotestr);
291 return NULL;
292}
293
294static struct nfs4_pnfs_ds *
295nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
296{
297 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
298 char *remotestr;
299
300 if (list_empty(dsaddrs)) {
301 dprintk("%s: no addresses defined\n", __func__);
302 goto out;
303 }
304
305 ds = kzalloc(sizeof(*ds), gfp_flags);
306 if (!ds)
307 goto out;
308
309 /* this is only used for debugging, so it's ok if its NULL */
310 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
311
312 spin_lock(&nfs4_ds_cache_lock);
313 tmp_ds = _data_server_lookup_locked(dsaddrs);
314 if (tmp_ds == NULL) {
315 INIT_LIST_HEAD(&ds->ds_addrs);
316 list_splice_init(dsaddrs, &ds->ds_addrs);
317 ds->ds_remotestr = remotestr;
318 atomic_set(&ds->ds_count, 1);
319 INIT_LIST_HEAD(&ds->ds_node);
320 ds->ds_clp = NULL;
321 list_add(&ds->ds_node, &nfs4_data_server_cache);
322 dprintk("%s add new data server %s\n", __func__,
323 ds->ds_remotestr);
324 } else {
325 kfree(remotestr);
326 kfree(ds);
327 atomic_inc(&tmp_ds->ds_count);
328 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
329 __func__, tmp_ds->ds_remotestr,
330 atomic_read(&tmp_ds->ds_count));
331 ds = tmp_ds;
332 }
333 spin_unlock(&nfs4_ds_cache_lock);
334out:
335 return ds;
336}
337
338/*
339 * Currently only supports ipv4, ipv6 and one multi-path address.
340 */
341static struct nfs4_pnfs_ds_addr *
342decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
343{
344 struct nfs4_pnfs_ds_addr *da = NULL;
345 char *buf, *portstr;
346 __be16 port;
347 int nlen, rlen;
348 int tmp[2];
349 __be32 *p;
350 char *netid, *match_netid;
351 size_t len, match_netid_len;
352 char *startsep = "";
353 char *endsep = "";
354
355
356 /* r_netid */
357 p = xdr_inline_decode(streamp, 4);
358 if (unlikely(!p))
359 goto out_err;
360 nlen = be32_to_cpup(p++);
361
362 p = xdr_inline_decode(streamp, nlen);
363 if (unlikely(!p))
364 goto out_err;
365
366 netid = kmalloc(nlen+1, gfp_flags);
367 if (unlikely(!netid))
368 goto out_err;
369
370 netid[nlen] = '\0';
371 memcpy(netid, p, nlen);
372
373 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
374 p = xdr_inline_decode(streamp, 4);
375 if (unlikely(!p))
376 goto out_free_netid;
377 rlen = be32_to_cpup(p);
378
379 p = xdr_inline_decode(streamp, rlen);
380 if (unlikely(!p))
381 goto out_free_netid;
382
383 /* port is ".ABC.DEF", 8 chars max */
384 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
385 dprintk("%s: Invalid address, length %d\n", __func__,
386 rlen);
387 goto out_free_netid;
388 }
389 buf = kmalloc(rlen + 1, gfp_flags);
390 if (!buf) {
391 dprintk("%s: Not enough memory\n", __func__);
392 goto out_free_netid;
393 }
394 buf[rlen] = '\0';
395 memcpy(buf, p, rlen);
396
397 /* replace port '.' with '-' */
398 portstr = strrchr(buf, '.');
399 if (!portstr) {
400 dprintk("%s: Failed finding expected dot in port\n",
401 __func__);
402 goto out_free_buf;
403 }
404 *portstr = '-';
405
406 /* find '.' between address and port */
407 portstr = strrchr(buf, '.');
408 if (!portstr) {
409 dprintk("%s: Failed finding expected dot between address and "
410 "port\n", __func__);
411 goto out_free_buf;
412 }
413 *portstr = '\0';
414
415 da = kzalloc(sizeof(*da), gfp_flags);
416 if (unlikely(!da))
417 goto out_free_buf;
418
419 INIT_LIST_HEAD(&da->da_node);
420
421 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
422 sizeof(da->da_addr))) {
423 dprintk("%s: error parsing address %s\n", __func__, buf);
424 goto out_free_da;
425 }
426
427 portstr++;
428 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
429 port = htons((tmp[0] << 8) | (tmp[1]));
430
431 switch (da->da_addr.ss_family) {
432 case AF_INET:
433 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
434 da->da_addrlen = sizeof(struct sockaddr_in);
435 match_netid = "tcp";
436 match_netid_len = 3;
437 break;
438
439 case AF_INET6:
440 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
441 da->da_addrlen = sizeof(struct sockaddr_in6);
442 match_netid = "tcp6";
443 match_netid_len = 4;
444 startsep = "[";
445 endsep = "]";
446 break;
447
448 default:
449 dprintk("%s: unsupported address family: %u\n",
450 __func__, da->da_addr.ss_family);
451 goto out_free_da;
452 }
453
454 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
455 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
456 __func__, netid, match_netid);
457 goto out_free_da;
458 }
459
460 /* save human readable address */
461 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
462 da->da_remotestr = kzalloc(len, gfp_flags);
463
464 /* NULL is ok, only used for dprintk */
465 if (da->da_remotestr)
466 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
467 buf, endsep, ntohs(port));
468
469 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
470 kfree(buf);
471 kfree(netid);
472 return da;
473
474out_free_da:
475 kfree(da);
476out_free_buf:
477 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
478 kfree(buf);
479out_free_netid:
480 kfree(netid);
481out_err:
482 return NULL;
483}
484
485/* Decode opaque device data and return the result */ 61/* Decode opaque device data and return the result */
486struct nfs4_file_layout_dsaddr * 62struct nfs4_file_layout_dsaddr *
487nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, 63nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
@@ -584,8 +160,8 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
584 160
585 mp_count = be32_to_cpup(p); /* multipath count */ 161 mp_count = be32_to_cpup(p); /* multipath count */
586 for (j = 0; j < mp_count; j++) { 162 for (j = 0; j < mp_count; j++) {
587 da = decode_ds_addr(server->nfs_client->cl_net, 163 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
588 &stream, gfp_flags); 164 &stream, gfp_flags);
589 if (da) 165 if (da)
590 list_add_tail(&da->da_node, &dsaddrs); 166 list_add_tail(&da->da_node, &dsaddrs);
591 } 167 }
@@ -681,22 +257,7 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
681 return flseg->fh_array[i]; 257 return flseg->fh_array[i];
682} 258}
683 259
684static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) 260/* Upon return, either ds is connected, or ds is NULL */
685{
686 might_sleep();
687 wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
688 nfs_wait_bit_killable, TASK_KILLABLE);
689}
690
691static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
692{
693 smp_mb__before_atomic();
694 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
695 smp_mb__after_atomic();
696 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
697}
698
699
700struct nfs4_pnfs_ds * 261struct nfs4_pnfs_ds *
701nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) 262nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
702{ 263{
@@ -704,29 +265,23 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
704 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; 265 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
705 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); 266 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
706 struct nfs4_pnfs_ds *ret = ds; 267 struct nfs4_pnfs_ds *ret = ds;
268 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
707 269
708 if (ds == NULL) { 270 if (ds == NULL) {
709 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", 271 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
710 __func__, ds_idx); 272 __func__, ds_idx);
711 filelayout_mark_devid_invalid(devid); 273 pnfs_generic_mark_devid_invalid(devid);
712 goto out; 274 goto out;
713 } 275 }
714 smp_rmb(); 276 smp_rmb();
715 if (ds->ds_clp) 277 if (ds->ds_clp)
716 goto out_test_devid; 278 goto out_test_devid;
717 279
718 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { 280 nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
719 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); 281 dataserver_retrans, 4,
720 int err; 282 s->nfs_client->cl_minorversion,
721 283 s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
722 err = nfs4_ds_connect(s, ds); 284
723 if (err)
724 nfs4_mark_deviceid_unavailable(devid);
725 nfs4_clear_ds_conn_bit(ds);
726 } else {
727 /* Either ds is connected, or ds is NULL */
728 nfs4_wait_ds_connect(ds);
729 }
730out_test_devid: 285out_test_devid:
731 if (filelayout_test_devid_unavailable(devid)) 286 if (filelayout_test_devid_unavailable(devid))
732 ret = NULL; 287 ret = NULL;
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000000..1d2c9f6bbcd4
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Flexfile Layout Driver kernel module
3#
4obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
5nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000000..f29fb7d7e8f8
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,1574 @@
1/*
2 * Module for pnfs flexfile layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/nfs_page.h>
11#include <linux/module.h>
12
13#include <linux/sunrpc/metrics.h>
14#include <linux/nfs_idmap.h>
15
16#include "flexfilelayout.h"
17#include "../nfs4session.h"
18#include "../internal.h"
19#include "../delegation.h"
20#include "../nfs4trace.h"
21#include "../iostat.h"
22#include "../nfs.h"
23
24#define NFSDBG_FACILITY NFSDBG_PNFS_LD
25
26#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
27
28static struct pnfs_layout_hdr *
29ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
30{
31 struct nfs4_flexfile_layout *ffl;
32
33 ffl = kzalloc(sizeof(*ffl), gfp_flags);
34 if (ffl) {
35 INIT_LIST_HEAD(&ffl->error_list);
36 return &ffl->generic_hdr;
37 } else
38 return NULL;
39}
40
41static void
42ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
43{
44 struct nfs4_ff_layout_ds_err *err, *n;
45
46 list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
47 list) {
48 list_del(&err->list);
49 kfree(err);
50 }
51 kfree(FF_LAYOUT_FROM_HDR(lo));
52}
53
54static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
55{
56 __be32 *p;
57
58 p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
59 if (unlikely(p == NULL))
60 return -ENOBUFS;
61 memcpy(stateid, p, NFS4_STATEID_SIZE);
62 dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
63 p[0], p[1], p[2], p[3]);
64 return 0;
65}
66
67static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
68{
69 __be32 *p;
70
71 p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
72 if (unlikely(!p))
73 return -ENOBUFS;
74 memcpy(devid, p, NFS4_DEVICEID4_SIZE);
75 nfs4_print_deviceid(devid);
76 return 0;
77}
78
79static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
80{
81 __be32 *p;
82
83 p = xdr_inline_decode(xdr, 4);
84 if (unlikely(!p))
85 return -ENOBUFS;
86 fh->size = be32_to_cpup(p++);
87 if (fh->size > sizeof(struct nfs_fh)) {
88 printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
89 fh->size);
90 return -EOVERFLOW;
91 }
92 /* fh.data */
93 p = xdr_inline_decode(xdr, fh->size);
94 if (unlikely(!p))
95 return -ENOBUFS;
96 memcpy(&fh->data, p, fh->size);
97 dprintk("%s: fh len %d\n", __func__, fh->size);
98
99 return 0;
100}
101
102/*
103 * Currently only stringified uids and gids are accepted.
104 * I.e., kerberos is not supported to the DSes, so no pricipals.
105 *
106 * That means that one common function will suffice, but when
107 * principals are added, this should be split to accomodate
108 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
109 */
110static int
111decode_name(struct xdr_stream *xdr, u32 *id)
112{
113 __be32 *p;
114 int len;
115
116 /* opaque_length(4)*/
117 p = xdr_inline_decode(xdr, 4);
118 if (unlikely(!p))
119 return -ENOBUFS;
120 len = be32_to_cpup(p++);
121 if (len < 0)
122 return -EINVAL;
123
124 dprintk("%s: len %u\n", __func__, len);
125
126 /* opaque body */
127 p = xdr_inline_decode(xdr, len);
128 if (unlikely(!p))
129 return -ENOBUFS;
130
131 if (!nfs_map_string_to_numeric((char *)p, len, id))
132 return -EINVAL;
133
134 return 0;
135}
136
137static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
138{
139 int i;
140
141 if (fls->mirror_array) {
142 for (i = 0; i < fls->mirror_array_cnt; i++) {
143 /* normally mirror_ds is freed in
144 * .free_deviceid_node but we still do it here
145 * for .alloc_lseg error path */
146 if (fls->mirror_array[i]) {
147 kfree(fls->mirror_array[i]->fh_versions);
148 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
149 kfree(fls->mirror_array[i]);
150 }
151 }
152 kfree(fls->mirror_array);
153 fls->mirror_array = NULL;
154 }
155}
156
157static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
158{
159 int ret = 0;
160
161 dprintk("--> %s\n", __func__);
162
163 /* FIXME: remove this check when layout segment support is added */
164 if (lgr->range.offset != 0 ||
165 lgr->range.length != NFS4_MAX_UINT64) {
166 dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
167 __func__);
168 ret = -EINVAL;
169 }
170
171 dprintk("--> %s returns %d\n", __func__, ret);
172 return ret;
173}
174
/* Free a flexfile layout segment and its mirror array; NULL is a no-op. */
static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
{
	if (!fls)
		return;

	ff_layout_free_mirror_array(fls);
	kfree(fls);
}
182
183static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
184{
185 struct nfs4_ff_layout_mirror *tmp;
186 int i, j;
187
188 for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
189 for (j = i + 1; j < fls->mirror_array_cnt; j++)
190 if (fls->mirror_array[i]->efficiency <
191 fls->mirror_array[j]->efficiency) {
192 tmp = fls->mirror_array[i];
193 fls->mirror_array[i] = fls->mirror_array[j];
194 fls->mirror_array[j] = tmp;
195 }
196 }
197}
198
199static struct pnfs_layout_segment *
200ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
201 struct nfs4_layoutget_res *lgr,
202 gfp_t gfp_flags)
203{
204 struct pnfs_layout_segment *ret;
205 struct nfs4_ff_layout_segment *fls = NULL;
206 struct xdr_stream stream;
207 struct xdr_buf buf;
208 struct page *scratch;
209 u64 stripe_unit;
210 u32 mirror_array_cnt;
211 __be32 *p;
212 int i, rc;
213
214 dprintk("--> %s\n", __func__);
215 scratch = alloc_page(gfp_flags);
216 if (!scratch)
217 return ERR_PTR(-ENOMEM);
218
219 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
220 lgr->layoutp->len);
221 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
222
223 /* stripe unit and mirror_array_cnt */
224 rc = -EIO;
225 p = xdr_inline_decode(&stream, 8 + 4);
226 if (!p)
227 goto out_err_free;
228
229 p = xdr_decode_hyper(p, &stripe_unit);
230 mirror_array_cnt = be32_to_cpup(p++);
231 dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
232 stripe_unit, mirror_array_cnt);
233
234 if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
235 mirror_array_cnt == 0)
236 goto out_err_free;
237
238 rc = -ENOMEM;
239 fls = kzalloc(sizeof(*fls), gfp_flags);
240 if (!fls)
241 goto out_err_free;
242
243 fls->mirror_array_cnt = mirror_array_cnt;
244 fls->stripe_unit = stripe_unit;
245 fls->mirror_array = kcalloc(fls->mirror_array_cnt,
246 sizeof(fls->mirror_array[0]), gfp_flags);
247 if (fls->mirror_array == NULL)
248 goto out_err_free;
249
250 for (i = 0; i < fls->mirror_array_cnt; i++) {
251 struct nfs4_deviceid devid;
252 struct nfs4_deviceid_node *idnode;
253 u32 ds_count;
254 u32 fh_count;
255 int j;
256
257 rc = -EIO;
258 p = xdr_inline_decode(&stream, 4);
259 if (!p)
260 goto out_err_free;
261 ds_count = be32_to_cpup(p);
262
263 /* FIXME: allow for striping? */
264 if (ds_count != 1)
265 goto out_err_free;
266
267 fls->mirror_array[i] =
268 kzalloc(sizeof(struct nfs4_ff_layout_mirror),
269 gfp_flags);
270 if (fls->mirror_array[i] == NULL) {
271 rc = -ENOMEM;
272 goto out_err_free;
273 }
274
275 spin_lock_init(&fls->mirror_array[i]->lock);
276 fls->mirror_array[i]->ds_count = ds_count;
277
278 /* deviceid */
279 rc = decode_deviceid(&stream, &devid);
280 if (rc)
281 goto out_err_free;
282
283 idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
284 &devid, lh->plh_lc_cred,
285 gfp_flags);
286 /*
287 * upon success, mirror_ds is allocated by previous
288 * getdeviceinfo, or newly by .alloc_deviceid_node
289 * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure
290 */
291 if (idnode)
292 fls->mirror_array[i]->mirror_ds =
293 FF_LAYOUT_MIRROR_DS(idnode);
294 else
295 goto out_err_free;
296
297 /* efficiency */
298 rc = -EIO;
299 p = xdr_inline_decode(&stream, 4);
300 if (!p)
301 goto out_err_free;
302 fls->mirror_array[i]->efficiency = be32_to_cpup(p);
303
304 /* stateid */
305 rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
306 if (rc)
307 goto out_err_free;
308
309 /* fh */
310 p = xdr_inline_decode(&stream, 4);
311 if (!p)
312 goto out_err_free;
313 fh_count = be32_to_cpup(p);
314
315 fls->mirror_array[i]->fh_versions =
316 kzalloc(fh_count * sizeof(struct nfs_fh),
317 gfp_flags);
318 if (fls->mirror_array[i]->fh_versions == NULL) {
319 rc = -ENOMEM;
320 goto out_err_free;
321 }
322
323 for (j = 0; j < fh_count; j++) {
324 rc = decode_nfs_fh(&stream,
325 &fls->mirror_array[i]->fh_versions[j]);
326 if (rc)
327 goto out_err_free;
328 }
329
330 fls->mirror_array[i]->fh_versions_cnt = fh_count;
331
332 /* user */
333 rc = decode_name(&stream, &fls->mirror_array[i]->uid);
334 if (rc)
335 goto out_err_free;
336
337 /* group */
338 rc = decode_name(&stream, &fls->mirror_array[i]->gid);
339 if (rc)
340 goto out_err_free;
341
342 dprintk("%s: uid %d gid %d\n", __func__,
343 fls->mirror_array[i]->uid,
344 fls->mirror_array[i]->gid);
345 }
346
347 ff_layout_sort_mirrors(fls);
348 rc = ff_layout_check_layout(lgr);
349 if (rc)
350 goto out_err_free;
351
352 ret = &fls->generic_hdr;
353 dprintk("<-- %s (success)\n", __func__);
354out_free_page:
355 __free_page(scratch);
356 return ret;
357out_err_free:
358 _ff_layout_free_lseg(fls);
359 ret = ERR_PTR(rc);
360 dprintk("<-- %s (%d)\n", __func__, rc);
361 goto out_free_page;
362}
363
364static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
365{
366 struct pnfs_layout_segment *lseg;
367
368 list_for_each_entry(lseg, &layout->plh_segs, pls_list)
369 if (lseg->pls_range.iomode == IOMODE_RW)
370 return true;
371
372 return false;
373}
374
375static void
376ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
377{
378 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
379 int i;
380
381 dprintk("--> %s\n", __func__);
382
383 for (i = 0; i < fls->mirror_array_cnt; i++) {
384 if (fls->mirror_array[i]) {
385 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
386 fls->mirror_array[i]->mirror_ds = NULL;
387 if (fls->mirror_array[i]->cred) {
388 put_rpccred(fls->mirror_array[i]->cred);
389 fls->mirror_array[i]->cred = NULL;
390 }
391 }
392 }
393
394 if (lseg->pls_range.iomode == IOMODE_RW) {
395 struct nfs4_flexfile_layout *ffl;
396 struct inode *inode;
397
398 ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
399 inode = ffl->generic_hdr.plh_inode;
400 spin_lock(&inode->i_lock);
401 if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
402 ffl->commit_info.nbuckets = 0;
403 kfree(ffl->commit_info.buckets);
404 ffl->commit_info.buckets = NULL;
405 }
406 spin_unlock(&inode->i_lock);
407 }
408 _ff_layout_free_lseg(fls);
409}
410
/*
 * Number of layout segments to size commit buckets for.  Always 1 until
 * multiple lsegs are supported; @fls is currently unused.
 */
static int ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
{
	return 1;
}
417
418static int
419ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
420 struct nfs_commit_info *cinfo,
421 gfp_t gfp_flags)
422{
423 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
424 struct pnfs_commit_bucket *buckets;
425 int size;
426
427 if (cinfo->ds->nbuckets != 0) {
428 /* This assumes there is only one RW lseg per file.
429 * To support multiple lseg per file, we need to
430 * change struct pnfs_commit_bucket to allow dynamic
431 * increasing nbuckets.
432 */
433 return 0;
434 }
435
436 size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
437
438 buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
439 gfp_flags);
440 if (!buckets)
441 return -ENOMEM;
442 else {
443 int i;
444
445 spin_lock(cinfo->lock);
446 if (cinfo->ds->nbuckets != 0)
447 kfree(buckets);
448 else {
449 cinfo->ds->buckets = buckets;
450 cinfo->ds->nbuckets = size;
451 for (i = 0; i < size; i++) {
452 INIT_LIST_HEAD(&buckets[i].written);
453 INIT_LIST_HEAD(&buckets[i].committing);
454 /* mark direct verifier as unset */
455 buckets[i].direct_verf.committed =
456 NFS_INVALID_STABLE_HOW;
457 }
458 }
459 spin_unlock(cinfo->lock);
460 return 0;
461 }
462}
463
464static struct nfs4_pnfs_ds *
465ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
466 int *best_idx)
467{
468 struct nfs4_ff_layout_segment *fls;
469 struct nfs4_pnfs_ds *ds;
470 int idx;
471
472 fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
473 /* mirrors are sorted by efficiency */
474 for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
475 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
476 if (ds) {
477 *best_idx = idx;
478 return ds;
479 }
480 }
481
482 return NULL;
483}
484
485static void
486ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
487 struct nfs_page *req)
488{
489 struct nfs_pgio_mirror *pgm;
490 struct nfs4_ff_layout_mirror *mirror;
491 struct nfs4_pnfs_ds *ds;
492 int ds_idx;
493
494 /* Use full layout for now */
495 if (!pgio->pg_lseg)
496 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
497 req->wb_context,
498 0,
499 NFS4_MAX_UINT64,
500 IOMODE_READ,
501 GFP_KERNEL);
502 /* If no lseg, fall back to read through mds */
503 if (pgio->pg_lseg == NULL)
504 goto out_mds;
505
506 ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
507 if (!ds)
508 goto out_mds;
509 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
510
511 pgio->pg_mirror_idx = ds_idx;
512
513 /* read always uses only one mirror - idx 0 for pgio layer */
514 pgm = &pgio->pg_mirrors[0];
515 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
516
517 return;
518out_mds:
519 pnfs_put_lseg(pgio->pg_lseg);
520 pgio->pg_lseg = NULL;
521 nfs_pageio_reset_read_mds(pgio);
522}
523
524static void
525ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
526 struct nfs_page *req)
527{
528 struct nfs4_ff_layout_mirror *mirror;
529 struct nfs_pgio_mirror *pgm;
530 struct nfs_commit_info cinfo;
531 struct nfs4_pnfs_ds *ds;
532 int i;
533 int status;
534
535 if (!pgio->pg_lseg)
536 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
537 req->wb_context,
538 0,
539 NFS4_MAX_UINT64,
540 IOMODE_RW,
541 GFP_NOFS);
542 /* If no lseg, fall back to write through mds */
543 if (pgio->pg_lseg == NULL)
544 goto out_mds;
545
546 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
547 status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
548 if (status < 0)
549 goto out_mds;
550
551 /* Use a direct mapping of ds_idx to pgio mirror_idx */
552 if (WARN_ON_ONCE(pgio->pg_mirror_count !=
553 FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
554 goto out_mds;
555
556 for (i = 0; i < pgio->pg_mirror_count; i++) {
557 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
558 if (!ds)
559 goto out_mds;
560 pgm = &pgio->pg_mirrors[i];
561 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
562 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
563 }
564
565 return;
566
567out_mds:
568 pnfs_put_lseg(pgio->pg_lseg);
569 pgio->pg_lseg = NULL;
570 nfs_pageio_reset_write_mds(pgio);
571}
572
573static unsigned int
574ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
575 struct nfs_page *req)
576{
577 if (!pgio->pg_lseg)
578 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
579 req->wb_context,
580 0,
581 NFS4_MAX_UINT64,
582 IOMODE_RW,
583 GFP_NOFS);
584 if (pgio->pg_lseg)
585 return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
586
587 /* no lseg means that pnfs is not in use, so no mirroring here */
588 pnfs_put_lseg(pgio->pg_lseg);
589 pgio->pg_lseg = NULL;
590 nfs_pageio_reset_write_mds(pgio);
591 return 1;
592}
593
594static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
595 .pg_init = ff_layout_pg_init_read,
596 .pg_test = pnfs_generic_pg_test,
597 .pg_doio = pnfs_generic_pg_readpages,
598 .pg_cleanup = pnfs_generic_pg_cleanup,
599};
600
601static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
602 .pg_init = ff_layout_pg_init_write,
603 .pg_test = pnfs_generic_pg_test,
604 .pg_doio = pnfs_generic_pg_writepages,
605 .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
606 .pg_cleanup = pnfs_generic_pg_cleanup,
607};
608
609static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
610{
611 struct rpc_task *task = &hdr->task;
612
613 pnfs_layoutcommit_inode(hdr->inode, false);
614
615 if (retry_pnfs) {
616 dprintk("%s Reset task %5u for i/o through pNFS "
617 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
618 hdr->task.tk_pid,
619 hdr->inode->i_sb->s_id,
620 (unsigned long long)NFS_FILEID(hdr->inode),
621 hdr->args.count,
622 (unsigned long long)hdr->args.offset);
623
624 if (!hdr->dreq) {
625 struct nfs_open_context *ctx;
626
627 ctx = nfs_list_entry(hdr->pages.next)->wb_context;
628 set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
629 hdr->completion_ops->error_cleanup(&hdr->pages);
630 } else {
631 nfs_direct_set_resched_writes(hdr->dreq);
632 /* fake unstable write to let common nfs resend pages */
633 hdr->verf.committed = NFS_UNSTABLE;
634 hdr->good_bytes = 0;
635 }
636 return;
637 }
638
639 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
640 dprintk("%s Reset task %5u for i/o through MDS "
641 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
642 hdr->task.tk_pid,
643 hdr->inode->i_sb->s_id,
644 (unsigned long long)NFS_FILEID(hdr->inode),
645 hdr->args.count,
646 (unsigned long long)hdr->args.offset);
647
648 task->tk_status = pnfs_write_done_resend_to_mds(hdr);
649 }
650}
651
652static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
653{
654 struct rpc_task *task = &hdr->task;
655
656 pnfs_layoutcommit_inode(hdr->inode, false);
657
658 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
659 dprintk("%s Reset task %5u for i/o through MDS "
660 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
661 hdr->task.tk_pid,
662 hdr->inode->i_sb->s_id,
663 (unsigned long long)NFS_FILEID(hdr->inode),
664 hdr->args.count,
665 (unsigned long long)hdr->args.offset);
666
667 task->tk_status = pnfs_read_done_resend_to_mds(hdr);
668 }
669}
670
671static int ff_layout_async_handle_error_v4(struct rpc_task *task,
672 struct nfs4_state *state,
673 struct nfs_client *clp,
674 struct pnfs_layout_segment *lseg,
675 int idx)
676{
677 struct pnfs_layout_hdr *lo = lseg->pls_layout;
678 struct inode *inode = lo->plh_inode;
679 struct nfs_server *mds_server = NFS_SERVER(inode);
680
681 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
682 struct nfs_client *mds_client = mds_server->nfs_client;
683 struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
684
685 if (task->tk_status >= 0)
686 return 0;
687
688 switch (task->tk_status) {
689 /* MDS state errors */
690 case -NFS4ERR_DELEG_REVOKED:
691 case -NFS4ERR_ADMIN_REVOKED:
692 case -NFS4ERR_BAD_STATEID:
693 if (state == NULL)
694 break;
695 nfs_remove_bad_delegation(state->inode);
696 case -NFS4ERR_OPENMODE:
697 if (state == NULL)
698 break;
699 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
700 goto out_bad_stateid;
701 goto wait_on_recovery;
702 case -NFS4ERR_EXPIRED:
703 if (state != NULL) {
704 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
705 goto out_bad_stateid;
706 }
707 nfs4_schedule_lease_recovery(mds_client);
708 goto wait_on_recovery;
709 /* DS session errors */
710 case -NFS4ERR_BADSESSION:
711 case -NFS4ERR_BADSLOT:
712 case -NFS4ERR_BAD_HIGH_SLOT:
713 case -NFS4ERR_DEADSESSION:
714 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
715 case -NFS4ERR_SEQ_FALSE_RETRY:
716 case -NFS4ERR_SEQ_MISORDERED:
717 dprintk("%s ERROR %d, Reset session. Exchangeid "
718 "flags 0x%x\n", __func__, task->tk_status,
719 clp->cl_exchange_flags);
720 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
721 break;
722 case -NFS4ERR_DELAY:
723 case -NFS4ERR_GRACE:
724 rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
725 break;
726 case -NFS4ERR_RETRY_UNCACHED_REP:
727 break;
728 /* Invalidate Layout errors */
729 case -NFS4ERR_PNFS_NO_LAYOUT:
730 case -ESTALE: /* mapped NFS4ERR_STALE */
731 case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
732 case -EISDIR: /* mapped NFS4ERR_ISDIR */
733 case -NFS4ERR_FHEXPIRED:
734 case -NFS4ERR_WRONG_TYPE:
735 dprintk("%s Invalid layout error %d\n", __func__,
736 task->tk_status);
737 /*
738 * Destroy layout so new i/o will get a new layout.
739 * Layout will not be destroyed until all current lseg
740 * references are put. Mark layout as invalid to resend failed
741 * i/o and all i/o waiting on the slot table to the MDS until
742 * layout is destroyed and a new valid layout is obtained.
743 */
744 pnfs_destroy_layout(NFS_I(inode));
745 rpc_wake_up(&tbl->slot_tbl_waitq);
746 goto reset;
747 /* RPC connection errors */
748 case -ECONNREFUSED:
749 case -EHOSTDOWN:
750 case -EHOSTUNREACH:
751 case -ENETUNREACH:
752 case -EIO:
753 case -ETIMEDOUT:
754 case -EPIPE:
755 dprintk("%s DS connection error %d\n", __func__,
756 task->tk_status);
757 nfs4_mark_deviceid_unavailable(devid);
758 rpc_wake_up(&tbl->slot_tbl_waitq);
759 /* fall through */
760 default:
761 if (ff_layout_has_available_ds(lseg))
762 return -NFS4ERR_RESET_TO_PNFS;
763reset:
764 dprintk("%s Retry through MDS. Error %d\n", __func__,
765 task->tk_status);
766 return -NFS4ERR_RESET_TO_MDS;
767 }
768out:
769 task->tk_status = 0;
770 return -EAGAIN;
771out_bad_stateid:
772 task->tk_status = -EIO;
773 return 0;
774wait_on_recovery:
775 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
776 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
777 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
778 goto out;
779}
780
781/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
782static int ff_layout_async_handle_error_v3(struct rpc_task *task,
783 struct pnfs_layout_segment *lseg,
784 int idx)
785{
786 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
787
788 if (task->tk_status >= 0)
789 return 0;
790
791 if (task->tk_status != -EJUKEBOX) {
792 dprintk("%s DS connection error %d\n", __func__,
793 task->tk_status);
794 nfs4_mark_deviceid_unavailable(devid);
795 if (ff_layout_has_available_ds(lseg))
796 return -NFS4ERR_RESET_TO_PNFS;
797 else
798 return -NFS4ERR_RESET_TO_MDS;
799 }
800
801 if (task->tk_status == -EJUKEBOX)
802 nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
803 task->tk_status = 0;
804 rpc_restart_call(task);
805 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
806 return -EAGAIN;
807}
808
809static int ff_layout_async_handle_error(struct rpc_task *task,
810 struct nfs4_state *state,
811 struct nfs_client *clp,
812 struct pnfs_layout_segment *lseg,
813 int idx)
814{
815 int vers = clp->cl_nfs_mod->rpc_vers->number;
816
817 switch (vers) {
818 case 3:
819 return ff_layout_async_handle_error_v3(task, lseg, idx);
820 case 4:
821 return ff_layout_async_handle_error_v4(task, state, clp,
822 lseg, idx);
823 default:
824 /* should never happen */
825 WARN_ON_ONCE(1);
826 return 0;
827 }
828}
829
830static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
831 int idx, u64 offset, u64 length,
832 u32 status, int opnum)
833{
834 struct nfs4_ff_layout_mirror *mirror;
835 int err;
836
837 mirror = FF_LAYOUT_COMP(lseg, idx);
838 err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
839 mirror, offset, length, status, opnum,
840 GFP_NOIO);
841 dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
842}
843
844/* NFS_PROTO call done callback routines */
845
846static int ff_layout_read_done_cb(struct rpc_task *task,
847 struct nfs_pgio_header *hdr)
848{
849 struct inode *inode;
850 int err;
851
852 trace_nfs4_pnfs_read(hdr, task->tk_status);
853 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
854 hdr->res.op_status = NFS4ERR_NXIO;
855 if (task->tk_status < 0 && hdr->res.op_status)
856 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
857 hdr->args.offset, hdr->args.count,
858 hdr->res.op_status, OP_READ);
859 err = ff_layout_async_handle_error(task, hdr->args.context->state,
860 hdr->ds_clp, hdr->lseg,
861 hdr->pgio_mirror_idx);
862
863 switch (err) {
864 case -NFS4ERR_RESET_TO_PNFS:
865 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
866 &hdr->lseg->pls_layout->plh_flags);
867 pnfs_read_resend_pnfs(hdr);
868 return task->tk_status;
869 case -NFS4ERR_RESET_TO_MDS:
870 inode = hdr->lseg->pls_layout->plh_inode;
871 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
872 ff_layout_reset_read(hdr);
873 return task->tk_status;
874 case -EAGAIN:
875 rpc_restart_call_prepare(task);
876 return -EAGAIN;
877 }
878
879 return 0;
880}
881
882/*
883 * We reference the rpc_cred of the first WRITE that triggers the need for
884 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
885 * rfc5661 is not clear about which credential should be used.
886 *
887 * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
888 * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
889 * we always send layoutcommit after DS writes.
890 */
891static void
892ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
893{
894 pnfs_set_layoutcommit(hdr);
895 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
896 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
897}
898
899static bool
900ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
901{
902 /* No mirroring for now */
903 struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
904
905 return ff_layout_test_devid_unavailable(node);
906}
907
908static int ff_layout_read_prepare_common(struct rpc_task *task,
909 struct nfs_pgio_header *hdr)
910{
911 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
912 rpc_exit(task, -EIO);
913 return -EIO;
914 }
915 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
916 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
917 if (ff_layout_has_available_ds(hdr->lseg))
918 pnfs_read_resend_pnfs(hdr);
919 else
920 ff_layout_reset_read(hdr);
921 rpc_exit(task, 0);
922 return -EAGAIN;
923 }
924 hdr->pgio_done_cb = ff_layout_read_done_cb;
925
926 return 0;
927}
928
/*
 * Call ops for the async read/write cases
 * In the case of dense layouts, the offset needs to be reset to its
 * original value.
 *
 * NFSv3 read prepare: run the common checks and start the call if they
 * pass.
 */
static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
{
	struct nfs_pgio_header *hdr = data;

	if (ff_layout_read_prepare_common(task, hdr) == 0)
		rpc_call_start(task);
}
943
944static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
945 struct nfs4_sequence_args *args,
946 struct nfs4_sequence_res *res,
947 struct rpc_task *task)
948{
949 if (ds_clp->cl_session)
950 return nfs41_setup_sequence(ds_clp->cl_session,
951 args,
952 res,
953 task);
954 return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
955 args,
956 res,
957 task);
958}
959
960static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
961{
962 struct nfs_pgio_header *hdr = data;
963
964 if (ff_layout_read_prepare_common(task, hdr))
965 return;
966
967 if (ff_layout_setup_sequence(hdr->ds_clp,
968 &hdr->args.seq_args,
969 &hdr->res.seq_res,
970 task))
971 return;
972
973 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
974 hdr->args.lock_context, FMODE_READ) == -EIO)
975 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
976}
977
978static void ff_layout_read_call_done(struct rpc_task *task, void *data)
979{
980 struct nfs_pgio_header *hdr = data;
981
982 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
983
984 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
985 task->tk_status == 0) {
986 nfs4_sequence_done(task, &hdr->res.seq_res);
987 return;
988 }
989
990 /* Note this may cause RPC to be resent */
991 hdr->mds_ops->rpc_call_done(task, hdr);
992}
993
994static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
995{
996 struct nfs_pgio_header *hdr = data;
997
998 rpc_count_iostats_metrics(task,
999 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
1000}
1001
1002static int ff_layout_write_done_cb(struct rpc_task *task,
1003 struct nfs_pgio_header *hdr)
1004{
1005 struct inode *inode;
1006 int err;
1007
1008 trace_nfs4_pnfs_write(hdr, task->tk_status);
1009 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
1010 hdr->res.op_status = NFS4ERR_NXIO;
1011 if (task->tk_status < 0 && hdr->res.op_status)
1012 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1013 hdr->args.offset, hdr->args.count,
1014 hdr->res.op_status, OP_WRITE);
1015 err = ff_layout_async_handle_error(task, hdr->args.context->state,
1016 hdr->ds_clp, hdr->lseg,
1017 hdr->pgio_mirror_idx);
1018
1019 switch (err) {
1020 case -NFS4ERR_RESET_TO_PNFS:
1021 case -NFS4ERR_RESET_TO_MDS:
1022 inode = hdr->lseg->pls_layout->plh_inode;
1023 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
1024 if (err == -NFS4ERR_RESET_TO_PNFS) {
1025 pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
1026 ff_layout_reset_write(hdr, true);
1027 } else {
1028 pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
1029 ff_layout_reset_write(hdr, false);
1030 }
1031 return task->tk_status;
1032 case -EAGAIN:
1033 rpc_restart_call_prepare(task);
1034 return -EAGAIN;
1035 }
1036
1037 if (hdr->res.verf->committed == NFS_FILE_SYNC ||
1038 hdr->res.verf->committed == NFS_DATA_SYNC)
1039 ff_layout_set_layoutcommit(hdr);
1040
1041 return 0;
1042}
1043
1044static int ff_layout_commit_done_cb(struct rpc_task *task,
1045 struct nfs_commit_data *data)
1046{
1047 struct inode *inode;
1048 int err;
1049
1050 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
1051 if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
1052 data->res.op_status = NFS4ERR_NXIO;
1053 if (task->tk_status < 0 && data->res.op_status)
1054 ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
1055 data->args.offset, data->args.count,
1056 data->res.op_status, OP_COMMIT);
1057 err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
1058 data->lseg, data->ds_commit_index);
1059
1060 switch (err) {
1061 case -NFS4ERR_RESET_TO_PNFS:
1062 case -NFS4ERR_RESET_TO_MDS:
1063 inode = data->lseg->pls_layout->plh_inode;
1064 pnfs_error_mark_layout_for_return(inode, data->lseg);
1065 if (err == -NFS4ERR_RESET_TO_PNFS)
1066 pnfs_set_retry_layoutget(data->lseg->pls_layout);
1067 else
1068 pnfs_clear_retry_layoutget(data->lseg->pls_layout);
1069 pnfs_generic_prepare_to_resend_writes(data);
1070 return -EAGAIN;
1071 case -EAGAIN:
1072 rpc_restart_call_prepare(task);
1073 return -EAGAIN;
1074 }
1075
1076 if (data->verf.committed == NFS_UNSTABLE)
1077 pnfs_commit_set_layoutcommit(data);
1078
1079 return 0;
1080}
1081
1082static int ff_layout_write_prepare_common(struct rpc_task *task,
1083 struct nfs_pgio_header *hdr)
1084{
1085 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1086 rpc_exit(task, -EIO);
1087 return -EIO;
1088 }
1089
1090 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
1091 bool retry_pnfs;
1092
1093 retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
1094 dprintk("%s task %u reset io to %s\n", __func__,
1095 task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
1096 ff_layout_reset_write(hdr, retry_pnfs);
1097 rpc_exit(task, 0);
1098 return -EAGAIN;
1099 }
1100
1101 return 0;
1102}
1103
/*
 * rpc_call_prepare callback used by ff_layout_write_call_ops_v3: start
 * the call only if the common write checks report 0.
 */
static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
{
	struct nfs_pgio_header *pgio = data;

	if (ff_layout_write_prepare_common(task, pgio) == 0)
		rpc_call_start(task);
}
1113
1114static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
1115{
1116 struct nfs_pgio_header *hdr = data;
1117
1118 if (ff_layout_write_prepare_common(task, hdr))
1119 return;
1120
1121 if (ff_layout_setup_sequence(hdr->ds_clp,
1122 &hdr->args.seq_args,
1123 &hdr->res.seq_res,
1124 task))
1125 return;
1126
1127 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
1128 hdr->args.lock_context, FMODE_WRITE) == -EIO)
1129 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
1130}
1131
1132static void ff_layout_write_call_done(struct rpc_task *task, void *data)
1133{
1134 struct nfs_pgio_header *hdr = data;
1135
1136 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
1137 task->tk_status == 0) {
1138 nfs4_sequence_done(task, &hdr->res.seq_res);
1139 return;
1140 }
1141
1142 /* Note this may cause RPC to be resent */
1143 hdr->mds_ops->rpc_call_done(task, hdr);
1144}
1145
1146static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
1147{
1148 struct nfs_pgio_header *hdr = data;
1149
1150 rpc_count_iostats_metrics(task,
1151 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
1152}
1153
/*
 * rpc_call_prepare callback used by ff_layout_commit_call_ops_v3.
 * Nothing needs to be set up; @data is unused and the call is started
 * right away.
 */
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
	rpc_call_start(task);
}
1158
1159static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
1160{
1161 struct nfs_commit_data *wdata = data;
1162
1163 ff_layout_setup_sequence(wdata->ds_clp,
1164 &wdata->args.seq_args,
1165 &wdata->res.seq_res,
1166 task);
1167}
1168
1169static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
1170{
1171 struct nfs_commit_data *cdata = data;
1172
1173 rpc_count_iostats_metrics(task,
1174 &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
1175}
1176
1177static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
1178 .rpc_call_prepare = ff_layout_read_prepare_v3,
1179 .rpc_call_done = ff_layout_read_call_done,
1180 .rpc_count_stats = ff_layout_read_count_stats,
1181 .rpc_release = pnfs_generic_rw_release,
1182};
1183
1184static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
1185 .rpc_call_prepare = ff_layout_read_prepare_v4,
1186 .rpc_call_done = ff_layout_read_call_done,
1187 .rpc_count_stats = ff_layout_read_count_stats,
1188 .rpc_release = pnfs_generic_rw_release,
1189};
1190
1191static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
1192 .rpc_call_prepare = ff_layout_write_prepare_v3,
1193 .rpc_call_done = ff_layout_write_call_done,
1194 .rpc_count_stats = ff_layout_write_count_stats,
1195 .rpc_release = pnfs_generic_rw_release,
1196};
1197
1198static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
1199 .rpc_call_prepare = ff_layout_write_prepare_v4,
1200 .rpc_call_done = ff_layout_write_call_done,
1201 .rpc_count_stats = ff_layout_write_count_stats,
1202 .rpc_release = pnfs_generic_rw_release,
1203};
1204
1205static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
1206 .rpc_call_prepare = ff_layout_commit_prepare_v3,
1207 .rpc_call_done = pnfs_generic_write_commit_done,
1208 .rpc_count_stats = ff_layout_commit_count_stats,
1209 .rpc_release = pnfs_generic_commit_release,
1210};
1211
1212static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
1213 .rpc_call_prepare = ff_layout_commit_prepare_v4,
1214 .rpc_call_done = pnfs_generic_write_commit_done,
1215 .rpc_count_stats = ff_layout_commit_count_stats,
1216 .rpc_release = pnfs_generic_commit_release,
1217};
1218
1219static enum pnfs_try_status
1220ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
1221{
1222 struct pnfs_layout_segment *lseg = hdr->lseg;
1223 struct nfs4_pnfs_ds *ds;
1224 struct rpc_clnt *ds_clnt;
1225 struct rpc_cred *ds_cred;
1226 loff_t offset = hdr->args.offset;
1227 u32 idx = hdr->pgio_mirror_idx;
1228 int vers;
1229 struct nfs_fh *fh;
1230
1231 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
1232 __func__, hdr->inode->i_ino,
1233 hdr->args.pgbase, (size_t)hdr->args.count, offset);
1234
1235 ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
1236 if (!ds)
1237 goto out_failed;
1238
1239 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1240 hdr->inode);
1241 if (IS_ERR(ds_clnt))
1242 goto out_failed;
1243
1244 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1245 if (IS_ERR(ds_cred))
1246 goto out_failed;
1247
1248 vers = nfs4_ff_layout_ds_version(lseg, idx);
1249
1250 dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
1251 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
1252
1253 atomic_inc(&ds->ds_clp->cl_count);
1254 hdr->ds_clp = ds->ds_clp;
1255 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1256 if (fh)
1257 hdr->args.fh = fh;
1258
1259 /*
1260 * Note that if we ever decide to split across DSes,
1261 * then we may need to handle dense-like offsets.
1262 */
1263 hdr->args.offset = offset;
1264 hdr->mds_offset = offset;
1265
1266 /* Perform an asynchronous read to ds */
1267 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1268 vers == 3 ? &ff_layout_read_call_ops_v3 :
1269 &ff_layout_read_call_ops_v4,
1270 0, RPC_TASK_SOFTCONN);
1271
1272 return PNFS_ATTEMPTED;
1273
1274out_failed:
1275 if (ff_layout_has_available_ds(lseg))
1276 return PNFS_TRY_AGAIN;
1277 return PNFS_NOT_ATTEMPTED;
1278}
1279
1280/* Perform async writes. */
1281static enum pnfs_try_status
1282ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
1283{
1284 struct pnfs_layout_segment *lseg = hdr->lseg;
1285 struct nfs4_pnfs_ds *ds;
1286 struct rpc_clnt *ds_clnt;
1287 struct rpc_cred *ds_cred;
1288 loff_t offset = hdr->args.offset;
1289 int vers;
1290 struct nfs_fh *fh;
1291 int idx = hdr->pgio_mirror_idx;
1292
1293 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1294 if (!ds)
1295 return PNFS_NOT_ATTEMPTED;
1296
1297 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1298 hdr->inode);
1299 if (IS_ERR(ds_clnt))
1300 return PNFS_NOT_ATTEMPTED;
1301
1302 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1303 if (IS_ERR(ds_cred))
1304 return PNFS_NOT_ATTEMPTED;
1305
1306 vers = nfs4_ff_layout_ds_version(lseg, idx);
1307
1308 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
1309 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
1310 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
1311 vers);
1312
1313 hdr->pgio_done_cb = ff_layout_write_done_cb;
1314 atomic_inc(&ds->ds_clp->cl_count);
1315 hdr->ds_clp = ds->ds_clp;
1316 hdr->ds_commit_idx = idx;
1317 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1318 if (fh)
1319 hdr->args.fh = fh;
1320
1321 /*
1322 * Note that if we ever decide to split across DSes,
1323 * then we may need to handle dense-like offsets.
1324 */
1325 hdr->args.offset = offset;
1326
1327 /* Perform an asynchronous write */
1328 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1329 vers == 3 ? &ff_layout_write_call_ops_v3 :
1330 &ff_layout_write_call_ops_v4,
1331 sync, RPC_TASK_SOFTCONN);
1332 return PNFS_ATTEMPTED;
1333}
1334
1335static void
1336ff_layout_mark_request_commit(struct nfs_page *req,
1337 struct pnfs_layout_segment *lseg,
1338 struct nfs_commit_info *cinfo,
1339 u32 ds_commit_idx)
1340{
1341 struct list_head *list;
1342 struct pnfs_commit_bucket *buckets;
1343
1344 spin_lock(cinfo->lock);
1345 buckets = cinfo->ds->buckets;
1346 list = &buckets[ds_commit_idx].written;
1347 if (list_empty(list)) {
1348 /* Non-empty buckets hold a reference on the lseg. That ref
1349 * is normally transferred to the COMMIT call and released
1350 * there. It could also be released if the last req is pulled
1351 * off due to a rewrite, in which case it will be done in
1352 * pnfs_common_clear_request_commit
1353 */
1354 WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
1355 buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
1356 }
1357 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1358 cinfo->ds->nwritten++;
1359
1360 /* nfs_request_add_commit_list(). We need to add req to list without
1361 * dropping cinfo lock.
1362 */
1363 set_bit(PG_CLEAN, &(req)->wb_flags);
1364 nfs_list_add_request(req, list);
1365 cinfo->mds->ncommit++;
1366 spin_unlock(cinfo->lock);
1367 if (!cinfo->dreq) {
1368 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1369 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1370 BDI_RECLAIMABLE);
1371 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1372 I_DIRTY_DATASYNC);
1373 }
1374}
1375
1376static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1377{
1378 return i;
1379}
1380
1381static struct nfs_fh *
1382select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1383{
1384 struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
1385
1386 /* FIXME: Assume that there is only one NFS version available
1387 * for the DS.
1388 */
1389 return &flseg->mirror_array[i]->fh_versions[0];
1390}
1391
1392static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
1393{
1394 struct pnfs_layout_segment *lseg = data->lseg;
1395 struct nfs4_pnfs_ds *ds;
1396 struct rpc_clnt *ds_clnt;
1397 struct rpc_cred *ds_cred;
1398 u32 idx;
1399 int vers;
1400 struct nfs_fh *fh;
1401
1402 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1403 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1404 if (!ds)
1405 goto out_err;
1406
1407 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1408 data->inode);
1409 if (IS_ERR(ds_clnt))
1410 goto out_err;
1411
1412 ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
1413 if (IS_ERR(ds_cred))
1414 goto out_err;
1415
1416 vers = nfs4_ff_layout_ds_version(lseg, idx);
1417
1418 dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
1419 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
1420 vers);
1421 data->commit_done_cb = ff_layout_commit_done_cb;
1422 data->cred = ds_cred;
1423 atomic_inc(&ds->ds_clp->cl_count);
1424 data->ds_clp = ds->ds_clp;
1425 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1426 if (fh)
1427 data->args.fh = fh;
1428 return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
1429 vers == 3 ? &ff_layout_commit_call_ops_v3 :
1430 &ff_layout_commit_call_ops_v4,
1431 how, RPC_TASK_SOFTCONN);
1432out_err:
1433 pnfs_generic_prepare_to_resend_writes(data);
1434 pnfs_generic_commit_release(data);
1435 return -EAGAIN;
1436}
1437
1438static int
1439ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1440 int how, struct nfs_commit_info *cinfo)
1441{
1442 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1443 ff_layout_initiate_commit);
1444}
1445
1446static struct pnfs_ds_commit_info *
1447ff_layout_get_ds_info(struct inode *inode)
1448{
1449 struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1450
1451 if (layout == NULL)
1452 return NULL;
1453
1454 return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
1455}
1456
1457static void
1458ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d)
1459{
1460 nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
1461 id_node));
1462}
1463
1464static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
1465 struct xdr_stream *xdr,
1466 const struct nfs4_layoutreturn_args *args)
1467{
1468 struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
1469 __be32 *start;
1470 int count = 0, ret = 0;
1471
1472 start = xdr_reserve_space(xdr, 4);
1473 if (unlikely(!start))
1474 return -E2BIG;
1475
1476 /* This assume we always return _ALL_ layouts */
1477 spin_lock(&hdr->plh_inode->i_lock);
1478 ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
1479 spin_unlock(&hdr->plh_inode->i_lock);
1480
1481 *start = cpu_to_be32(count);
1482
1483 return ret;
1484}
1485
1486/* report nothing for now */
1487static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
1488 struct xdr_stream *xdr,
1489 const struct nfs4_layoutreturn_args *args)
1490{
1491 __be32 *p;
1492
1493 p = xdr_reserve_space(xdr, 4);
1494 if (likely(p))
1495 *p = cpu_to_be32(0);
1496}
1497
1498static struct nfs4_deviceid_node *
1499ff_layout_alloc_deviceid_node(struct nfs_server *server,
1500 struct pnfs_device *pdev, gfp_t gfp_flags)
1501{
1502 struct nfs4_ff_layout_ds *dsaddr;
1503
1504 dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
1505 if (!dsaddr)
1506 return NULL;
1507 return &dsaddr->id_node;
1508}
1509
1510static void
1511ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
1512 struct xdr_stream *xdr,
1513 const struct nfs4_layoutreturn_args *args)
1514{
1515 struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
1516 __be32 *start;
1517
1518 dprintk("%s: Begin\n", __func__);
1519 start = xdr_reserve_space(xdr, 4);
1520 BUG_ON(!start);
1521
1522 if (ff_layout_encode_ioerr(flo, xdr, args))
1523 goto out;
1524
1525 ff_layout_encode_iostats(flo, xdr, args);
1526out:
1527 *start = cpu_to_be32((xdr->p - start - 1) * 4);
1528 dprintk("%s: Return\n", __func__);
1529}
1530
1531static struct pnfs_layoutdriver_type flexfilelayout_type = {
1532 .id = LAYOUT_FLEX_FILES,
1533 .name = "LAYOUT_FLEX_FILES",
1534 .owner = THIS_MODULE,
1535 .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
1536 .free_layout_hdr = ff_layout_free_layout_hdr,
1537 .alloc_lseg = ff_layout_alloc_lseg,
1538 .free_lseg = ff_layout_free_lseg,
1539 .pg_read_ops = &ff_layout_pg_read_ops,
1540 .pg_write_ops = &ff_layout_pg_write_ops,
1541 .get_ds_info = ff_layout_get_ds_info,
1542 .free_deviceid_node = ff_layout_free_deveiceid_node,
1543 .mark_request_commit = ff_layout_mark_request_commit,
1544 .clear_request_commit = pnfs_generic_clear_request_commit,
1545 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1546 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1547 .commit_pagelist = ff_layout_commit_pagelist,
1548 .read_pagelist = ff_layout_read_pagelist,
1549 .write_pagelist = ff_layout_write_pagelist,
1550 .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
1551 .encode_layoutreturn = ff_layout_encode_layoutreturn,
1552};
1553
1554static int __init nfs4flexfilelayout_init(void)
1555{
1556 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
1557 __func__);
1558 return pnfs_register_layoutdriver(&flexfilelayout_type);
1559}
1560
1561static void __exit nfs4flexfilelayout_exit(void)
1562{
1563 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
1564 __func__);
1565 pnfs_unregister_layoutdriver(&flexfilelayout_type);
1566}
1567
1568MODULE_ALIAS("nfs-layouttype4-4");
1569
1570MODULE_LICENSE("GPL");
1571MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
1572
1573module_init(nfs4flexfilelayout_init);
1574module_exit(nfs4flexfilelayout_exit);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000000..070f20445b2d
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,155 @@
1/*
2 * NFSv4 flexfile layout driver data structures.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
10#define FS_NFS_NFS4FLEXFILELAYOUT_H
11
12#include "../pnfs.h"
13
14/* XXX: Let's filter out insanely large mirror count for now to avoid oom
15 * due to network error etc. */
16#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
17
/*
 * One NFS version entry advertised for a data server.  The fields are
 * filled in, in this order, from five consecutive 32-bit XDR words by
 * nfs4_ff_alloc_deviceid_node().
 */
struct nfs4_ff_ds_version {
	u32 version;		/* NFS major version */
	u32 minor_version;	/* NFS minor version */
	u32 rsize;		/* read size, capped at NFS_MAX_FILE_IO_SIZE */
	u32 wsize;		/* write size, capped at NFS_MAX_FILE_IO_SIZE */
	bool tightly_coupled;	/* non-zero XDR word decodes to true */
};
25
/*
 * Flexfile view of a pNFS device.
 *
 * chained in global deviceid hlist
 */
struct nfs4_ff_layout_ds {
	struct nfs4_deviceid_node id_node;	/* embedded generic node; container_of() anchor */
	u32 ds_versions_cnt;			/* number of entries in ds_versions */
	struct nfs4_ff_ds_version *ds_versions;	/* allocated array of version entries */
	struct nfs4_pnfs_ds *ds;		/* data server, from nfs4_pnfs_ds_add() */
};
33
/*
 * One recorded data server I/O error, covering the byte range
 * [offset, offset + length).  Records with equal status, opnum, stateid
 * and deviceid whose ranges touch are merged into one.
 */
struct nfs4_ff_layout_ds_err {
	struct list_head list; /* linked in mirror error_list */
	u64 offset;			/* start of the affected range */
	u64 length;			/* length of the affected range */
	int status;			/* error status that was recorded */
	enum nfs_opnum4 opnum;		/* operation that failed */
	nfs4_stateid stateid;		/* copy of the mirror's stateid */
	struct nfs4_deviceid deviceid;	/* copy of the mirror's device id */
};
43
/*
 * One mirror of a flexfile layout segment.
 */
struct nfs4_ff_layout_mirror {
	u32 ds_count;
	u32 efficiency;
	struct nfs4_ff_layout_ds *mirror_ds;	/* device of this mirror; may be NULL */
	u32 fh_versions_cnt;			/* presumably the number of entries in fh_versions - TODO confirm */
	struct nfs_fh *fh_versions;		/* file handles; only entry 0 is used so far */
	nfs4_stateid stateid;			/* copied into recorded DS errors */
	struct nfs4_string user_name;
	struct nfs4_string group_name;
	u32 uid;				/* (u32)-1 selects RPC_AUTH_NULL, else RPC_AUTH_UNIX */
	u32 gid;				/* used with uid to look up the DS credential */
	struct rpc_cred *cred;			/* cached credential for an NFSv3 DS, NULL until looked up */
	spinlock_t lock;
};
58
/*
 * Flexfile layout segment: the generic pNFS segment plus its mirrors.
 */
struct nfs4_ff_layout_segment {
	struct pnfs_layout_segment generic_hdr;	/* embedded generic segment; container_of() anchor */
	u64 stripe_unit;
	u32 mirror_array_cnt;			/* number of slots in mirror_array */
	struct nfs4_ff_layout_mirror **mirror_array; /* per-mirror pointers; a slot may be NULL */
};
65
/*
 * Flexfile layout header: the generic pNFS layout header plus the
 * per-layout commit info and the list of recorded DS errors.
 */
struct nfs4_flexfile_layout {
	struct pnfs_layout_hdr generic_hdr;	/* embedded generic header; container_of() anchor */
	struct pnfs_ds_commit_info commit_info;	/* returned by ff_layout_get_ds_info() */
	/* modified under generic_hdr.plh_inode->i_lock */
	struct list_head error_list; /* nfs4_ff_layout_ds_err */
};
71
72static inline struct nfs4_flexfile_layout *
73FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
74{
75 return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
76}
77
78static inline struct nfs4_ff_layout_segment *
79FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_ff_layout_segment,
83 generic_hdr);
84}
85
86static inline struct nfs4_deviceid_node *
87FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
88{
89 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
90 FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
91 FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
92 return NULL;
93 return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
94}
95
96static inline struct nfs4_ff_layout_ds *
97FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
98{
99 return container_of(node, struct nfs4_ff_layout_ds, id_node);
100}
101
102static inline struct nfs4_ff_layout_mirror *
103FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
104{
105 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
106 return NULL;
107 return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
108}
109
110static inline u32
111FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
112{
113 return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
114}
115
116static inline bool
117ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
118{
119 return nfs4_test_deviceid_unavailable(node);
120}
121
122static inline int
123nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
124{
125 return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
126}
127
/*
 * Decode the opaque device data in @pdev and build a flexfile device.
 * Returns NULL on failure.
 */
struct nfs4_ff_layout_ds *
nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
			    gfp_t gfp_flags);
/* Drop a reference on the deviceid node of @mirror_ds; NULL is ignored. */
void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
/* Release the data server of @mirror_ds and free the structure. */
void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
/*
 * Record a data server error for @mirror on the layout's error list,
 * merging it into an existing matching record when possible.
 * Returns 0, -EINVAL (mirror has no device) or -ENOMEM.
 */
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
			     u64 length, int status, enum nfs_opnum4 opnum,
			     gfp_t gfp_flags);
/*
 * Encode recorded DS errors into @xdr and report their number in @count.
 * The layoutreturn encoder calls this with the inode's i_lock held.
 */
int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
			      struct xdr_stream *xdr, int *count,
			      const struct pnfs_layout_range *range);
/* Return the file handle to use for mirror @mirror_idx, or NULL. */
struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);

/* Upon return, either ds is connected, or ds is NULL */
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
			  bool fail_return);

/* Callers test the result with IS_ERR(). */
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
				 u32 ds_idx,
				 struct nfs_client *ds_clp,
				 struct inode *inode);
/* Callers test the result with IS_ERR(). */
struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
				       u32 ds_idx, struct rpc_cred *mdscred);
/* Callers use a true result to retry the I/O through pNFS. */
bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
155#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000000..e2c01f204a95
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,552 @@
1/*
2 * Device operations for the pnfs nfs4 flexfile layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/vmalloc.h>
11#include <linux/module.h>
12#include <linux/sunrpc/addr.h>
13
14#include "../internal.h"
15#include "../nfs4session.h"
16#include "flexfilelayout.h"
17
18#define NFSDBG_FACILITY NFSDBG_PNFS_LD
19
/*
 * Timeout and retransmit settings handed to nfs4_pnfs_ds_connect() when
 * a data server connection is set up.
 */
static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
22
23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
24{
25 if (mirror_ds)
26 nfs4_put_deviceid_node(&mirror_ds->id_node);
27}
28
29void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
30{
31 nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
32 nfs4_pnfs_ds_put(mirror_ds->ds);
33 kfree(mirror_ds);
34}
35
36/* Decode opaque device data and construct new_ds using it */
37struct nfs4_ff_layout_ds *
38nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
39 gfp_t gfp_flags)
40{
41 struct xdr_stream stream;
42 struct xdr_buf buf;
43 struct page *scratch;
44 struct list_head dsaddrs;
45 struct nfs4_pnfs_ds_addr *da;
46 struct nfs4_ff_layout_ds *new_ds = NULL;
47 struct nfs4_ff_ds_version *ds_versions = NULL;
48 u32 mp_count;
49 u32 version_count;
50 __be32 *p;
51 int i, ret = -ENOMEM;
52
53 /* set up xdr stream */
54 scratch = alloc_page(gfp_flags);
55 if (!scratch)
56 goto out_err;
57
58 new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
59 if (!new_ds)
60 goto out_scratch;
61
62 nfs4_init_deviceid_node(&new_ds->id_node,
63 server,
64 &pdev->dev_id);
65 INIT_LIST_HEAD(&dsaddrs);
66
67 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
68 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
69
70 /* multipath count */
71 p = xdr_inline_decode(&stream, 4);
72 if (unlikely(!p))
73 goto out_err_drain_dsaddrs;
74 mp_count = be32_to_cpup(p);
75 dprintk("%s: multipath ds count %d\n", __func__, mp_count);
76
77 for (i = 0; i < mp_count; i++) {
78 /* multipath ds */
79 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
80 &stream, gfp_flags);
81 if (da)
82 list_add_tail(&da->da_node, &dsaddrs);
83 }
84 if (list_empty(&dsaddrs)) {
85 dprintk("%s: no suitable DS addresses found\n",
86 __func__);
87 ret = -ENOMEDIUM;
88 goto out_err_drain_dsaddrs;
89 }
90
91 /* version count */
92 p = xdr_inline_decode(&stream, 4);
93 if (unlikely(!p))
94 goto out_err_drain_dsaddrs;
95 version_count = be32_to_cpup(p);
96 dprintk("%s: version count %d\n", __func__, version_count);
97
98 ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
99 gfp_flags);
100 if (!ds_versions)
101 goto out_scratch;
102
103 for (i = 0; i < version_count; i++) {
104 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
105 * tightly_coupled(4) */
106 p = xdr_inline_decode(&stream, 20);
107 if (unlikely(!p))
108 goto out_err_drain_dsaddrs;
109 ds_versions[i].version = be32_to_cpup(p++);
110 ds_versions[i].minor_version = be32_to_cpup(p++);
111 ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
112 ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
113 ds_versions[i].tightly_coupled = be32_to_cpup(p);
114
115 if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
116 ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
117 if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
118 ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
119
120 if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
121 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
122 i, ds_versions[i].version,
123 ds_versions[i].minor_version);
124 ret = -EPROTONOSUPPORT;
125 goto out_err_drain_dsaddrs;
126 }
127
128 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
129 __func__, i, ds_versions[i].version,
130 ds_versions[i].minor_version,
131 ds_versions[i].rsize,
132 ds_versions[i].wsize,
133 ds_versions[i].tightly_coupled);
134 }
135
136 new_ds->ds_versions = ds_versions;
137 new_ds->ds_versions_cnt = version_count;
138
139 new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
140 if (!new_ds->ds)
141 goto out_err_drain_dsaddrs;
142
143 /* If DS was already in cache, free ds addrs */
144 while (!list_empty(&dsaddrs)) {
145 da = list_first_entry(&dsaddrs,
146 struct nfs4_pnfs_ds_addr,
147 da_node);
148 list_del_init(&da->da_node);
149 kfree(da->da_remotestr);
150 kfree(da);
151 }
152
153 __free_page(scratch);
154 return new_ds;
155
156out_err_drain_dsaddrs:
157 while (!list_empty(&dsaddrs)) {
158 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
159 da_node);
160 list_del_init(&da->da_node);
161 kfree(da->da_remotestr);
162 kfree(da);
163 }
164
165 kfree(ds_versions);
166out_scratch:
167 __free_page(scratch);
168out_err:
169 kfree(new_ds);
170
171 dprintk("%s ERROR: returning %d\n", __func__, ret);
172 return NULL;
173}
174
175static u64
176end_offset(u64 start, u64 len)
177{
178 u64 end;
179
180 end = start + len;
181 return end >= start ? end : NFS4_MAX_UINT64;
182}
183
184static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
185 u64 offset, u64 length)
186{
187 u64 end;
188
189 end = max_t(u64, end_offset(err->offset, err->length),
190 end_offset(offset, length));
191 err->offset = min_t(u64, err->offset, offset);
192 err->length = end - err->offset;
193}
194
195static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
196 u64 length, int status, enum nfs_opnum4 opnum,
197 nfs4_stateid *stateid,
198 struct nfs4_deviceid *deviceid)
199{
200 return err->status == status && err->opnum == opnum &&
201 nfs4_stateid_match(&err->stateid, stateid) &&
202 !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
203 end_offset(err->offset, err->length) >= offset &&
204 err->offset <= end_offset(offset, length);
205}
206
207static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
208 struct nfs4_ff_layout_ds_err *new)
209{
210 if (!ds_error_can_merge(old, new->offset, new->length, new->status,
211 new->opnum, &new->stateid, &new->deviceid))
212 return false;
213
214 extend_ds_error(old, new->offset, new->length);
215 return true;
216}
217
218static bool
219ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
220 struct nfs4_ff_layout_ds_err *dserr)
221{
222 struct nfs4_ff_layout_ds_err *err;
223
224 list_for_each_entry(err, &flo->error_list, list) {
225 if (merge_ds_error(err, dserr)) {
226 return true;
227 }
228 }
229
230 list_add(&dserr->list, &flo->error_list);
231 return false;
232}
233
234static bool
235ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
236 u64 length, int status, enum nfs_opnum4 opnum,
237 nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
238{
239 bool found = false;
240 struct nfs4_ff_layout_ds_err *err;
241
242 list_for_each_entry(err, &flo->error_list, list) {
243 if (ds_error_can_merge(err, offset, length, status, opnum,
244 stateid, deviceid)) {
245 found = true;
246 extend_ds_error(err, offset, length);
247 break;
248 }
249 }
250
251 return found;
252}
253
254int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
255 struct nfs4_ff_layout_mirror *mirror, u64 offset,
256 u64 length, int status, enum nfs_opnum4 opnum,
257 gfp_t gfp_flags)
258{
259 struct nfs4_ff_layout_ds_err *dserr;
260 bool needfree;
261
262 if (status == 0)
263 return 0;
264
265 if (mirror->mirror_ds == NULL)
266 return -EINVAL;
267
268 spin_lock(&flo->generic_hdr.plh_inode->i_lock);
269 if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
270 &mirror->stateid,
271 &mirror->mirror_ds->id_node.deviceid)) {
272 spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
273 return 0;
274 }
275 spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
276 dserr = kmalloc(sizeof(*dserr), gfp_flags);
277 if (!dserr)
278 return -ENOMEM;
279
280 INIT_LIST_HEAD(&dserr->list);
281 dserr->offset = offset;
282 dserr->length = length;
283 dserr->status = status;
284 dserr->opnum = opnum;
285 nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
286 memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
287 NFS4_DEVICEID4_SIZE);
288
289 spin_lock(&flo->generic_hdr.plh_inode->i_lock);
290 needfree = ff_layout_add_ds_error_locked(flo, dserr);
291 spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
292 if (needfree)
293 kfree(dserr);
294
295 return 0;
296}
297
298/* currently we only support AUTH_NONE and AUTH_SYS */
299static rpc_authflavor_t
300nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
301{
302 if (mirror->uid == (u32)-1)
303 return RPC_AUTH_NULL;
304 return RPC_AUTH_UNIX;
305}
306
307/* fetch cred for NFSv3 DS */
308static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
309 struct nfs4_pnfs_ds *ds)
310{
311 if (ds->ds_clp && !mirror->cred &&
312 mirror->mirror_ds->ds_versions[0].version == 3) {
313 struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
314 struct rpc_cred *cred;
315 struct auth_cred acred = {
316 .uid = make_kuid(&init_user_ns, mirror->uid),
317 .gid = make_kgid(&init_user_ns, mirror->gid),
318 };
319
320 /* AUTH_NULL ignores acred */
321 cred = auth->au_ops->lookup_cred(auth, &acred, 0);
322 if (IS_ERR(cred)) {
323 dprintk("%s: lookup_cred failed with %ld\n",
324 __func__, PTR_ERR(cred));
325 return PTR_ERR(cred);
326 } else {
327 mirror->cred = cred;
328 }
329 }
330 return 0;
331}
332
333struct nfs_fh *
334nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
335{
336 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
337 struct nfs_fh *fh = NULL;
338 struct nfs4_deviceid_node *devid;
339
340 if (mirror == NULL || mirror->mirror_ds == NULL ||
341 mirror->mirror_ds->ds == NULL) {
342 printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
343 __func__, mirror_idx);
344 if (mirror && mirror->mirror_ds) {
345 devid = &mirror->mirror_ds->id_node;
346 pnfs_generic_mark_devid_invalid(devid);
347 }
348 goto out;
349 }
350
351 /* FIXME: For now assume there is only 1 version available for the DS */
352 fh = &mirror->fh_versions[0];
353out:
354 return fh;
355}
356
357/* Upon return, either ds is connected, or ds is NULL */
358struct nfs4_pnfs_ds *
359nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
360 bool fail_return)
361{
362 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
363 struct nfs4_pnfs_ds *ds = NULL;
364 struct nfs4_deviceid_node *devid;
365 struct inode *ino = lseg->pls_layout->plh_inode;
366 struct nfs_server *s = NFS_SERVER(ino);
367 unsigned int max_payload;
368 rpc_authflavor_t flavor;
369
370 if (mirror == NULL || mirror->mirror_ds == NULL ||
371 mirror->mirror_ds->ds == NULL) {
372 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
373 __func__, ds_idx);
374 if (mirror && mirror->mirror_ds) {
375 devid = &mirror->mirror_ds->id_node;
376 pnfs_generic_mark_devid_invalid(devid);
377 }
378 goto out;
379 }
380
381 devid = &mirror->mirror_ds->id_node;
382 if (ff_layout_test_devid_unavailable(devid))
383 goto out;
384
385 ds = mirror->mirror_ds->ds;
386 /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
387 smp_rmb();
388 if (ds->ds_clp)
389 goto out;
390
391 flavor = nfs4_ff_layout_choose_authflavor(mirror);
392
393 /* FIXME: For now we assume the server sent only one version of NFS
394 * to use for the DS.
395 */
396 nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
397 dataserver_retrans,
398 mirror->mirror_ds->ds_versions[0].version,
399 mirror->mirror_ds->ds_versions[0].minor_version,
400 flavor);
401
402 /* connect success, check rsize/wsize limit */
403 if (ds->ds_clp) {
404 max_payload =
405 nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
406 NULL);
407 if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
408 mirror->mirror_ds->ds_versions[0].rsize = max_payload;
409 if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
410 mirror->mirror_ds->ds_versions[0].wsize = max_payload;
411 } else {
412 ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
413 mirror, lseg->pls_range.offset,
414 lseg->pls_range.length, NFS4ERR_NXIO,
415 OP_ILLEGAL, GFP_NOIO);
416 if (fail_return) {
417 pnfs_error_mark_layout_for_return(ino, lseg);
418 if (ff_layout_has_available_ds(lseg))
419 pnfs_set_retry_layoutget(lseg->pls_layout);
420 else
421 pnfs_clear_retry_layoutget(lseg->pls_layout);
422
423 } else {
424 if (ff_layout_has_available_ds(lseg))
425 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
426 &lseg->pls_layout->plh_flags);
427 else {
428 pnfs_error_mark_layout_for_return(ino, lseg);
429 pnfs_clear_retry_layoutget(lseg->pls_layout);
430 }
431 }
432 }
433
434 if (ff_layout_update_mirror_cred(mirror, ds))
435 ds = NULL;
436out:
437 return ds;
438}
439
440struct rpc_cred *
441ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
442 struct rpc_cred *mdscred)
443{
444 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
445 struct rpc_cred *cred = ERR_PTR(-EINVAL);
446
447 if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
448 goto out;
449
450 if (mirror && mirror->cred)
451 cred = mirror->cred;
452 else
453 cred = mdscred;
454out:
455 return cred;
456}
457
458/**
459* Find or create a DS rpc client with th MDS server rpc client auth flavor
460* in the nfs_client cl_ds_clients list.
461*/
462struct rpc_clnt *
463nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
464 struct nfs_client *ds_clp, struct inode *inode)
465{
466 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
467
468 switch (mirror->mirror_ds->ds_versions[0].version) {
469 case 3:
470 /* For NFSv3 DS, flavor is set when creating DS connections */
471 return ds_clp->cl_rpcclient;
472 case 4:
473 return nfs4_find_or_create_ds_client(ds_clp, inode);
474 default:
475 BUG();
476 }
477}
478
479static bool is_range_intersecting(u64 offset1, u64 length1,
480 u64 offset2, u64 length2)
481{
482 u64 end1 = end_offset(offset1, length1);
483 u64 end2 = end_offset(offset2, length2);
484
485 return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
486 (end2 == NFS4_MAX_UINT64 || end2 > offset1);
487}
488
489/* called with inode i_lock held */
490int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
491 struct xdr_stream *xdr, int *count,
492 const struct pnfs_layout_range *range)
493{
494 struct nfs4_ff_layout_ds_err *err, *n;
495 __be32 *p;
496
497 list_for_each_entry_safe(err, n, &flo->error_list, list) {
498 if (!is_range_intersecting(err->offset, err->length,
499 range->offset, range->length))
500 continue;
501 /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
502 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
503 */
504 p = xdr_reserve_space(xdr,
505 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
506 if (unlikely(!p))
507 return -ENOBUFS;
508 p = xdr_encode_hyper(p, err->offset);
509 p = xdr_encode_hyper(p, err->length);
510 p = xdr_encode_opaque_fixed(p, &err->stateid,
511 NFS4_STATEID_SIZE);
512 p = xdr_encode_opaque_fixed(p, &err->deviceid,
513 NFS4_DEVICEID4_SIZE);
514 *p++ = cpu_to_be32(err->status);
515 *p++ = cpu_to_be32(err->opnum);
516 *count += 1;
517 list_del(&err->list);
518 dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
519 __func__, err->offset, err->length, err->status,
520 err->opnum, *count);
521 kfree(err);
522 }
523
524 return 0;
525}
526
527bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
528{
529 struct nfs4_ff_layout_mirror *mirror;
530 struct nfs4_deviceid_node *devid;
531 int idx;
532
533 for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
534 mirror = FF_LAYOUT_COMP(lseg, idx);
535 if (mirror && mirror->mirror_ds) {
536 devid = &mirror->mirror_ds->id_node;
537 if (!ff_layout_test_devid_unavailable(devid))
538 return true;
539 }
540 }
541
542 return false;
543}
544
545module_param(dataserver_retrans, uint, 0644);
546MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
547 "retries a request before it attempts further "
548 " recovery action.");
549module_param(dataserver_timeo, uint, 0644);
550MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
551 "NFSv4.1 client waits for a response from a "
552 " data server before it retries an NFS request.");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2f5db844c172..857e2a99acc8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f
152 nfs_fattr_free_group_name(fattr); 152 nfs_fattr_free_group_name(fattr);
153} 153}
154 154
155static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) 155int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
156{ 156{
157 unsigned long val; 157 unsigned long val;
158 char buf[16]; 158 char buf[16];
@@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
166 *res = val; 166 *res = val;
167 return 1; 167 return 1;
168} 168}
169EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
169 170
170static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) 171static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
171{ 172{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2211f6ba8736..d2398c193bda 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -507,10 +507,15 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
507 attr->ia_valid &= ~ATTR_MODE; 507 attr->ia_valid &= ~ATTR_MODE;
508 508
509 if (attr->ia_valid & ATTR_SIZE) { 509 if (attr->ia_valid & ATTR_SIZE) {
510 loff_t i_size;
511
510 BUG_ON(!S_ISREG(inode->i_mode)); 512 BUG_ON(!S_ISREG(inode->i_mode));
511 513
512 if (attr->ia_size == i_size_read(inode)) 514 i_size = i_size_read(inode);
515 if (attr->ia_size == i_size)
513 attr->ia_valid &= ~ATTR_SIZE; 516 attr->ia_valid &= ~ATTR_SIZE;
517 else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
518 return -ETXTBSY;
514 } 519 }
515 520
516 /* Optimization: if the end result is no change, don't RPC */ 521 /* Optimization: if the end result is no change, don't RPC */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b6f34bfa6fe8..21469e6e3834 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -6,6 +6,7 @@
6#include <linux/mount.h> 6#include <linux/mount.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/crc32.h> 8#include <linux/crc32.h>
9#include <linux/nfs_page.h>
9 10
10#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) 11#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
11 12
@@ -187,9 +188,15 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
187 const struct sockaddr *ds_addr, 188 const struct sockaddr *ds_addr,
188 int ds_addrlen, int ds_proto, 189 int ds_addrlen, int ds_proto,
189 unsigned int ds_timeo, 190 unsigned int ds_timeo,
190 unsigned int ds_retrans); 191 unsigned int ds_retrans,
192 u32 minor_version,
193 rpc_authflavor_t au_flavor);
191extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, 194extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
192 struct inode *); 195 struct inode *);
196extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
197 const struct sockaddr *ds_addr, int ds_addrlen,
198 int ds_proto, unsigned int ds_timeo,
199 unsigned int ds_retrans, rpc_authflavor_t au_flavor);
193#ifdef CONFIG_PROC_FS 200#ifdef CONFIG_PROC_FS
194extern int __init nfs_fs_proc_init(void); 201extern int __init nfs_fs_proc_init(void);
195extern void nfs_fs_proc_exit(void); 202extern void nfs_fs_proc_exit(void);
@@ -242,9 +249,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
242void nfs_pgio_header_free(struct nfs_pgio_header *); 249void nfs_pgio_header_free(struct nfs_pgio_header *);
243void nfs_pgio_data_destroy(struct nfs_pgio_header *); 250void nfs_pgio_data_destroy(struct nfs_pgio_header *);
244int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); 251int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
245int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *, 252int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
246 const struct rpc_call_ops *, int, int); 253 struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
254 const struct rpc_call_ops *call_ops, int how, int flags);
247void nfs_free_request(struct nfs_page *req); 255void nfs_free_request(struct nfs_page *req);
256struct nfs_pgio_mirror *
257nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
248 258
249static inline void nfs_iocounter_init(struct nfs_io_counter *c) 259static inline void nfs_iocounter_init(struct nfs_io_counter *c)
250{ 260{
@@ -252,6 +262,12 @@ static inline void nfs_iocounter_init(struct nfs_io_counter *c)
252 atomic_set(&c->io_count, 0); 262 atomic_set(&c->io_count, 0);
253} 263}
254 264
265static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
266{
267 WARN_ON_ONCE(desc->pg_mirror_count < 1);
268 return desc->pg_mirror_count > 1;
269}
270
255/* nfs2xdr.c */ 271/* nfs2xdr.c */
256extern struct rpc_procinfo nfs_procedures[]; 272extern struct rpc_procinfo nfs_procedures[];
257extern int nfs2_decode_dirent(struct xdr_stream *, 273extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -375,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
375 391
376extern int __init register_nfs_fs(void); 392extern int __init register_nfs_fs(void);
377extern void __exit unregister_nfs_fs(void); 393extern void __exit unregister_nfs_fs(void);
378extern void nfs_sb_active(struct super_block *sb); 394extern bool nfs_sb_active(struct super_block *sb);
379extern void nfs_sb_deactive(struct super_block *sb); 395extern void nfs_sb_deactive(struct super_block *sb);
380 396
381/* namespace.c */ 397/* namespace.c */
@@ -427,6 +443,7 @@ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
427extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); 443extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
428extern int nfs_initiate_commit(struct rpc_clnt *clnt, 444extern int nfs_initiate_commit(struct rpc_clnt *clnt,
429 struct nfs_commit_data *data, 445 struct nfs_commit_data *data,
446 const struct nfs_rpc_ops *nfs_ops,
430 const struct rpc_call_ops *call_ops, 447 const struct rpc_call_ops *call_ops,
431 int how, int flags); 448 int how, int flags);
432extern void nfs_init_commit(struct nfs_commit_data *data, 449extern void nfs_init_commit(struct nfs_commit_data *data,
@@ -440,13 +457,15 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
440 struct nfs_commit_info *cinfo); 457 struct nfs_commit_info *cinfo);
441void nfs_mark_request_commit(struct nfs_page *req, 458void nfs_mark_request_commit(struct nfs_page *req,
442 struct pnfs_layout_segment *lseg, 459 struct pnfs_layout_segment *lseg,
443 struct nfs_commit_info *cinfo); 460 struct nfs_commit_info *cinfo,
461 u32 ds_commit_idx);
444int nfs_write_need_commit(struct nfs_pgio_header *); 462int nfs_write_need_commit(struct nfs_pgio_header *);
445int nfs_generic_commit_list(struct inode *inode, struct list_head *head, 463int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
446 int how, struct nfs_commit_info *cinfo); 464 int how, struct nfs_commit_info *cinfo);
447void nfs_retry_commit(struct list_head *page_list, 465void nfs_retry_commit(struct list_head *page_list,
448 struct pnfs_layout_segment *lseg, 466 struct pnfs_layout_segment *lseg,
449 struct nfs_commit_info *cinfo); 467 struct nfs_commit_info *cinfo,
468 u32 ds_commit_idx);
450void nfs_commitdata_release(struct nfs_commit_data *data); 469void nfs_commitdata_release(struct nfs_commit_data *data);
451void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, 470void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
452 struct nfs_commit_info *cinfo); 471 struct nfs_commit_info *cinfo);
@@ -457,6 +476,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
457 struct nfs_direct_req *dreq); 476 struct nfs_direct_req *dreq);
458int nfs_key_timeout_notify(struct file *filp, struct inode *inode); 477int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
459bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); 478bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
479void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
460 480
461#ifdef CONFIG_MIGRATION 481#ifdef CONFIG_MIGRATION
462extern int nfs_migrate_page(struct address_space *, 482extern int nfs_migrate_page(struct address_space *,
@@ -480,6 +500,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
480 inode_dio_wait(inode); 500 inode_dio_wait(inode);
481} 501}
482extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); 502extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
503extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
483 504
484/* nfs4proc.c */ 505/* nfs4proc.c */
485extern void __nfs4_read_done_cb(struct nfs_pgio_header *); 506extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -493,6 +514,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
493 struct nfs_client **result, 514 struct nfs_client **result,
494 struct rpc_cred *cred); 515 struct rpc_cred *cred);
495 516
517static inline struct inode *nfs_igrab_and_active(struct inode *inode)
518{
519 inode = igrab(inode);
520 if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
521 iput(inode);
522 inode = NULL;
523 }
524 return inode;
525}
526
527static inline void nfs_iput_and_deactive(struct inode *inode)
528{
529 if (inode != NULL) {
530 struct super_block *sb = inode->i_sb;
531
532 iput(inode);
533 nfs_sb_deactive(sb);
534 }
535}
536
496/* 537/*
497 * Determine the device name as a string 538 * Determine the device name as a string
498 */ 539 */
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5f61b83f4a1c..b4e03ed8599d 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -481,7 +481,8 @@ out_overflow:
481 * void; 481 * void;
482 * }; 482 * };
483 */ 483 */
484static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) 484static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
485 __u32 *op_status)
485{ 486{
486 enum nfs_stat status; 487 enum nfs_stat status;
487 int error; 488 int error;
@@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
489 error = decode_stat(xdr, &status); 490 error = decode_stat(xdr, &status);
490 if (unlikely(error)) 491 if (unlikely(error))
491 goto out; 492 goto out;
493 if (op_status)
494 *op_status = status;
492 if (status != NFS_OK) 495 if (status != NFS_OK)
493 goto out_default; 496 goto out_default;
494 error = decode_fattr(xdr, result); 497 error = decode_fattr(xdr, result);
@@ -808,7 +811,7 @@ out_default:
808static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, 811static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
809 struct nfs_fattr *result) 812 struct nfs_fattr *result)
810{ 813{
811 return decode_attrstat(xdr, result); 814 return decode_attrstat(xdr, result, NULL);
812} 815}
813 816
814static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, 817static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
865 error = decode_stat(xdr, &status); 868 error = decode_stat(xdr, &status);
866 if (unlikely(error)) 869 if (unlikely(error))
867 goto out; 870 goto out;
871 result->op_status = status;
868 if (status != NFS_OK) 872 if (status != NFS_OK)
869 goto out_default; 873 goto out_default;
870 error = decode_fattr(xdr, result->fattr); 874 error = decode_fattr(xdr, result->fattr);
@@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
882{ 886{
883 /* All NFSv2 writes are "file sync" writes */ 887 /* All NFSv2 writes are "file sync" writes */
884 result->verf->committed = NFS_FILE_SYNC; 888 result->verf->committed = NFS_FILE_SYNC;
885 return decode_attrstat(xdr, result->fattr); 889 return decode_attrstat(xdr, result->fattr, &result->op_status);
886} 890}
887 891
888/** 892/**
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 333ae4068506..e134d6548ab7 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver
30struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, 30struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
31 struct nfs_fattr *, rpc_authflavor_t); 31 struct nfs_fattr *, rpc_authflavor_t);
32 32
33/* nfs3super.c */
34extern struct nfs_subversion nfs_v3;
33 35
34#endif /* __LINUX_FS_NFS_NFS3_FS_H */ 36#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 8c1b437c5403..9e9fa347a948 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,5 +1,6 @@
1#include <linux/nfs_fs.h> 1#include <linux/nfs_fs.h>
2#include <linux/nfs_mount.h> 2#include <linux/nfs_mount.h>
3#include <linux/sunrpc/addr.h>
3#include "internal.h" 4#include "internal.h"
4#include "nfs3_fs.h" 5#include "nfs3_fs.h"
5 6
@@ -64,3 +65,43 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
64 nfs_init_server_aclclient(server); 65 nfs_init_server_aclclient(server);
65 return server; 66 return server;
66} 67}
68
69/*
70 * Set up a pNFS Data Server client over NFSv3.
71 *
72 * Return any existing nfs_client that matches server address,port,version
73 * and minorversion.
74 *
75 * For a new nfs_client, use a soft mount (default), a low retrans and a
76 * low timeout interval so that if a connection is lost, we retry through
77 * the MDS.
78 */
79struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
80 const struct sockaddr *ds_addr, int ds_addrlen,
81 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
82 rpc_authflavor_t au_flavor)
83{
84 struct nfs_client_initdata cl_init = {
85 .addr = ds_addr,
86 .addrlen = ds_addrlen,
87 .nfs_mod = &nfs_v3,
88 .proto = ds_proto,
89 .net = mds_clp->cl_net,
90 };
91 struct rpc_timeout ds_timeout;
92 struct nfs_client *clp;
93 char buf[INET6_ADDRSTRLEN + 1];
94
95 /* fake a hostname because lockd wants it */
96 if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
97 return ERR_PTR(-EINVAL);
98 cl_init.hostname = buf;
99
100 /* Use the MDS nfs_client cl_ipaddr. */
101 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
102 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
103 au_flavor);
104
105 return clp;
106}
107EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 524f9f837408..78e557c3ab87 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -800,6 +800,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
800{ 800{
801 struct inode *inode = hdr->inode; 801 struct inode *inode = hdr->inode;
802 802
803 if (hdr->pgio_done_cb != NULL)
804 return hdr->pgio_done_cb(task, hdr);
805
803 if (nfs3_async_handle_jukebox(task, inode)) 806 if (nfs3_async_handle_jukebox(task, inode))
804 return -EAGAIN; 807 return -EAGAIN;
805 808
@@ -825,6 +828,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
825{ 828{
826 struct inode *inode = hdr->inode; 829 struct inode *inode = hdr->inode;
827 830
831 if (hdr->pgio_done_cb != NULL)
832 return hdr->pgio_done_cb(task, hdr);
833
828 if (nfs3_async_handle_jukebox(task, inode)) 834 if (nfs3_async_handle_jukebox(task, inode))
829 return -EAGAIN; 835 return -EAGAIN;
830 if (task->tk_status >= 0) 836 if (task->tk_status >= 0)
@@ -845,6 +851,9 @@ static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commi
845 851
846static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data) 852static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
847{ 853{
854 if (data->commit_done_cb != NULL)
855 return data->commit_done_cb(task, data);
856
848 if (nfs3_async_handle_jukebox(task, data->inode)) 857 if (nfs3_async_handle_jukebox(task, data->inode))
849 return -EAGAIN; 858 return -EAGAIN;
850 nfs_refresh_inode(data->inode, data->res.fattr); 859 nfs_refresh_inode(data->inode, data->res.fattr);
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 6af29c2da352..5c4394e4656b 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -7,7 +7,7 @@
7#include "nfs3_fs.h" 7#include "nfs3_fs.h"
8#include "nfs.h" 8#include "nfs.h"
9 9
10static struct nfs_subversion nfs_v3 = { 10struct nfs_subversion nfs_v3 = {
11 .owner = THIS_MODULE, 11 .owner = THIS_MODULE,
12 .nfs_fs = &nfs_fs_type, 12 .nfs_fs = &nfs_fs_type,
13 .rpc_vers = &nfs_version3, 13 .rpc_vers = &nfs_version3,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 8f4cbe7f4aa8..2a932fdc57cb 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1636 error = decode_post_op_attr(xdr, result->fattr); 1636 error = decode_post_op_attr(xdr, result->fattr);
1637 if (unlikely(error)) 1637 if (unlikely(error))
1638 goto out; 1638 goto out;
1639 result->op_status = status;
1639 if (status != NFS3_OK) 1640 if (status != NFS3_OK)
1640 goto out_status; 1641 goto out_status;
1641 error = decode_read3resok(xdr, result); 1642 error = decode_read3resok(xdr, result);
@@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
1708 error = decode_wcc_data(xdr, result->fattr); 1709 error = decode_wcc_data(xdr, result->fattr);
1709 if (unlikely(error)) 1710 if (unlikely(error))
1710 goto out; 1711 goto out;
1712 result->op_status = status;
1711 if (status != NFS3_OK) 1713 if (status != NFS3_OK)
1712 goto out_status; 1714 goto out_status;
1713 error = decode_write3resok(xdr, result); 1715 error = decode_write3resok(xdr, result);
@@ -2323,6 +2325,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
2323 error = decode_wcc_data(xdr, result->fattr); 2325 error = decode_wcc_data(xdr, result->fattr);
2324 if (unlikely(error)) 2326 if (unlikely(error))
2325 goto out; 2327 goto out;
2328 result->op_status = status;
2326 if (status != NFS3_OK) 2329 if (status != NFS3_OK)
2327 goto out_status; 2330 goto out_status;
2328 error = decode_writeverf3(xdr, &result->verf->verifier); 2331 error = decode_writeverf3(xdr, &result->verf->verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a08178764cf9..fdef424b0cd3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
44#define NFS4_RENEW_TIMEOUT 0x01 44#define NFS4_RENEW_TIMEOUT 0x01
45#define NFS4_RENEW_DELEGATION_CB 0x02 45#define NFS4_RENEW_DELEGATION_CB 0x02
46 46
47struct nfs_seqid_counter;
47struct nfs4_minor_version_ops { 48struct nfs4_minor_version_ops {
48 u32 minor_version; 49 u32 minor_version;
49 unsigned init_caps; 50 unsigned init_caps;
@@ -56,6 +57,8 @@ struct nfs4_minor_version_ops {
56 struct nfs_fsinfo *); 57 struct nfs_fsinfo *);
57 void (*free_lock_state)(struct nfs_server *, 58 void (*free_lock_state)(struct nfs_server *,
58 struct nfs4_lock_state *); 59 struct nfs4_lock_state *);
60 struct nfs_seqid *
61 (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
59 const struct rpc_call_ops *call_sync_ops; 62 const struct rpc_call_ops *call_sync_ops;
60 const struct nfs4_state_recovery_ops *reboot_recovery_ops; 63 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
61 const struct nfs4_state_recovery_ops *nograce_recovery_ops; 64 const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -443,6 +446,12 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
443extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); 446extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
444extern void nfs_release_seqid(struct nfs_seqid *seqid); 447extern void nfs_release_seqid(struct nfs_seqid *seqid);
445extern void nfs_free_seqid(struct nfs_seqid *seqid); 448extern void nfs_free_seqid(struct nfs_seqid *seqid);
449extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
450 struct nfs4_sequence_args *args,
451 struct nfs4_sequence_res *res,
452 struct rpc_task *task);
453extern int nfs4_sequence_done(struct rpc_task *task,
454 struct nfs4_sequence_res *res);
446 455
447extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); 456extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
448 457
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 706ad10b8186..8646af9b11d2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -849,14 +849,15 @@ error:
849 */ 849 */
850struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, 850struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
851 const struct sockaddr *ds_addr, int ds_addrlen, 851 const struct sockaddr *ds_addr, int ds_addrlen,
852 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) 852 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
853 u32 minor_version, rpc_authflavor_t au_flavor)
853{ 854{
854 struct nfs_client_initdata cl_init = { 855 struct nfs_client_initdata cl_init = {
855 .addr = ds_addr, 856 .addr = ds_addr,
856 .addrlen = ds_addrlen, 857 .addrlen = ds_addrlen,
857 .nfs_mod = &nfs_v4, 858 .nfs_mod = &nfs_v4,
858 .proto = ds_proto, 859 .proto = ds_proto,
859 .minorversion = mds_clp->cl_minorversion, 860 .minorversion = minor_version,
860 .net = mds_clp->cl_net, 861 .net = mds_clp->cl_net,
861 }; 862 };
862 struct rpc_timeout ds_timeout; 863 struct rpc_timeout ds_timeout;
@@ -874,7 +875,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
874 */ 875 */
875 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); 876 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
876 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, 877 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
877 mds_clp->cl_rpcclient->cl_auth->au_flavor); 878 au_flavor);
878 879
879 dprintk("<-- %s %p\n", __func__, clp); 880 dprintk("<-- %s %p\n", __func__, clp);
880 return clp; 881 return clp;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c347705b0161..2e7c9f7a6f7c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -495,12 +495,11 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
495 args->sa_privileged = 1; 495 args->sa_privileged = 1;
496} 496}
497 497
498static int nfs40_setup_sequence(const struct nfs_server *server, 498int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
499 struct nfs4_sequence_args *args, 499 struct nfs4_sequence_args *args,
500 struct nfs4_sequence_res *res, 500 struct nfs4_sequence_res *res,
501 struct rpc_task *task) 501 struct rpc_task *task)
502{ 502{
503 struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
504 struct nfs4_slot *slot; 503 struct nfs4_slot *slot;
505 504
506 /* slot already allocated? */ 505 /* slot already allocated? */
@@ -535,6 +534,7 @@ out_sleep:
535 spin_unlock(&tbl->slot_tbl_lock); 534 spin_unlock(&tbl->slot_tbl_lock);
536 return -EAGAIN; 535 return -EAGAIN;
537} 536}
537EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
538 538
539static int nfs40_sequence_done(struct rpc_task *task, 539static int nfs40_sequence_done(struct rpc_task *task,
540 struct nfs4_sequence_res *res) 540 struct nfs4_sequence_res *res)
@@ -694,8 +694,7 @@ out_retry:
694} 694}
695EXPORT_SYMBOL_GPL(nfs41_sequence_done); 695EXPORT_SYMBOL_GPL(nfs41_sequence_done);
696 696
697static int nfs4_sequence_done(struct rpc_task *task, 697int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
698 struct nfs4_sequence_res *res)
699{ 698{
700 if (res->sr_slot == NULL) 699 if (res->sr_slot == NULL)
701 return 1; 700 return 1;
@@ -703,6 +702,7 @@ static int nfs4_sequence_done(struct rpc_task *task,
703 return nfs40_sequence_done(task, res); 702 return nfs40_sequence_done(task, res);
704 return nfs41_sequence_done(task, res); 703 return nfs41_sequence_done(task, res);
705} 704}
705EXPORT_SYMBOL_GPL(nfs4_sequence_done);
706 706
707int nfs41_setup_sequence(struct nfs4_session *session, 707int nfs41_setup_sequence(struct nfs4_session *session,
708 struct nfs4_sequence_args *args, 708 struct nfs4_sequence_args *args,
@@ -777,7 +777,8 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
777 int ret = 0; 777 int ret = 0;
778 778
779 if (!session) 779 if (!session)
780 return nfs40_setup_sequence(server, args, res, task); 780 return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
781 args, res, task);
781 782
782 dprintk("--> %s clp %p session %p sr_slot %u\n", 783 dprintk("--> %s clp %p session %p sr_slot %u\n",
783 __func__, session->clp, session, res->sr_slot ? 784 __func__, session->clp, session, res->sr_slot ?
@@ -818,14 +819,16 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
818 struct nfs4_sequence_res *res, 819 struct nfs4_sequence_res *res,
819 struct rpc_task *task) 820 struct rpc_task *task)
820{ 821{
821 return nfs40_setup_sequence(server, args, res, task); 822 return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
823 args, res, task);
822} 824}
823 825
824static int nfs4_sequence_done(struct rpc_task *task, 826int nfs4_sequence_done(struct rpc_task *task,
825 struct nfs4_sequence_res *res) 827 struct nfs4_sequence_res *res)
826{ 828{
827 return nfs40_sequence_done(task, res); 829 return nfs40_sequence_done(task, res);
828} 830}
831EXPORT_SYMBOL_GPL(nfs4_sequence_done);
829 832
830#endif /* !CONFIG_NFS_V4_1 */ 833#endif /* !CONFIG_NFS_V4_1 */
831 834
@@ -937,6 +940,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
937 return true; 940 return true;
938} 941}
939 942
943static u32
944nfs4_map_atomic_open_share(struct nfs_server *server,
945 fmode_t fmode, int openflags)
946{
947 u32 res = 0;
948
949 switch (fmode & (FMODE_READ | FMODE_WRITE)) {
950 case FMODE_READ:
951 res = NFS4_SHARE_ACCESS_READ;
952 break;
953 case FMODE_WRITE:
954 res = NFS4_SHARE_ACCESS_WRITE;
955 break;
956 case FMODE_READ|FMODE_WRITE:
957 res = NFS4_SHARE_ACCESS_BOTH;
958 }
959 if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
960 goto out;
961 /* Want no delegation if we're using O_DIRECT */
962 if (openflags & O_DIRECT)
963 res |= NFS4_SHARE_WANT_NO_DELEG;
964out:
965 return res;
966}
967
940static enum open_claim_type4 968static enum open_claim_type4
941nfs4_map_atomic_open_claim(struct nfs_server *server, 969nfs4_map_atomic_open_claim(struct nfs_server *server,
942 enum open_claim_type4 claim) 970 enum open_claim_type4 claim)
@@ -977,6 +1005,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
977 struct dentry *parent = dget_parent(dentry); 1005 struct dentry *parent = dget_parent(dentry);
978 struct inode *dir = parent->d_inode; 1006 struct inode *dir = parent->d_inode;
979 struct nfs_server *server = NFS_SERVER(dir); 1007 struct nfs_server *server = NFS_SERVER(dir);
1008 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
980 struct nfs4_opendata *p; 1009 struct nfs4_opendata *p;
981 1010
982 p = kzalloc(sizeof(*p), gfp_mask); 1011 p = kzalloc(sizeof(*p), gfp_mask);
@@ -987,8 +1016,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
987 if (IS_ERR(p->f_label)) 1016 if (IS_ERR(p->f_label))
988 goto err_free_p; 1017 goto err_free_p;
989 1018
990 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); 1019 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
991 if (p->o_arg.seqid == NULL) 1020 p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
1021 if (IS_ERR(p->o_arg.seqid))
992 goto err_free_label; 1022 goto err_free_label;
993 nfs_sb_active(dentry->d_sb); 1023 nfs_sb_active(dentry->d_sb);
994 p->dentry = dget(dentry); 1024 p->dentry = dget(dentry);
@@ -997,6 +1027,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
997 atomic_inc(&sp->so_count); 1027 atomic_inc(&sp->so_count);
998 p->o_arg.open_flags = flags; 1028 p->o_arg.open_flags = flags;
999 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); 1029 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
1030 p->o_arg.share_access = nfs4_map_atomic_open_share(server,
1031 fmode, flags);
1000 /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS 1032 /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
1001 * will return permission denied for all bits until close */ 1033 * will return permission denied for all bits until close */
1002 if (!(flags & O_EXCL)) { 1034 if (!(flags & O_EXCL)) {
@@ -1167,6 +1199,16 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state,
1167 return false; 1199 return false;
1168} 1200}
1169 1201
1202static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
1203{
1204 if (state->n_wronly)
1205 set_bit(NFS_O_WRONLY_STATE, &state->flags);
1206 if (state->n_rdonly)
1207 set_bit(NFS_O_RDONLY_STATE, &state->flags);
1208 if (state->n_rdwr)
1209 set_bit(NFS_O_RDWR_STATE, &state->flags);
1210}
1211
1170static void nfs_clear_open_stateid_locked(struct nfs4_state *state, 1212static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1171 nfs4_stateid *stateid, fmode_t fmode) 1213 nfs4_stateid *stateid, fmode_t fmode)
1172{ 1214{
@@ -1185,8 +1227,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1185 } 1227 }
1186 if (stateid == NULL) 1228 if (stateid == NULL)
1187 return; 1229 return;
1188 if (!nfs_need_update_open_stateid(state, stateid)) 1230 /* Handle races with OPEN */
1231 if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
1232 !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
1233 nfs_resync_open_stateid_locked(state);
1189 return; 1234 return;
1235 }
1190 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1236 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1191 nfs4_stateid_copy(&state->stateid, stateid); 1237 nfs4_stateid_copy(&state->stateid, stateid);
1192 nfs4_stateid_copy(&state->open_stateid, stateid); 1238 nfs4_stateid_copy(&state->open_stateid, stateid);
@@ -1281,6 +1327,23 @@ no_delegation:
1281 return ret; 1327 return ret;
1282} 1328}
1283 1329
1330static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
1331 const nfs4_stateid *stateid)
1332{
1333 struct nfs4_state *state = lsp->ls_state;
1334 bool ret = false;
1335
1336 spin_lock(&state->state_lock);
1337 if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
1338 goto out_noupdate;
1339 if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
1340 goto out_noupdate;
1341 nfs4_stateid_copy(&lsp->ls_stateid, stateid);
1342 ret = true;
1343out_noupdate:
1344 spin_unlock(&state->state_lock);
1345 return ret;
1346}
1284 1347
1285static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode) 1348static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
1286{ 1349{
@@ -1679,8 +1742,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1679{ 1742{
1680 struct nfs4_opendata *data = calldata; 1743 struct nfs4_opendata *data = calldata;
1681 1744
1682 nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, 1745 nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl,
1683 &data->c_res.seq_res, task); 1746 &data->c_arg.seq_args, &data->c_res.seq_res, task);
1684} 1747}
1685 1748
1686static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1749static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -2587,6 +2650,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2587 case -NFS4ERR_OLD_STATEID: 2650 case -NFS4ERR_OLD_STATEID:
2588 case -NFS4ERR_BAD_STATEID: 2651 case -NFS4ERR_BAD_STATEID:
2589 case -NFS4ERR_EXPIRED: 2652 case -NFS4ERR_EXPIRED:
2653 if (!nfs4_stateid_match(&calldata->arg.stateid,
2654 &state->stateid)) {
2655 rpc_restart_call_prepare(task);
2656 goto out_release;
2657 }
2590 if (calldata->arg.fmode == 0) 2658 if (calldata->arg.fmode == 0)
2591 break; 2659 break;
2592 default: 2660 default:
@@ -2619,6 +2687,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2619 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); 2687 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2620 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); 2688 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2621 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); 2689 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2690 nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
2622 /* Calculate the change in open mode */ 2691 /* Calculate the change in open mode */
2623 calldata->arg.fmode = 0; 2692 calldata->arg.fmode = 0;
2624 if (state->n_rdwr == 0) { 2693 if (state->n_rdwr == 0) {
@@ -2653,6 +2722,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2653 goto out_wait; 2722 goto out_wait;
2654 } 2723 }
2655 } 2724 }
2725 calldata->arg.share_access =
2726 nfs4_map_atomic_open_share(NFS_SERVER(inode),
2727 calldata->arg.fmode, 0);
2656 2728
2657 nfs_fattr_init(calldata->res.fattr); 2729 nfs_fattr_init(calldata->res.fattr);
2658 calldata->timestamp = jiffies; 2730 calldata->timestamp = jiffies;
@@ -2675,45 +2747,10 @@ static const struct rpc_call_ops nfs4_close_ops = {
2675 .rpc_release = nfs4_free_closedata, 2747 .rpc_release = nfs4_free_closedata,
2676}; 2748};
2677 2749
2678static bool nfs4_state_has_opener(struct nfs4_state *state)
2679{
2680 /* first check existing openers */
2681 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2682 state->n_rdonly != 0)
2683 return true;
2684
2685 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2686 state->n_wronly != 0)
2687 return true;
2688
2689 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2690 state->n_rdwr != 0)
2691 return true;
2692
2693 return false;
2694}
2695
2696static bool nfs4_roc(struct inode *inode) 2750static bool nfs4_roc(struct inode *inode)
2697{ 2751{
2698 struct nfs_inode *nfsi = NFS_I(inode); 2752 if (!nfs_have_layout(inode))
2699 struct nfs_open_context *ctx;
2700 struct nfs4_state *state;
2701
2702 spin_lock(&inode->i_lock);
2703 list_for_each_entry(ctx, &nfsi->open_files, list) {
2704 state = ctx->state;
2705 if (state == NULL)
2706 continue;
2707 if (nfs4_state_has_opener(state)) {
2708 spin_unlock(&inode->i_lock);
2709 return false;
2710 }
2711 }
2712 spin_unlock(&inode->i_lock);
2713
2714 if (nfs4_check_delegation(inode, FMODE_READ))
2715 return false; 2753 return false;
2716
2717 return pnfs_roc(inode); 2754 return pnfs_roc(inode);
2718} 2755}
2719 2756
@@ -2731,6 +2768,7 @@ static bool nfs4_roc(struct inode *inode)
2731int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) 2768int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2732{ 2769{
2733 struct nfs_server *server = NFS_SERVER(state->inode); 2770 struct nfs_server *server = NFS_SERVER(state->inode);
2771 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
2734 struct nfs4_closedata *calldata; 2772 struct nfs4_closedata *calldata;
2735 struct nfs4_state_owner *sp = state->owner; 2773 struct nfs4_state_owner *sp = state->owner;
2736 struct rpc_task *task; 2774 struct rpc_task *task;
@@ -2757,10 +2795,10 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2757 calldata->inode = state->inode; 2795 calldata->inode = state->inode;
2758 calldata->state = state; 2796 calldata->state = state;
2759 calldata->arg.fh = NFS_FH(state->inode); 2797 calldata->arg.fh = NFS_FH(state->inode);
2760 calldata->arg.stateid = &state->open_stateid;
2761 /* Serialization for the sequence id */ 2798 /* Serialization for the sequence id */
2762 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); 2799 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
2763 if (calldata->arg.seqid == NULL) 2800 calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
2801 if (IS_ERR(calldata->arg.seqid))
2764 goto out_free_calldata; 2802 goto out_free_calldata;
2765 calldata->arg.fmode = 0; 2803 calldata->arg.fmode = 0;
2766 calldata->arg.bitmask = server->cache_consistency_bitmask; 2804 calldata->arg.bitmask = server->cache_consistency_bitmask;
@@ -5137,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5137static void nfs4_delegreturn_release(void *calldata) 5175static void nfs4_delegreturn_release(void *calldata)
5138{ 5176{
5139 struct nfs4_delegreturndata *data = calldata; 5177 struct nfs4_delegreturndata *data = calldata;
5178 struct inode *inode = data->inode;
5140 5179
5141 if (data->roc) 5180 if (inode) {
5142 pnfs_roc_release(data->inode); 5181 if (data->roc)
5182 pnfs_roc_release(inode);
5183 nfs_iput_and_deactive(inode);
5184 }
5143 kfree(calldata); 5185 kfree(calldata);
5144} 5186}
5145 5187
@@ -5196,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5196 nfs_fattr_init(data->res.fattr); 5238 nfs_fattr_init(data->res.fattr);
5197 data->timestamp = jiffies; 5239 data->timestamp = jiffies;
5198 data->rpc_status = 0; 5240 data->rpc_status = 0;
5199 data->inode = inode; 5241 data->inode = nfs_igrab_and_active(inode);
5200 data->roc = list_empty(&NFS_I(inode)->open_files) ? 5242 if (data->inode)
5201 pnfs_roc(inode) : false; 5243 data->roc = nfs4_roc(inode);
5202 5244
5203 task_setup_data.callback_data = data; 5245 task_setup_data.callback_data = data;
5204 msg.rpc_argp = &data->args; 5246 msg.rpc_argp = &data->args;
@@ -5353,7 +5395,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
5353 p->arg.fl = &p->fl; 5395 p->arg.fl = &p->fl;
5354 p->arg.seqid = seqid; 5396 p->arg.seqid = seqid;
5355 p->res.seqid = seqid; 5397 p->res.seqid = seqid;
5356 p->arg.stateid = &lsp->ls_stateid;
5357 p->lsp = lsp; 5398 p->lsp = lsp;
5358 atomic_inc(&lsp->ls_count); 5399 atomic_inc(&lsp->ls_count);
5359 /* Ensure we don't close file until we're done freeing locks! */ 5400 /* Ensure we don't close file until we're done freeing locks! */
@@ -5380,14 +5421,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5380 return; 5421 return;
5381 switch (task->tk_status) { 5422 switch (task->tk_status) {
5382 case 0: 5423 case 0:
5383 nfs4_stateid_copy(&calldata->lsp->ls_stateid,
5384 &calldata->res.stateid);
5385 renew_lease(calldata->server, calldata->timestamp); 5424 renew_lease(calldata->server, calldata->timestamp);
5386 break; 5425 do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
5426 if (nfs4_update_lock_stateid(calldata->lsp,
5427 &calldata->res.stateid))
5428 break;
5387 case -NFS4ERR_BAD_STATEID: 5429 case -NFS4ERR_BAD_STATEID:
5388 case -NFS4ERR_OLD_STATEID: 5430 case -NFS4ERR_OLD_STATEID:
5389 case -NFS4ERR_STALE_STATEID: 5431 case -NFS4ERR_STALE_STATEID:
5390 case -NFS4ERR_EXPIRED: 5432 case -NFS4ERR_EXPIRED:
5433 if (!nfs4_stateid_match(&calldata->arg.stateid,
5434 &calldata->lsp->ls_stateid))
5435 rpc_restart_call_prepare(task);
5391 break; 5436 break;
5392 default: 5437 default:
5393 if (nfs4_async_handle_error(task, calldata->server, 5438 if (nfs4_async_handle_error(task, calldata->server,
@@ -5403,6 +5448,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
5403 5448
5404 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 5449 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
5405 goto out_wait; 5450 goto out_wait;
5451 nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
5406 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { 5452 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
5407 /* Note: exit _without_ running nfs4_locku_done */ 5453 /* Note: exit _without_ running nfs4_locku_done */
5408 goto out_no_action; 5454 goto out_no_action;
@@ -5473,6 +5519,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
5473 struct nfs_seqid *seqid; 5519 struct nfs_seqid *seqid;
5474 struct nfs4_lock_state *lsp; 5520 struct nfs4_lock_state *lsp;
5475 struct rpc_task *task; 5521 struct rpc_task *task;
5522 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
5476 int status = 0; 5523 int status = 0;
5477 unsigned char fl_flags = request->fl_flags; 5524 unsigned char fl_flags = request->fl_flags;
5478 5525
@@ -5496,9 +5543,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
5496 lsp = request->fl_u.nfs4_fl.owner; 5543 lsp = request->fl_u.nfs4_fl.owner;
5497 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) 5544 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
5498 goto out; 5545 goto out;
5499 seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); 5546 alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
5547 seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
5500 status = -ENOMEM; 5548 status = -ENOMEM;
5501 if (seqid == NULL) 5549 if (IS_ERR(seqid))
5502 goto out; 5550 goto out;
5503 task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); 5551 task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
5504 status = PTR_ERR(task); 5552 status = PTR_ERR(task);
@@ -5531,6 +5579,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
5531 struct nfs4_lockdata *p; 5579 struct nfs4_lockdata *p;
5532 struct inode *inode = lsp->ls_state->inode; 5580 struct inode *inode = lsp->ls_state->inode;
5533 struct nfs_server *server = NFS_SERVER(inode); 5581 struct nfs_server *server = NFS_SERVER(inode);
5582 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
5534 5583
5535 p = kzalloc(sizeof(*p), gfp_mask); 5584 p = kzalloc(sizeof(*p), gfp_mask);
5536 if (p == NULL) 5585 if (p == NULL)
@@ -5539,12 +5588,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
5539 p->arg.fh = NFS_FH(inode); 5588 p->arg.fh = NFS_FH(inode);
5540 p->arg.fl = &p->fl; 5589 p->arg.fl = &p->fl;
5541 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); 5590 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
5542 if (p->arg.open_seqid == NULL) 5591 if (IS_ERR(p->arg.open_seqid))
5543 goto out_free; 5592 goto out_free;
5544 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); 5593 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
5545 if (p->arg.lock_seqid == NULL) 5594 p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
5595 if (IS_ERR(p->arg.lock_seqid))
5546 goto out_free_seqid; 5596 goto out_free_seqid;
5547 p->arg.lock_stateid = &lsp->ls_stateid;
5548 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 5597 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
5549 p->arg.lock_owner.id = lsp->ls_seqid.owner_id; 5598 p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
5550 p->arg.lock_owner.s_dev = server->s_dev; 5599 p->arg.lock_owner.s_dev = server->s_dev;
@@ -5571,15 +5620,19 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
5571 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) 5620 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
5572 goto out_wait; 5621 goto out_wait;
5573 /* Do we need to do an open_to_lock_owner? */ 5622 /* Do we need to do an open_to_lock_owner? */
5574 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { 5623 if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
5575 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { 5624 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
5576 goto out_release_lock_seqid; 5625 goto out_release_lock_seqid;
5577 } 5626 }
5578 data->arg.open_stateid = &state->open_stateid; 5627 nfs4_stateid_copy(&data->arg.open_stateid,
5628 &state->open_stateid);
5579 data->arg.new_lock_owner = 1; 5629 data->arg.new_lock_owner = 1;
5580 data->res.open_seqid = data->arg.open_seqid; 5630 data->res.open_seqid = data->arg.open_seqid;
5581 } else 5631 } else {
5582 data->arg.new_lock_owner = 0; 5632 data->arg.new_lock_owner = 0;
5633 nfs4_stateid_copy(&data->arg.lock_stateid,
5634 &data->lsp->ls_stateid);
5635 }
5583 if (!nfs4_valid_open_stateid(state)) { 5636 if (!nfs4_valid_open_stateid(state)) {
5584 data->rpc_status = -EBADF; 5637 data->rpc_status = -EBADF;
5585 task->tk_action = NULL; 5638 task->tk_action = NULL;
@@ -5603,6 +5656,7 @@ out_wait:
5603static void nfs4_lock_done(struct rpc_task *task, void *calldata) 5656static void nfs4_lock_done(struct rpc_task *task, void *calldata)
5604{ 5657{
5605 struct nfs4_lockdata *data = calldata; 5658 struct nfs4_lockdata *data = calldata;
5659 struct nfs4_lock_state *lsp = data->lsp;
5606 5660
5607 dprintk("%s: begin!\n", __func__); 5661 dprintk("%s: begin!\n", __func__);
5608 5662
@@ -5610,18 +5664,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
5610 return; 5664 return;
5611 5665
5612 data->rpc_status = task->tk_status; 5666 data->rpc_status = task->tk_status;
5613 if (data->arg.new_lock_owner != 0) { 5667 switch (task->tk_status) {
5614 if (data->rpc_status == 0) 5668 case 0:
5615 nfs_confirm_seqid(&data->lsp->ls_seqid, 0); 5669 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
5616 else 5670 data->timestamp);
5617 goto out; 5671 if (data->arg.new_lock) {
5618 } 5672 data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
5619 if (data->rpc_status == 0) { 5673 if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
5620 nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); 5674 rpc_restart_call_prepare(task);
5621 set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); 5675 break;
5622 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); 5676 }
5677 }
5678 if (data->arg.new_lock_owner != 0) {
5679 nfs_confirm_seqid(&lsp->ls_seqid, 0);
5680 nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
5681 set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
5682 } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
5683 rpc_restart_call_prepare(task);
5684 break;
5685 case -NFS4ERR_BAD_STATEID:
5686 case -NFS4ERR_OLD_STATEID:
5687 case -NFS4ERR_STALE_STATEID:
5688 case -NFS4ERR_EXPIRED:
5689 if (data->arg.new_lock_owner != 0) {
5690 if (!nfs4_stateid_match(&data->arg.open_stateid,
5691 &lsp->ls_state->open_stateid))
5692 rpc_restart_call_prepare(task);
5693 } else if (!nfs4_stateid_match(&data->arg.lock_stateid,
5694 &lsp->ls_stateid))
5695 rpc_restart_call_prepare(task);
5623 } 5696 }
5624out:
5625 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); 5697 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
5626} 5698}
5627 5699
@@ -5702,7 +5774,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
5702 if (recovery_type == NFS_LOCK_RECLAIM) 5774 if (recovery_type == NFS_LOCK_RECLAIM)
5703 data->arg.reclaim = NFS_LOCK_RECLAIM; 5775 data->arg.reclaim = NFS_LOCK_RECLAIM;
5704 nfs4_set_sequence_privileged(&data->arg.seq_args); 5776 nfs4_set_sequence_privileged(&data->arg.seq_args);
5705 } 5777 } else
5778 data->arg.new_lock = 1;
5706 task = rpc_run_task(&task_setup_data); 5779 task = rpc_run_task(&task_setup_data);
5707 if (IS_ERR(task)) 5780 if (IS_ERR(task))
5708 return PTR_ERR(task); 5781 return PTR_ERR(task);
@@ -5826,10 +5899,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
5826 5899
5827static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 5900static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
5828{ 5901{
5829 struct nfs4_state_owner *sp = state->owner;
5830 struct nfs_inode *nfsi = NFS_I(state->inode); 5902 struct nfs_inode *nfsi = NFS_I(state->inode);
5831 unsigned char fl_flags = request->fl_flags; 5903 unsigned char fl_flags = request->fl_flags;
5832 unsigned int seq;
5833 int status = -ENOLCK; 5904 int status = -ENOLCK;
5834 5905
5835 if ((fl_flags & FL_POSIX) && 5906 if ((fl_flags & FL_POSIX) &&
@@ -5849,25 +5920,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
5849 /* ...but avoid races with delegation recall... */ 5920 /* ...but avoid races with delegation recall... */
5850 request->fl_flags = fl_flags & ~FL_SLEEP; 5921 request->fl_flags = fl_flags & ~FL_SLEEP;
5851 status = do_vfs_lock(request->fl_file, request); 5922 status = do_vfs_lock(request->fl_file, request);
5852 goto out_unlock; 5923 up_read(&nfsi->rwsem);
5853 }
5854 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5855 up_read(&nfsi->rwsem);
5856 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
5857 if (status != 0)
5858 goto out; 5924 goto out;
5859 down_read(&nfsi->rwsem);
5860 if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
5861 status = -NFS4ERR_DELAY;
5862 goto out_unlock;
5863 } 5925 }
5864 /* Note: we always want to sleep here! */
5865 request->fl_flags = fl_flags | FL_SLEEP;
5866 if (do_vfs_lock(request->fl_file, request) < 0)
5867 printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
5868 "manager!\n", __func__);
5869out_unlock:
5870 up_read(&nfsi->rwsem); 5926 up_read(&nfsi->rwsem);
5927 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
5871out: 5928out:
5872 request->fl_flags = fl_flags; 5929 request->fl_flags = fl_flags;
5873 return status; 5930 return status;
@@ -5974,8 +6031,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
5974{ 6031{
5975 struct nfs_release_lockowner_data *data = calldata; 6032 struct nfs_release_lockowner_data *data = calldata;
5976 struct nfs_server *server = data->server; 6033 struct nfs_server *server = data->server;
5977 nfs40_setup_sequence(server, &data->args.seq_args, 6034 nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
5978 &data->res.seq_res, task); 6035 &data->args.seq_args, &data->res.seq_res, task);
5979 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 6036 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
5980 data->timestamp = jiffies; 6037 data->timestamp = jiffies;
5981} 6038}
@@ -7537,6 +7594,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
7537 return; 7594 return;
7538 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, 7595 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
7539 NFS_I(lgp->args.inode)->layout, 7596 NFS_I(lgp->args.inode)->layout,
7597 &lgp->args.range,
7540 lgp->args.ctx->state)) { 7598 lgp->args.ctx->state)) {
7541 rpc_exit(task, NFS4_OK); 7599 rpc_exit(task, NFS4_OK);
7542 } 7600 }
@@ -7792,9 +7850,13 @@ static void nfs4_layoutreturn_release(void *calldata)
7792 spin_lock(&lo->plh_inode->i_lock); 7850 spin_lock(&lo->plh_inode->i_lock);
7793 if (lrp->res.lrs_present) 7851 if (lrp->res.lrs_present)
7794 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); 7852 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
7853 pnfs_clear_layoutreturn_waitbit(lo);
7854 clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
7855 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
7795 lo->plh_block_lgets--; 7856 lo->plh_block_lgets--;
7796 spin_unlock(&lo->plh_inode->i_lock); 7857 spin_unlock(&lo->plh_inode->i_lock);
7797 pnfs_put_layout_hdr(lrp->args.layout); 7858 pnfs_put_layout_hdr(lrp->args.layout);
7859 nfs_iput_and_deactive(lrp->inode);
7798 kfree(calldata); 7860 kfree(calldata);
7799 dprintk("<-- %s\n", __func__); 7861 dprintk("<-- %s\n", __func__);
7800} 7862}
@@ -7805,7 +7867,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
7805 .rpc_release = nfs4_layoutreturn_release, 7867 .rpc_release = nfs4_layoutreturn_release,
7806}; 7868};
7807 7869
7808int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) 7870int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
7809{ 7871{
7810 struct rpc_task *task; 7872 struct rpc_task *task;
7811 struct rpc_message msg = { 7873 struct rpc_message msg = {
@@ -7820,14 +7882,23 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
7820 .callback_ops = &nfs4_layoutreturn_call_ops, 7882 .callback_ops = &nfs4_layoutreturn_call_ops,
7821 .callback_data = lrp, 7883 .callback_data = lrp,
7822 }; 7884 };
7823 int status; 7885 int status = 0;
7824 7886
7825 dprintk("--> %s\n", __func__); 7887 dprintk("--> %s\n", __func__);
7888 if (!sync) {
7889 lrp->inode = nfs_igrab_and_active(lrp->args.inode);
7890 if (!lrp->inode) {
7891 nfs4_layoutreturn_release(lrp);
7892 return -EAGAIN;
7893 }
7894 task_setup_data.flags |= RPC_TASK_ASYNC;
7895 }
7826 nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); 7896 nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
7827 task = rpc_run_task(&task_setup_data); 7897 task = rpc_run_task(&task_setup_data);
7828 if (IS_ERR(task)) 7898 if (IS_ERR(task))
7829 return PTR_ERR(task); 7899 return PTR_ERR(task);
7830 status = task->tk_status; 7900 if (sync)
7901 status = task->tk_status;
7831 trace_nfs4_layoutreturn(lrp->args.inode, status); 7902 trace_nfs4_layoutreturn(lrp->args.inode, status);
7832 dprintk("<-- %s status=%d\n", __func__, status); 7903 dprintk("<-- %s status=%d\n", __func__, status);
7833 rpc_put_task(task); 7904 rpc_put_task(task);
@@ -7921,6 +7992,7 @@ static void nfs4_layoutcommit_release(void *calldata)
7921 nfs_post_op_update_inode_force_wcc(data->args.inode, 7992 nfs_post_op_update_inode_force_wcc(data->args.inode,
7922 data->res.fattr); 7993 data->res.fattr);
7923 put_rpccred(data->cred); 7994 put_rpccred(data->cred);
7995 nfs_iput_and_deactive(data->inode);
7924 kfree(data); 7996 kfree(data);
7925} 7997}
7926 7998
@@ -7945,7 +8017,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7945 .rpc_message = &msg, 8017 .rpc_message = &msg,
7946 .callback_ops = &nfs4_layoutcommit_ops, 8018 .callback_ops = &nfs4_layoutcommit_ops,
7947 .callback_data = data, 8019 .callback_data = data,
7948 .flags = RPC_TASK_ASYNC,
7949 }; 8020 };
7950 struct rpc_task *task; 8021 struct rpc_task *task;
7951 int status = 0; 8022 int status = 0;
@@ -7956,18 +8027,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7956 data->args.lastbytewritten, 8027 data->args.lastbytewritten,
7957 data->args.inode->i_ino); 8028 data->args.inode->i_ino);
7958 8029
8030 if (!sync) {
8031 data->inode = nfs_igrab_and_active(data->args.inode);
8032 if (data->inode == NULL) {
8033 nfs4_layoutcommit_release(data);
8034 return -EAGAIN;
8035 }
8036 task_setup_data.flags = RPC_TASK_ASYNC;
8037 }
7959 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 8038 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
7960 task = rpc_run_task(&task_setup_data); 8039 task = rpc_run_task(&task_setup_data);
7961 if (IS_ERR(task)) 8040 if (IS_ERR(task))
7962 return PTR_ERR(task); 8041 return PTR_ERR(task);
7963 if (sync == false) 8042 if (sync)
7964 goto out; 8043 status = task->tk_status;
7965 status = nfs4_wait_for_completion_rpc_task(task);
7966 if (status != 0)
7967 goto out;
7968 status = task->tk_status;
7969 trace_nfs4_layoutcommit(data->args.inode, status); 8044 trace_nfs4_layoutcommit(data->args.inode, status);
7970out:
7971 dprintk("%s: status %d\n", __func__, status); 8045 dprintk("%s: status %d\n", __func__, status);
7972 rpc_put_task(task); 8046 rpc_put_task(task);
7973 return status; 8047 return status;
@@ -8395,6 +8469,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
8395 .match_stateid = nfs4_match_stateid, 8469 .match_stateid = nfs4_match_stateid,
8396 .find_root_sec = nfs4_find_root_sec, 8470 .find_root_sec = nfs4_find_root_sec,
8397 .free_lock_state = nfs4_release_lockowner, 8471 .free_lock_state = nfs4_release_lockowner,
8472 .alloc_seqid = nfs_alloc_seqid,
8398 .call_sync_ops = &nfs40_call_sync_ops, 8473 .call_sync_ops = &nfs40_call_sync_ops,
8399 .reboot_recovery_ops = &nfs40_reboot_recovery_ops, 8474 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
8400 .nograce_recovery_ops = &nfs40_nograce_recovery_ops, 8475 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -8403,6 +8478,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
8403}; 8478};
8404 8479
8405#if defined(CONFIG_NFS_V4_1) 8480#if defined(CONFIG_NFS_V4_1)
8481static struct nfs_seqid *
8482nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
8483{
8484 return NULL;
8485}
8486
8406static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { 8487static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
8407 .minor_version = 1, 8488 .minor_version = 1,
8408 .init_caps = NFS_CAP_READDIRPLUS 8489 .init_caps = NFS_CAP_READDIRPLUS
@@ -8416,6 +8497,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
8416 .match_stateid = nfs41_match_stateid, 8497 .match_stateid = nfs41_match_stateid,
8417 .find_root_sec = nfs41_find_root_sec, 8498 .find_root_sec = nfs41_find_root_sec,
8418 .free_lock_state = nfs41_free_lock_state, 8499 .free_lock_state = nfs41_free_lock_state,
8500 .alloc_seqid = nfs_alloc_no_seqid,
8419 .call_sync_ops = &nfs41_call_sync_ops, 8501 .call_sync_ops = &nfs41_call_sync_ops,
8420 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 8502 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
8421 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 8503 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8442,6 +8524,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
8442 .find_root_sec = nfs41_find_root_sec, 8524 .find_root_sec = nfs41_find_root_sec,
8443 .free_lock_state = nfs41_free_lock_state, 8525 .free_lock_state = nfs41_free_lock_state,
8444 .call_sync_ops = &nfs41_call_sync_ops, 8526 .call_sync_ops = &nfs41_call_sync_ops,
8527 .alloc_seqid = nfs_alloc_no_seqid,
8445 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 8528 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
8446 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 8529 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
8447 .state_renewal_ops = &nfs41_state_renewal_ops, 8530 .state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a3bb22ab68c5..5ad908e9ce9c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1003,11 +1003,11 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
1003 struct nfs_seqid *new; 1003 struct nfs_seqid *new;
1004 1004
1005 new = kmalloc(sizeof(*new), gfp_mask); 1005 new = kmalloc(sizeof(*new), gfp_mask);
1006 if (new != NULL) { 1006 if (new == NULL)
1007 new->sequence = counter; 1007 return ERR_PTR(-ENOMEM);
1008 INIT_LIST_HEAD(&new->list); 1008 new->sequence = counter;
1009 new->task = NULL; 1009 INIT_LIST_HEAD(&new->list);
1010 } 1010 new->task = NULL;
1011 return new; 1011 return new;
1012} 1012}
1013 1013
@@ -1015,7 +1015,7 @@ void nfs_release_seqid(struct nfs_seqid *seqid)
1015{ 1015{
1016 struct nfs_seqid_counter *sequence; 1016 struct nfs_seqid_counter *sequence;
1017 1017
1018 if (list_empty(&seqid->list)) 1018 if (seqid == NULL || list_empty(&seqid->list))
1019 return; 1019 return;
1020 sequence = seqid->sequence; 1020 sequence = seqid->sequence;
1021 spin_lock(&sequence->lock); 1021 spin_lock(&sequence->lock);
@@ -1071,13 +1071,15 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
1071 1071
1072void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) 1072void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
1073{ 1073{
1074 struct nfs4_state_owner *sp = container_of(seqid->sequence, 1074 struct nfs4_state_owner *sp;
1075 struct nfs4_state_owner, so_seqid); 1075
1076 struct nfs_server *server = sp->so_server; 1076 if (seqid == NULL)
1077 return;
1077 1078
1079 sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
1078 if (status == -NFS4ERR_BAD_SEQID) 1080 if (status == -NFS4ERR_BAD_SEQID)
1079 nfs4_drop_state_owner(sp); 1081 nfs4_drop_state_owner(sp);
1080 if (!nfs4_has_session(server->nfs_client)) 1082 if (!nfs4_has_session(sp->so_server->nfs_client))
1081 nfs_increment_seqid(status, seqid); 1083 nfs_increment_seqid(status, seqid);
1082} 1084}
1083 1085
@@ -1088,14 +1090,18 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
1088 */ 1090 */
1089void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) 1091void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
1090{ 1092{
1091 nfs_increment_seqid(status, seqid); 1093 if (seqid != NULL)
1094 nfs_increment_seqid(status, seqid);
1092} 1095}
1093 1096
1094int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) 1097int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1095{ 1098{
1096 struct nfs_seqid_counter *sequence = seqid->sequence; 1099 struct nfs_seqid_counter *sequence;
1097 int status = 0; 1100 int status = 0;
1098 1101
1102 if (seqid == NULL)
1103 goto out;
1104 sequence = seqid->sequence;
1099 spin_lock(&sequence->lock); 1105 spin_lock(&sequence->lock);
1100 seqid->task = task; 1106 seqid->task = task;
1101 if (list_empty(&seqid->list)) 1107 if (list_empty(&seqid->list))
@@ -1106,6 +1112,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1106 status = -EAGAIN; 1112 status = -EAGAIN;
1107unlock: 1113unlock:
1108 spin_unlock(&sequence->lock); 1114 spin_unlock(&sequence->lock);
1115out:
1109 return status; 1116 return status;
1110} 1117}
1111 1118
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 6f340f02f2ba..48cea3c30e5d 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -346,6 +346,9 @@ out:
346 346
347static void __exit exit_nfs_v4(void) 347static void __exit exit_nfs_v4(void)
348{ 348{
349 /* Not called in the _init(), conditionally loaded */
350 nfs4_pnfs_v3_ds_connect_unload();
351
349 unregister_nfs_version(&nfs_v4); 352 unregister_nfs_version(&nfs_v4);
350 nfs4_unregister_sysctl(); 353 nfs4_unregister_sysctl();
351 nfs_idmap_quit(); 354 nfs_idmap_quit();
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cb4376b78ed9..e23a0a664e12 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -946,7 +946,10 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n)
946static void encode_nfs4_seqid(struct xdr_stream *xdr, 946static void encode_nfs4_seqid(struct xdr_stream *xdr,
947 const struct nfs_seqid *seqid) 947 const struct nfs_seqid *seqid)
948{ 948{
949 encode_uint32(xdr, seqid->sequence->counter); 949 if (seqid != NULL)
950 encode_uint32(xdr, seqid->sequence->counter);
951 else
952 encode_uint32(xdr, 0);
950} 953}
951 954
952static void encode_compound_hdr(struct xdr_stream *xdr, 955static void encode_compound_hdr(struct xdr_stream *xdr,
@@ -1125,7 +1128,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
1125{ 1128{
1126 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); 1129 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
1127 encode_nfs4_seqid(xdr, arg->seqid); 1130 encode_nfs4_seqid(xdr, arg->seqid);
1128 encode_nfs4_stateid(xdr, arg->stateid); 1131 encode_nfs4_stateid(xdr, &arg->stateid);
1129} 1132}
1130 1133
1131static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr) 1134static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
@@ -1301,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1301 *p = cpu_to_be32(args->new_lock_owner); 1304 *p = cpu_to_be32(args->new_lock_owner);
1302 if (args->new_lock_owner){ 1305 if (args->new_lock_owner){
1303 encode_nfs4_seqid(xdr, args->open_seqid); 1306 encode_nfs4_seqid(xdr, args->open_seqid);
1304 encode_nfs4_stateid(xdr, args->open_stateid); 1307 encode_nfs4_stateid(xdr, &args->open_stateid);
1305 encode_nfs4_seqid(xdr, args->lock_seqid); 1308 encode_nfs4_seqid(xdr, args->lock_seqid);
1306 encode_lockowner(xdr, &args->lock_owner); 1309 encode_lockowner(xdr, &args->lock_owner);
1307 } 1310 }
1308 else { 1311 else {
1309 encode_nfs4_stateid(xdr, args->lock_stateid); 1312 encode_nfs4_stateid(xdr, &args->lock_stateid);
1310 encode_nfs4_seqid(xdr, args->lock_seqid); 1313 encode_nfs4_seqid(xdr, args->lock_seqid);
1311 } 1314 }
1312} 1315}
@@ -1330,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1330 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); 1333 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
1331 encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); 1334 encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
1332 encode_nfs4_seqid(xdr, args->seqid); 1335 encode_nfs4_seqid(xdr, args->seqid);
1333 encode_nfs4_stateid(xdr, args->stateid); 1336 encode_nfs4_stateid(xdr, &args->stateid);
1334 p = reserve_space(xdr, 16); 1337 p = reserve_space(xdr, 16);
1335 p = xdr_encode_hyper(p, args->fl->fl_start); 1338 p = xdr_encode_hyper(p, args->fl->fl_start);
1336 xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1339 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
@@ -1348,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
1348 encode_string(xdr, name->len, name->name); 1351 encode_string(xdr, name->len, name->name);
1349} 1352}
1350 1353
1351static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) 1354static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
1352{ 1355{
1353 __be32 *p; 1356 __be32 *p;
1354 1357
1355 p = reserve_space(xdr, 8); 1358 p = reserve_space(xdr, 8);
1356 switch (fmode & (FMODE_READ|FMODE_WRITE)) { 1359 *p++ = cpu_to_be32(share_access);
1357 case FMODE_READ:
1358 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
1359 break;
1360 case FMODE_WRITE:
1361 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
1362 break;
1363 case FMODE_READ|FMODE_WRITE:
1364 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
1365 break;
1366 default:
1367 *p++ = cpu_to_be32(0);
1368 }
1369 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ 1360 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
1370} 1361}
1371 1362
@@ -1377,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1377 * owner 4 = 32 1368 * owner 4 = 32
1378 */ 1369 */
1379 encode_nfs4_seqid(xdr, arg->seqid); 1370 encode_nfs4_seqid(xdr, arg->seqid);
1380 encode_share_access(xdr, arg->fmode); 1371 encode_share_access(xdr, arg->share_access);
1381 p = reserve_space(xdr, 36); 1372 p = reserve_space(xdr, 36);
1382 p = xdr_encode_hyper(p, arg->clientid); 1373 p = xdr_encode_hyper(p, arg->clientid);
1383 *p++ = cpu_to_be32(24); 1374 *p++ = cpu_to_be32(24);
@@ -1530,9 +1521,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1530static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1521static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1531{ 1522{
1532 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); 1523 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
1533 encode_nfs4_stateid(xdr, arg->stateid); 1524 encode_nfs4_stateid(xdr, &arg->stateid);
1534 encode_nfs4_seqid(xdr, arg->seqid); 1525 encode_nfs4_seqid(xdr, arg->seqid);
1535 encode_share_access(xdr, arg->fmode); 1526 encode_share_access(xdr, arg->share_access);
1536} 1527}
1537 1528
1538static void 1529static void
@@ -1801,9 +1792,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1801 struct compound_hdr *hdr) 1792 struct compound_hdr *hdr)
1802{ 1793{
1803 __be32 *p; 1794 __be32 *p;
1804 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1805 uint32_t len;
1806 struct nfs_client *clp = args->client; 1795 struct nfs_client *clp = args->client;
1796 struct rpc_clnt *clnt = clp->cl_rpcclient;
1807 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); 1797 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
1808 u32 max_resp_sz_cached; 1798 u32 max_resp_sz_cached;
1809 1799
@@ -1814,11 +1804,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1814 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 1804 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
1815 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; 1805 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
1816 1806
1817 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1818 clp->cl_ipaddr);
1819
1820 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); 1807 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
1821 p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12); 1808 p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
1822 p = xdr_encode_hyper(p, clp->cl_clientid); 1809 p = xdr_encode_hyper(p, clp->cl_clientid);
1823 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1810 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1824 *p++ = cpu_to_be32(args->flags); /*flags */ 1811 *p++ = cpu_to_be32(args->flags); /*flags */
@@ -1847,7 +1834,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1847 1834
1848 /* authsys_parms rfc1831 */ 1835 /* authsys_parms rfc1831 */
1849 *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ 1836 *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
1850 p = xdr_encode_opaque(p, machine_name, len); 1837 p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
1851 *p++ = cpu_to_be32(0); /* UID */ 1838 *p++ = cpu_to_be32(0); /* UID */
1852 *p++ = cpu_to_be32(0); /* GID */ 1839 *p++ = cpu_to_be32(0); /* GID */
1853 *p = cpu_to_be32(0); /* No more gids */ 1840 *p = cpu_to_be32(0); /* No more gids */
@@ -2012,11 +1999,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
2012 p = reserve_space(xdr, 16); 1999 p = reserve_space(xdr, 16);
2013 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ 2000 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
2014 *p++ = cpu_to_be32(args->layout_type); 2001 *p++ = cpu_to_be32(args->layout_type);
2015 *p++ = cpu_to_be32(IOMODE_ANY); 2002 *p++ = cpu_to_be32(args->range.iomode);
2016 *p = cpu_to_be32(RETURN_FILE); 2003 *p = cpu_to_be32(RETURN_FILE);
2017 p = reserve_space(xdr, 16); 2004 p = reserve_space(xdr, 16);
2018 p = xdr_encode_hyper(p, 0); 2005 p = xdr_encode_hyper(p, args->range.offset);
2019 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); 2006 p = xdr_encode_hyper(p, args->range.length);
2020 spin_lock(&args->inode->i_lock); 2007 spin_lock(&args->inode->i_lock);
2021 encode_nfs4_stateid(xdr, &args->stateid); 2008 encode_nfs4_stateid(xdr, &args->stateid);
2022 spin_unlock(&args->inode->i_lock); 2009 spin_unlock(&args->inode->i_lock);
@@ -4936,20 +4923,13 @@ out_overflow:
4936 return -EIO; 4923 return -EIO;
4937} 4924}
4938 4925
4939static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 4926static int decode_rw_delegation(struct xdr_stream *xdr,
4927 uint32_t delegation_type,
4928 struct nfs_openres *res)
4940{ 4929{
4941 __be32 *p; 4930 __be32 *p;
4942 uint32_t delegation_type;
4943 int status; 4931 int status;
4944 4932
4945 p = xdr_inline_decode(xdr, 4);
4946 if (unlikely(!p))
4947 goto out_overflow;
4948 delegation_type = be32_to_cpup(p);
4949 if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
4950 res->delegation_type = 0;
4951 return 0;
4952 }
4953 status = decode_stateid(xdr, &res->delegation); 4933 status = decode_stateid(xdr, &res->delegation);
4954 if (unlikely(status)) 4934 if (unlikely(status))
4955 return status; 4935 return status;
@@ -4973,6 +4953,52 @@ out_overflow:
4973 return -EIO; 4953 return -EIO;
4974} 4954}
4975 4955
4956static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
4957{
4958 __be32 *p;
4959 uint32_t why_no_delegation;
4960
4961 p = xdr_inline_decode(xdr, 4);
4962 if (unlikely(!p))
4963 goto out_overflow;
4964 why_no_delegation = be32_to_cpup(p);
4965 switch (why_no_delegation) {
4966 case WND4_CONTENTION:
4967 case WND4_RESOURCE:
4968 xdr_inline_decode(xdr, 4);
4969 /* Ignore for now */
4970 }
4971 return 0;
4972out_overflow:
4973 print_overflow_msg(__func__, xdr);
4974 return -EIO;
4975}
4976
4977static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
4978{
4979 __be32 *p;
4980 uint32_t delegation_type;
4981
4982 p = xdr_inline_decode(xdr, 4);
4983 if (unlikely(!p))
4984 goto out_overflow;
4985 delegation_type = be32_to_cpup(p);
4986 res->delegation_type = 0;
4987 switch (delegation_type) {
4988 case NFS4_OPEN_DELEGATE_NONE:
4989 return 0;
4990 case NFS4_OPEN_DELEGATE_READ:
4991 case NFS4_OPEN_DELEGATE_WRITE:
4992 return decode_rw_delegation(xdr, delegation_type, res);
4993 case NFS4_OPEN_DELEGATE_NONE_EXT:
4994 return decode_no_delegation(xdr, res);
4995 }
4996 return -EIO;
4997out_overflow:
4998 print_overflow_msg(__func__, xdr);
4999 return -EIO;
5000}
5001
4976static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 5002static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
4977{ 5003{
4978 __be32 *p; 5004 __be32 *p;
@@ -6567,6 +6593,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6567 int status; 6593 int status;
6568 6594
6569 status = decode_compound_hdr(xdr, &hdr); 6595 status = decode_compound_hdr(xdr, &hdr);
6596 res->op_status = hdr.status;
6570 if (status) 6597 if (status)
6571 goto out; 6598 goto out;
6572 status = decode_sequence(xdr, &res->seq_res, rqstp); 6599 status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6592,6 +6619,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6592 int status; 6619 int status;
6593 6620
6594 status = decode_compound_hdr(xdr, &hdr); 6621 status = decode_compound_hdr(xdr, &hdr);
6622 res->op_status = hdr.status;
6595 if (status) 6623 if (status)
6596 goto out; 6624 goto out;
6597 status = decode_sequence(xdr, &res->seq_res, rqstp); 6625 status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6621,6 +6649,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6621 int status; 6649 int status;
6622 6650
6623 status = decode_compound_hdr(xdr, &hdr); 6651 status = decode_compound_hdr(xdr, &hdr);
6652 res->op_status = hdr.status;
6624 if (status) 6653 if (status)
6625 goto out; 6654 goto out;
6626 status = decode_sequence(xdr, &res->seq_res, rqstp); 6655 status = decode_sequence(xdr, &res->seq_res, rqstp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index cd3c910d2d12..9bc9f04fb7f6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -261,11 +261,11 @@ static int __init root_nfs_data(char *cmdline)
261 */ 261 */
262 len = snprintf(nfs_export_path, sizeof(nfs_export_path), 262 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
263 tmp, utsname()->nodename); 263 tmp, utsname()->nodename);
264 if (len > (int)sizeof(nfs_export_path)) 264 if (len >= (int)sizeof(nfs_export_path))
265 goto out_devnametoolong; 265 goto out_devnametoolong;
266 len = snprintf(nfs_root_device, sizeof(nfs_root_device), 266 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
267 "%pI4:%s", &servaddr, nfs_export_path); 267 "%pI4:%s", &servaddr, nfs_export_path);
268 if (len > (int)sizeof(nfs_root_device)) 268 if (len >= (int)sizeof(nfs_root_device))
269 goto out_devnametoolong; 269 goto out_devnametoolong;
270 270
271 retval = 0; 271 retval = 0;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9e5bc42180e4..24e1d7403c0b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
537static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, 537static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
538 struct nfs_page *prev, struct nfs_page *req) 538 struct nfs_page *prev, struct nfs_page *req)
539{ 539{
540 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
540 unsigned int size; 541 unsigned int size;
541 542
542 size = pnfs_generic_pg_test(pgio, prev, req); 543 size = pnfs_generic_pg_test(pgio, prev, req);
543 544
544 if (!size || pgio->pg_count + req->wb_bytes > 545 if (!size || mirror->pg_count + req->wb_bytes >
545 (unsigned long)pgio->pg_layout_private) 546 (unsigned long)pgio->pg_layout_private)
546 return 0; 547 return 0;
547 548
@@ -607,12 +608,14 @@ static const struct nfs_pageio_ops objio_pg_read_ops = {
607 .pg_init = objio_init_read, 608 .pg_init = objio_init_read,
608 .pg_test = objio_pg_test, 609 .pg_test = objio_pg_test,
609 .pg_doio = pnfs_generic_pg_readpages, 610 .pg_doio = pnfs_generic_pg_readpages,
611 .pg_cleanup = pnfs_generic_pg_cleanup,
610}; 612};
611 613
612static const struct nfs_pageio_ops objio_pg_write_ops = { 614static const struct nfs_pageio_ops objio_pg_write_ops = {
613 .pg_init = objio_init_write, 615 .pg_init = objio_init_write,
614 .pg_test = objio_pg_test, 616 .pg_test = objio_pg_test,
615 .pg_doio = pnfs_generic_pg_writepages, 617 .pg_doio = pnfs_generic_pg_writepages,
618 .pg_cleanup = pnfs_generic_pg_cleanup,
616}; 619};
617 620
618static struct pnfs_layoutdriver_type objlayout_type = { 621static struct pnfs_layoutdriver_type objlayout_type = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29c7f33c9cf1..d57190a0d533 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -42,21 +42,35 @@ static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
42 return p->pagevec != NULL; 42 return p->pagevec != NULL;
43} 43}
44 44
45struct nfs_pgio_mirror *
46nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
47{
48 return nfs_pgio_has_mirroring(desc) ?
49 &desc->pg_mirrors[desc->pg_mirror_idx] :
50 &desc->pg_mirrors[0];
51}
52EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
53
45void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, 54void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
46 struct nfs_pgio_header *hdr, 55 struct nfs_pgio_header *hdr,
47 void (*release)(struct nfs_pgio_header *hdr)) 56 void (*release)(struct nfs_pgio_header *hdr))
48{ 57{
49 hdr->req = nfs_list_entry(desc->pg_list.next); 58 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
59
60
61 hdr->req = nfs_list_entry(mirror->pg_list.next);
50 hdr->inode = desc->pg_inode; 62 hdr->inode = desc->pg_inode;
51 hdr->cred = hdr->req->wb_context->cred; 63 hdr->cred = hdr->req->wb_context->cred;
52 hdr->io_start = req_offset(hdr->req); 64 hdr->io_start = req_offset(hdr->req);
53 hdr->good_bytes = desc->pg_count; 65 hdr->good_bytes = mirror->pg_count;
54 hdr->dreq = desc->pg_dreq; 66 hdr->dreq = desc->pg_dreq;
55 hdr->layout_private = desc->pg_layout_private; 67 hdr->layout_private = desc->pg_layout_private;
56 hdr->release = release; 68 hdr->release = release;
57 hdr->completion_ops = desc->pg_completion_ops; 69 hdr->completion_ops = desc->pg_completion_ops;
58 if (hdr->completion_ops->init_hdr) 70 if (hdr->completion_ops->init_hdr)
59 hdr->completion_ops->init_hdr(hdr); 71 hdr->completion_ops->init_hdr(hdr);
72
73 hdr->pgio_mirror_idx = desc->pg_mirror_idx;
60} 74}
61EXPORT_SYMBOL_GPL(nfs_pgheader_init); 75EXPORT_SYMBOL_GPL(nfs_pgheader_init);
62 76
@@ -480,7 +494,10 @@ nfs_wait_on_request(struct nfs_page *req)
480size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, 494size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
481 struct nfs_page *prev, struct nfs_page *req) 495 struct nfs_page *prev, struct nfs_page *req)
482{ 496{
483 if (desc->pg_count > desc->pg_bsize) { 497 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
498
499
500 if (mirror->pg_count > mirror->pg_bsize) {
484 /* should never happen */ 501 /* should never happen */
485 WARN_ON_ONCE(1); 502 WARN_ON_ONCE(1);
486 return 0; 503 return 0;
@@ -490,11 +507,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
490 * Limit the request size so that we can still allocate a page array 507 * Limit the request size so that we can still allocate a page array
491 * for it without upsetting the slab allocator. 508 * for it without upsetting the slab allocator.
492 */ 509 */
493 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * 510 if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
494 sizeof(struct page) > PAGE_SIZE) 511 sizeof(struct page) > PAGE_SIZE)
495 return 0; 512 return 0;
496 513
497 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 514 return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
498} 515}
499EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 516EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
500 517
@@ -597,13 +614,14 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
597} 614}
598 615
599int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, 616int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
617 struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
600 const struct rpc_call_ops *call_ops, int how, int flags) 618 const struct rpc_call_ops *call_ops, int how, int flags)
601{ 619{
602 struct rpc_task *task; 620 struct rpc_task *task;
603 struct rpc_message msg = { 621 struct rpc_message msg = {
604 .rpc_argp = &hdr->args, 622 .rpc_argp = &hdr->args,
605 .rpc_resp = &hdr->res, 623 .rpc_resp = &hdr->res,
606 .rpc_cred = hdr->cred, 624 .rpc_cred = cred,
607 }; 625 };
608 struct rpc_task_setup task_setup_data = { 626 struct rpc_task_setup task_setup_data = {
609 .rpc_client = clnt, 627 .rpc_client = clnt,
@@ -616,7 +634,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
616 }; 634 };
617 int ret = 0; 635 int ret = 0;
618 636
619 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); 637 hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
620 638
621 dprintk("NFS: %5u initiated pgio call " 639 dprintk("NFS: %5u initiated pgio call "
622 "(req %s/%llu, %u bytes @ offset %llu)\n", 640 "(req %s/%llu, %u bytes @ offset %llu)\n",
@@ -650,10 +668,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
650static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, 668static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
651 struct nfs_pgio_header *hdr) 669 struct nfs_pgio_header *hdr)
652{ 670{
671 struct nfs_pgio_mirror *mirror;
672 u32 midx;
673
653 set_bit(NFS_IOHDR_REDO, &hdr->flags); 674 set_bit(NFS_IOHDR_REDO, &hdr->flags);
654 nfs_pgio_data_destroy(hdr); 675 nfs_pgio_data_destroy(hdr);
655 hdr->completion_ops->completion(hdr); 676 hdr->completion_ops->completion(hdr);
656 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 677 /* TODO: Make sure it's right to clean up all mirrors here
678 * and not just hdr->pgio_mirror_idx */
679 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
680 mirror = &desc->pg_mirrors[midx];
681 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
682 }
657 return -ENOMEM; 683 return -ENOMEM;
658} 684}
659 685
@@ -670,6 +696,17 @@ static void nfs_pgio_release(void *calldata)
670 hdr->completion_ops->completion(hdr); 696 hdr->completion_ops->completion(hdr);
671} 697}
672 698
699static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
700 unsigned int bsize)
701{
702 INIT_LIST_HEAD(&mirror->pg_list);
703 mirror->pg_bytes_written = 0;
704 mirror->pg_count = 0;
705 mirror->pg_bsize = bsize;
706 mirror->pg_base = 0;
707 mirror->pg_recoalesce = 0;
708}
709
673/** 710/**
674 * nfs_pageio_init - initialise a page io descriptor 711 * nfs_pageio_init - initialise a page io descriptor
675 * @desc: pointer to descriptor 712 * @desc: pointer to descriptor
@@ -686,13 +723,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
686 size_t bsize, 723 size_t bsize,
687 int io_flags) 724 int io_flags)
688{ 725{
689 INIT_LIST_HEAD(&desc->pg_list); 726 struct nfs_pgio_mirror *new;
690 desc->pg_bytes_written = 0; 727 int i;
691 desc->pg_count = 0; 728
692 desc->pg_bsize = bsize;
693 desc->pg_base = 0;
694 desc->pg_moreio = 0; 729 desc->pg_moreio = 0;
695 desc->pg_recoalesce = 0;
696 desc->pg_inode = inode; 730 desc->pg_inode = inode;
697 desc->pg_ops = pg_ops; 731 desc->pg_ops = pg_ops;
698 desc->pg_completion_ops = compl_ops; 732 desc->pg_completion_ops = compl_ops;
@@ -702,6 +736,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
702 desc->pg_lseg = NULL; 736 desc->pg_lseg = NULL;
703 desc->pg_dreq = NULL; 737 desc->pg_dreq = NULL;
704 desc->pg_layout_private = NULL; 738 desc->pg_layout_private = NULL;
739 desc->pg_bsize = bsize;
740
741 desc->pg_mirror_count = 1;
742 desc->pg_mirror_idx = 0;
743
744 if (pg_ops->pg_get_mirror_count) {
745 /* until we have a request, we don't have an lseg and no
746 * idea how many mirrors there will be */
747 new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
748 sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
749 desc->pg_mirrors_dynamic = new;
750 desc->pg_mirrors = new;
751
752 for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
753 nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
754 } else {
755 desc->pg_mirrors_dynamic = NULL;
756 desc->pg_mirrors = desc->pg_mirrors_static;
757 nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
758 }
705} 759}
706EXPORT_SYMBOL_GPL(nfs_pageio_init); 760EXPORT_SYMBOL_GPL(nfs_pageio_init);
707 761
@@ -737,14 +791,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
737int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, 791int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
738 struct nfs_pgio_header *hdr) 792 struct nfs_pgio_header *hdr)
739{ 793{
794 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
795
740 struct nfs_page *req; 796 struct nfs_page *req;
741 struct page **pages, 797 struct page **pages,
742 *last_page; 798 *last_page;
743 struct list_head *head = &desc->pg_list; 799 struct list_head *head = &mirror->pg_list;
744 struct nfs_commit_info cinfo; 800 struct nfs_commit_info cinfo;
745 unsigned int pagecount, pageused; 801 unsigned int pagecount, pageused;
746 802
747 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); 803 pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
748 if (!nfs_pgarray_set(&hdr->page_array, pagecount)) 804 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
749 return nfs_pgio_error(desc, hdr); 805 return nfs_pgio_error(desc, hdr);
750 806
@@ -772,7 +828,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
772 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 828 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
773 829
774 /* Set up the argument struct */ 830 /* Set up the argument struct */
775 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 831 nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
776 desc->pg_rpc_callops = &nfs_pgio_common_ops; 832 desc->pg_rpc_callops = &nfs_pgio_common_ops;
777 return 0; 833 return 0;
778} 834}
@@ -780,23 +836,74 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
780 836
781static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 837static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
782{ 838{
839 struct nfs_pgio_mirror *mirror;
783 struct nfs_pgio_header *hdr; 840 struct nfs_pgio_header *hdr;
784 int ret; 841 int ret;
785 842
843 mirror = nfs_pgio_current_mirror(desc);
844
786 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 845 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
787 if (!hdr) { 846 if (!hdr) {
788 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 847 /* TODO: make sure this is right with mirroring - or
848 * should it back out all mirrors? */
849 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
789 return -ENOMEM; 850 return -ENOMEM;
790 } 851 }
791 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); 852 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
792 ret = nfs_generic_pgio(desc, hdr); 853 ret = nfs_generic_pgio(desc, hdr);
793 if (ret == 0) 854 if (ret == 0)
794 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 855 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
795 hdr, desc->pg_rpc_callops, 856 hdr,
857 hdr->cred,
858 NFS_PROTO(hdr->inode),
859 desc->pg_rpc_callops,
796 desc->pg_ioflags, 0); 860 desc->pg_ioflags, 0);
797 return ret; 861 return ret;
798} 862}
799 863
864/*
865 * nfs_pageio_setup_mirroring - determine if mirroring is to be used
866 * by calling the pg_get_mirror_count op
867 */
868static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
869 struct nfs_page *req)
870{
871 int mirror_count = 1;
872
873 if (!pgio->pg_ops->pg_get_mirror_count)
874 return 0;
875
876 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
877
878 if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
879 return -EINVAL;
880
881 if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
882 return -EINVAL;
883
884 pgio->pg_mirror_count = mirror_count;
885
886 return 0;
887}
888
889/*
890 * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
891 */
892void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
893{
894 pgio->pg_mirror_count = 1;
895 pgio->pg_mirror_idx = 0;
896}
897
898static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
899{
900 pgio->pg_mirror_count = 1;
901 pgio->pg_mirror_idx = 0;
902 pgio->pg_mirrors = pgio->pg_mirrors_static;
903 kfree(pgio->pg_mirrors_dynamic);
904 pgio->pg_mirrors_dynamic = NULL;
905}
906
800static bool nfs_match_open_context(const struct nfs_open_context *ctx1, 907static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
801 const struct nfs_open_context *ctx2) 908 const struct nfs_open_context *ctx2)
802{ 909{
@@ -867,19 +974,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
867static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 974static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
868 struct nfs_page *req) 975 struct nfs_page *req)
869{ 976{
977 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
978
870 struct nfs_page *prev = NULL; 979 struct nfs_page *prev = NULL;
871 if (desc->pg_count != 0) { 980
872 prev = nfs_list_entry(desc->pg_list.prev); 981 if (mirror->pg_count != 0) {
982 prev = nfs_list_entry(mirror->pg_list.prev);
873 } else { 983 } else {
874 if (desc->pg_ops->pg_init) 984 if (desc->pg_ops->pg_init)
875 desc->pg_ops->pg_init(desc, req); 985 desc->pg_ops->pg_init(desc, req);
876 desc->pg_base = req->wb_pgbase; 986 mirror->pg_base = req->wb_pgbase;
877 } 987 }
878 if (!nfs_can_coalesce_requests(prev, req, desc)) 988 if (!nfs_can_coalesce_requests(prev, req, desc))
879 return 0; 989 return 0;
880 nfs_list_remove_request(req); 990 nfs_list_remove_request(req);
881 nfs_list_add_request(req, &desc->pg_list); 991 nfs_list_add_request(req, &mirror->pg_list);
882 desc->pg_count += req->wb_bytes; 992 mirror->pg_count += req->wb_bytes;
883 return 1; 993 return 1;
884} 994}
885 995
@@ -888,16 +998,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
888 */ 998 */
889static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 999static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
890{ 1000{
891 if (!list_empty(&desc->pg_list)) { 1001 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1002
1003
1004 if (!list_empty(&mirror->pg_list)) {
892 int error = desc->pg_ops->pg_doio(desc); 1005 int error = desc->pg_ops->pg_doio(desc);
893 if (error < 0) 1006 if (error < 0)
894 desc->pg_error = error; 1007 desc->pg_error = error;
895 else 1008 else
896 desc->pg_bytes_written += desc->pg_count; 1009 mirror->pg_bytes_written += mirror->pg_count;
897 } 1010 }
898 if (list_empty(&desc->pg_list)) { 1011 if (list_empty(&mirror->pg_list)) {
899 desc->pg_count = 0; 1012 mirror->pg_count = 0;
900 desc->pg_base = 0; 1013 mirror->pg_base = 0;
901 } 1014 }
902} 1015}
903 1016
@@ -915,6 +1028,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
915static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1028static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
916 struct nfs_page *req) 1029 struct nfs_page *req)
917{ 1030{
1031 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1032
918 struct nfs_page *subreq; 1033 struct nfs_page *subreq;
919 unsigned int bytes_left = 0; 1034 unsigned int bytes_left = 0;
920 unsigned int offset, pgbase; 1035 unsigned int offset, pgbase;
@@ -938,7 +1053,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
938 nfs_pageio_doio(desc); 1053 nfs_pageio_doio(desc);
939 if (desc->pg_error < 0) 1054 if (desc->pg_error < 0)
940 return 0; 1055 return 0;
941 if (desc->pg_recoalesce) 1056 if (mirror->pg_recoalesce)
942 return 0; 1057 return 0;
943 /* retry add_request for this subreq */ 1058 /* retry add_request for this subreq */
944 nfs_page_group_lock(req, false); 1059 nfs_page_group_lock(req, false);
@@ -976,14 +1091,16 @@ err_ptr:
976 1091
977static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) 1092static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
978{ 1093{
1094 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
979 LIST_HEAD(head); 1095 LIST_HEAD(head);
980 1096
981 do { 1097 do {
982 list_splice_init(&desc->pg_list, &head); 1098 list_splice_init(&mirror->pg_list, &head);
983 desc->pg_bytes_written -= desc->pg_count; 1099 mirror->pg_bytes_written -= mirror->pg_count;
984 desc->pg_count = 0; 1100 mirror->pg_count = 0;
985 desc->pg_base = 0; 1101 mirror->pg_base = 0;
986 desc->pg_recoalesce = 0; 1102 mirror->pg_recoalesce = 0;
1103
987 desc->pg_moreio = 0; 1104 desc->pg_moreio = 0;
988 1105
989 while (!list_empty(&head)) { 1106 while (!list_empty(&head)) {
@@ -997,11 +1114,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
997 return 0; 1114 return 0;
998 break; 1115 break;
999 } 1116 }
1000 } while (desc->pg_recoalesce); 1117 } while (mirror->pg_recoalesce);
1001 return 1; 1118 return 1;
1002} 1119}
1003 1120
1004int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1121static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
1005 struct nfs_page *req) 1122 struct nfs_page *req)
1006{ 1123{
1007 int ret; 1124 int ret;
@@ -1014,9 +1131,80 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1014 break; 1131 break;
1015 ret = nfs_do_recoalesce(desc); 1132 ret = nfs_do_recoalesce(desc);
1016 } while (ret); 1133 } while (ret);
1134
1017 return ret; 1135 return ret;
1018} 1136}
1019 1137
1138int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1139 struct nfs_page *req)
1140{
1141 u32 midx;
1142 unsigned int pgbase, offset, bytes;
1143 struct nfs_page *dupreq, *lastreq;
1144
1145 pgbase = req->wb_pgbase;
1146 offset = req->wb_offset;
1147 bytes = req->wb_bytes;
1148
1149 nfs_pageio_setup_mirroring(desc, req);
1150
1151 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1152 if (midx) {
1153 nfs_page_group_lock(req, false);
1154
1155 /* find the last request */
1156 for (lastreq = req->wb_head;
1157 lastreq->wb_this_page != req->wb_head;
1158 lastreq = lastreq->wb_this_page)
1159 ;
1160
1161 dupreq = nfs_create_request(req->wb_context,
1162 req->wb_page, lastreq, pgbase, bytes);
1163
1164 if (IS_ERR(dupreq)) {
1165 nfs_page_group_unlock(req);
1166 return 0;
1167 }
1168
1169 nfs_lock_request(dupreq);
1170 nfs_page_group_unlock(req);
1171 dupreq->wb_offset = offset;
1172 dupreq->wb_index = req->wb_index;
1173 } else
1174 dupreq = req;
1175
1176 if (nfs_pgio_has_mirroring(desc))
1177 desc->pg_mirror_idx = midx;
1178 if (!nfs_pageio_add_request_mirror(desc, dupreq))
1179 return 0;
1180 }
1181
1182 return 1;
1183}
1184
1185/*
1186 * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
1187 * nfs_pageio_descriptor
1188 * @desc: pointer to io descriptor
1189 */
1190static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
1191 u32 mirror_idx)
1192{
1193 struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
1194 u32 restore_idx = desc->pg_mirror_idx;
1195
1196 if (nfs_pgio_has_mirroring(desc))
1197 desc->pg_mirror_idx = mirror_idx;
1198 for (;;) {
1199 nfs_pageio_doio(desc);
1200 if (!mirror->pg_recoalesce)
1201 break;
1202 if (!nfs_do_recoalesce(desc))
1203 break;
1204 }
1205 desc->pg_mirror_idx = restore_idx;
1206}
1207
1020/* 1208/*
1021 * nfs_pageio_resend - Transfer requests to new descriptor and resend 1209 * nfs_pageio_resend - Transfer requests to new descriptor and resend
1022 * @hdr - the pgio header to move request from 1210 * @hdr - the pgio header to move request from
@@ -1050,18 +1238,19 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
1050EXPORT_SYMBOL_GPL(nfs_pageio_resend); 1238EXPORT_SYMBOL_GPL(nfs_pageio_resend);
1051 1239
1052/** 1240/**
1053 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 1241 * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
1054 * @desc: pointer to io descriptor 1242 * @desc: pointer to io descriptor
1055 */ 1243 */
1056void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 1244void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1057{ 1245{
1058 for (;;) { 1246 u32 midx;
1059 nfs_pageio_doio(desc); 1247
1060 if (!desc->pg_recoalesce) 1248 for (midx = 0; midx < desc->pg_mirror_count; midx++)
1061 break; 1249 nfs_pageio_complete_mirror(desc, midx);
1062 if (!nfs_do_recoalesce(desc)) 1250
1063 break; 1251 if (desc->pg_ops->pg_cleanup)
1064 } 1252 desc->pg_ops->pg_cleanup(desc);
1253 nfs_pageio_cleanup_mirroring(desc);
1065} 1254}
1066 1255
1067/** 1256/**
@@ -1077,10 +1266,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1077 */ 1266 */
1078void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) 1267void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
1079{ 1268{
1080 if (!list_empty(&desc->pg_list)) { 1269 struct nfs_pgio_mirror *mirror;
1081 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); 1270 struct nfs_page *prev;
1082 if (index != prev->wb_index + 1) 1271 u32 midx;
1083 nfs_pageio_complete(desc); 1272
1273 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1274 mirror = &desc->pg_mirrors[midx];
1275 if (!list_empty(&mirror->pg_list)) {
1276 prev = nfs_list_entry(mirror->pg_list.prev);
1277 if (index != prev->wb_index + 1)
1278 nfs_pageio_complete_mirror(desc, midx);
1279 }
1084 } 1280 }
1085} 1281}
1086 1282
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0a5dda4d85c2..4f802b02fbb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -34,6 +34,7 @@
34#include "pnfs.h" 34#include "pnfs.h"
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h" 36#include "nfs4trace.h"
37#include "delegation.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_PNFS 39#define NFSDBG_FACILITY NFSDBG_PNFS
39#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 40#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
50 */ 51 */
51static LIST_HEAD(pnfs_modules_tbl); 52static LIST_HEAD(pnfs_modules_tbl);
52 53
54static int
55pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
56 enum pnfs_iomode iomode, bool sync);
57
53/* Return the registered pnfs layout driver module matching given id */ 58/* Return the registered pnfs layout driver module matching given id */
54static struct pnfs_layoutdriver_type * 59static struct pnfs_layoutdriver_type *
55find_pnfs_driver_locked(u32 id) 60find_pnfs_driver_locked(u32 id)
@@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
238 struct inode *inode = lo->plh_inode; 243 struct inode *inode = lo->plh_inode;
239 244
240 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 245 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
246 if (!list_empty(&lo->plh_segs))
247 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
241 pnfs_detach_layout_hdr(lo); 248 pnfs_detach_layout_hdr(lo);
242 spin_unlock(&inode->i_lock); 249 spin_unlock(&inode->i_lock);
243 pnfs_free_layout_hdr(lo); 250 pnfs_free_layout_hdr(lo);
@@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
337 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 344 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
338} 345}
339 346
347/* Return true if layoutreturn is needed */
348static bool
349pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
350 struct pnfs_layout_segment *lseg)
351{
352 struct pnfs_layout_segment *s;
353
354 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
355 return false;
356
357 list_for_each_entry(s, &lo->plh_segs, pls_list)
358 if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
359 return false;
360
361 return true;
362}
363
364static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
365 struct pnfs_layout_hdr *lo, struct inode *inode)
366{
367 lo = lseg->pls_layout;
368 inode = lo->plh_inode;
369
370 spin_lock(&inode->i_lock);
371 if (pnfs_layout_need_return(lo, lseg)) {
372 nfs4_stateid stateid;
373 enum pnfs_iomode iomode;
374
375 stateid = lo->plh_stateid;
376 iomode = lo->plh_return_iomode;
377 /* decreased in pnfs_send_layoutreturn() */
378 lo->plh_block_lgets++;
379 lo->plh_return_iomode = 0;
380 spin_unlock(&inode->i_lock);
381 pnfs_get_layout_hdr(lo);
382
383 /* Send an async layoutreturn so we dont deadlock */
384 pnfs_send_layoutreturn(lo, stateid, iomode, false);
385 } else
386 spin_unlock(&inode->i_lock);
387}
388
340void 389void
341pnfs_put_lseg(struct pnfs_layout_segment *lseg) 390pnfs_put_lseg(struct pnfs_layout_segment *lseg)
342{ 391{
@@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
349 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 398 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
350 atomic_read(&lseg->pls_refcount), 399 atomic_read(&lseg->pls_refcount),
351 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 400 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
401
402 /* Handle the case where refcount != 1 */
403 if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
404 return;
405
352 lo = lseg->pls_layout; 406 lo = lseg->pls_layout;
353 inode = lo->plh_inode; 407 inode = lo->plh_inode;
408 /* Do we need a layoutreturn? */
409 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
410 pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
411
354 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 412 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
355 pnfs_get_layout_hdr(lo); 413 pnfs_get_layout_hdr(lo);
356 pnfs_layout_remove_lseg(lo, lseg); 414 pnfs_layout_remove_lseg(lo, lseg);
@@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
543 pnfs_get_layout_hdr(lo); 601 pnfs_get_layout_hdr(lo);
544 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 602 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
545 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 603 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
604 pnfs_clear_retry_layoutget(lo);
546 spin_unlock(&nfsi->vfs_inode.i_lock); 605 spin_unlock(&nfsi->vfs_inode.i_lock);
547 pnfs_free_lseg_list(&tmp_list); 606 pnfs_free_lseg_list(&tmp_list);
548 pnfs_put_layout_hdr(lo); 607 pnfs_put_layout_hdr(lo);
@@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
740 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 799 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
741} 800}
742 801
802static bool
803pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
804 struct pnfs_layout_range *range)
805{
806 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
807 (lo->plh_return_iomode == IOMODE_ANY ||
808 lo->plh_return_iomode == range->iomode);
809}
810
743/* lget is set to 1 if called from inside send_layoutget call chain */ 811/* lget is set to 1 if called from inside send_layoutget call chain */
744static bool 812static bool
745pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) 813pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
814 struct pnfs_layout_range *range, int lget)
746{ 815{
747 return lo->plh_block_lgets || 816 return lo->plh_block_lgets ||
748 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 817 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
749 (list_empty(&lo->plh_segs) && 818 (list_empty(&lo->plh_segs) &&
750 (atomic_read(&lo->plh_outstanding) > lget)); 819 (atomic_read(&lo->plh_outstanding) > lget)) ||
820 pnfs_layout_returning(lo, range);
751} 821}
752 822
753int 823int
754pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 824pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
825 struct pnfs_layout_range *range,
755 struct nfs4_state *open_state) 826 struct nfs4_state *open_state)
756{ 827{
757 int status = 0; 828 int status = 0;
758 829
759 dprintk("--> %s\n", __func__); 830 dprintk("--> %s\n", __func__);
760 spin_lock(&lo->plh_inode->i_lock); 831 spin_lock(&lo->plh_inode->i_lock);
761 if (pnfs_layoutgets_blocked(lo, 1)) { 832 if (pnfs_layoutgets_blocked(lo, range, 1)) {
762 status = -EAGAIN; 833 status = -EAGAIN;
763 } else if (!nfs4_valid_open_stateid(open_state)) { 834 } else if (!nfs4_valid_open_stateid(open_state)) {
764 status = -EBADF; 835 status = -EBADF;
@@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
825 pnfs_layout_io_set_failed(lo, range->iomode); 896 pnfs_layout_io_set_failed(lo, range->iomode);
826 } 897 }
827 return NULL; 898 return NULL;
828 } 899 } else
900 pnfs_layout_clear_fail_bit(lo,
901 pnfs_iomode_to_fail_bit(range->iomode));
829 902
830 return lseg; 903 return lseg;
831} 904}
@@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
845 } 918 }
846} 919}
847 920
921void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
922{
923 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
924 smp_mb__after_atomic();
925 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
926}
927
928static int
929pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
930 enum pnfs_iomode iomode, bool sync)
931{
932 struct inode *ino = lo->plh_inode;
933 struct nfs4_layoutreturn *lrp;
934 int status = 0;
935
936 lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
937 if (unlikely(lrp == NULL)) {
938 status = -ENOMEM;
939 spin_lock(&ino->i_lock);
940 lo->plh_block_lgets--;
941 pnfs_clear_layoutreturn_waitbit(lo);
942 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
943 spin_unlock(&ino->i_lock);
944 pnfs_put_layout_hdr(lo);
945 goto out;
946 }
947
948 lrp->args.stateid = stateid;
949 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
950 lrp->args.inode = ino;
951 lrp->args.range.iomode = iomode;
952 lrp->args.range.offset = 0;
953 lrp->args.range.length = NFS4_MAX_UINT64;
954 lrp->args.layout = lo;
955 lrp->clp = NFS_SERVER(ino)->nfs_client;
956 lrp->cred = lo->plh_lc_cred;
957
958 status = nfs4_proc_layoutreturn(lrp, sync);
959out:
960 dprintk("<-- %s status: %d\n", __func__, status);
961 return status;
962}
963
848/* 964/*
849 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 965 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
850 * when the layout segment list is empty. 966 * when the layout segment list is empty.
@@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino)
859 struct pnfs_layout_hdr *lo = NULL; 975 struct pnfs_layout_hdr *lo = NULL;
860 struct nfs_inode *nfsi = NFS_I(ino); 976 struct nfs_inode *nfsi = NFS_I(ino);
861 LIST_HEAD(tmp_list); 977 LIST_HEAD(tmp_list);
862 struct nfs4_layoutreturn *lrp;
863 nfs4_stateid stateid; 978 nfs4_stateid stateid;
864 int status = 0, empty; 979 int status = 0, empty;
865 980
@@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino)
901 spin_unlock(&ino->i_lock); 1016 spin_unlock(&ino->i_lock);
902 pnfs_free_lseg_list(&tmp_list); 1017 pnfs_free_lseg_list(&tmp_list);
903 1018
904 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); 1019 status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
905 if (unlikely(lrp == NULL)) {
906 status = -ENOMEM;
907 spin_lock(&ino->i_lock);
908 lo->plh_block_lgets--;
909 spin_unlock(&ino->i_lock);
910 pnfs_put_layout_hdr(lo);
911 goto out;
912 }
913
914 lrp->args.stateid = stateid;
915 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
916 lrp->args.inode = ino;
917 lrp->args.layout = lo;
918 lrp->clp = NFS_SERVER(ino)->nfs_client;
919 lrp->cred = lo->plh_lc_cred;
920
921 status = nfs4_proc_layoutreturn(lrp);
922out: 1020out:
923 dprintk("<-- %s status: %d\n", __func__, status); 1021 dprintk("<-- %s status: %d\n", __func__, status);
924 return status; 1022 return status;
@@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode)
954 1052
955bool pnfs_roc(struct inode *ino) 1053bool pnfs_roc(struct inode *ino)
956{ 1054{
1055 struct nfs_inode *nfsi = NFS_I(ino);
1056 struct nfs_open_context *ctx;
1057 struct nfs4_state *state;
957 struct pnfs_layout_hdr *lo; 1058 struct pnfs_layout_hdr *lo;
958 struct pnfs_layout_segment *lseg, *tmp; 1059 struct pnfs_layout_segment *lseg, *tmp;
1060 nfs4_stateid stateid;
959 LIST_HEAD(tmp_list); 1061 LIST_HEAD(tmp_list);
960 bool found = false; 1062 bool found = false, layoutreturn = false;
961 1063
962 spin_lock(&ino->i_lock); 1064 spin_lock(&ino->i_lock);
963 lo = NFS_I(ino)->layout; 1065 lo = nfsi->layout;
964 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1066 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
965 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1067 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
966 goto out_nolayout; 1068 goto out_noroc;
1069
1070 /* Don't return layout if we hold a delegation */
1071 if (nfs4_check_delegation(ino, FMODE_READ))
1072 goto out_noroc;
1073
1074 list_for_each_entry(ctx, &nfsi->open_files, list) {
1075 state = ctx->state;
1076 /* Don't return layout if there is open file state */
1077 if (state != NULL && state->state != 0)
1078 goto out_noroc;
1079 }
1080
1081 pnfs_clear_retry_layoutget(lo);
967 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1082 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
968 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1083 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
969 mark_lseg_invalid(lseg, &tmp_list); 1084 mark_lseg_invalid(lseg, &tmp_list);
970 found = true; 1085 found = true;
971 } 1086 }
972 if (!found) 1087 if (!found)
973 goto out_nolayout; 1088 goto out_noroc;
974 lo->plh_block_lgets++; 1089 lo->plh_block_lgets++;
975 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1090 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
976 spin_unlock(&ino->i_lock); 1091 spin_unlock(&ino->i_lock);
977 pnfs_free_lseg_list(&tmp_list); 1092 pnfs_free_lseg_list(&tmp_list);
978 return true; 1093 return true;
979 1094
980out_nolayout: 1095out_noroc:
1096 if (lo) {
1097 stateid = lo->plh_stateid;
1098 layoutreturn =
1099 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1100 &lo->plh_flags);
1101 if (layoutreturn) {
1102 lo->plh_block_lgets++;
1103 pnfs_get_layout_hdr(lo);
1104 }
1105 }
981 spin_unlock(&ino->i_lock); 1106 spin_unlock(&ino->i_lock);
1107 if (layoutreturn)
1108 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
982 return false; 1109 return false;
983} 1110}
984 1111
@@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1013 struct nfs_inode *nfsi = NFS_I(ino); 1140 struct nfs_inode *nfsi = NFS_I(ino);
1014 struct pnfs_layout_hdr *lo; 1141 struct pnfs_layout_hdr *lo;
1015 struct pnfs_layout_segment *lseg; 1142 struct pnfs_layout_segment *lseg;
1143 nfs4_stateid stateid;
1016 u32 current_seqid; 1144 u32 current_seqid;
1017 bool found = false; 1145 bool found = false, layoutreturn = false;
1018 1146
1019 spin_lock(&ino->i_lock); 1147 spin_lock(&ino->i_lock);
1020 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 1148 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
@@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1031 */ 1159 */
1032 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1160 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1033out: 1161out:
1162 if (!found) {
1163 stateid = lo->plh_stateid;
1164 layoutreturn =
1165 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1166 &lo->plh_flags);
1167 if (layoutreturn) {
1168 lo->plh_block_lgets++;
1169 pnfs_get_layout_hdr(lo);
1170 }
1171 }
1034 spin_unlock(&ino->i_lock); 1172 spin_unlock(&ino->i_lock);
1173 if (layoutreturn) {
1174 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1175 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
1176 }
1035 return found; 1177 return found;
1036} 1178}
1037 1179
@@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1178 1320
1179 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1321 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1180 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1322 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1323 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1181 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1324 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1182 ret = pnfs_get_lseg(lseg); 1325 ret = pnfs_get_lseg(lseg);
1183 break; 1326 break;
@@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1266 return ret; 1409 return ret;
1267} 1410}
1268 1411
1412/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
1413static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1414{
1415 if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
1416 return 1;
1417 return nfs_wait_bit_killable(key);
1418}
1419
1420static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1421{
1422 /*
1423 * send layoutcommit as it can hold up layoutreturn due to lseg
1424 * reference
1425 */
1426 pnfs_layoutcommit_inode(lo->plh_inode, false);
1427 return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1428 pnfs_layoutget_retry_bit_wait,
1429 TASK_UNINTERRUPTIBLE);
1430}
1431
1432static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1433{
1434 unsigned long *bitlock = &lo->plh_flags;
1435
1436 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1437 smp_mb__after_atomic();
1438 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1439}
1440
1269/* 1441/*
1270 * Layout segment is retreived from the server if not cached. 1442 * Layout segment is retreived from the server if not cached.
1271 * The appropriate layout segment is referenced and returned to the caller. 1443 * The appropriate layout segment is referenced and returned to the caller.
@@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino,
1296 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1468 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1297 goto out; 1469 goto out;
1298 1470
1471lookup_again:
1472 first = false;
1299 spin_lock(&ino->i_lock); 1473 spin_lock(&ino->i_lock);
1300 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1474 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1301 if (lo == NULL) { 1475 if (lo == NULL) {
@@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino,
1310 } 1484 }
1311 1485
1312 /* if LAYOUTGET already failed once we don't try again */ 1486 /* if LAYOUTGET already failed once we don't try again */
1313 if (pnfs_layout_io_test_failed(lo, iomode)) 1487 if (pnfs_layout_io_test_failed(lo, iomode) &&
1488 !pnfs_should_retry_layoutget(lo))
1314 goto out_unlock; 1489 goto out_unlock;
1315 1490
1316 /* Check to see if the layout for the given range already exists */ 1491 first = list_empty(&lo->plh_segs);
1317 lseg = pnfs_find_lseg(lo, &arg); 1492 if (first) {
1318 if (lseg) 1493 /* The first layoutget for the file. Need to serialize per
1319 goto out_unlock; 1494 * RFC 5661 Errata 3208.
1495 */
1496 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1497 &lo->plh_flags)) {
1498 spin_unlock(&ino->i_lock);
1499 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1500 TASK_UNINTERRUPTIBLE);
1501 pnfs_put_layout_hdr(lo);
1502 goto lookup_again;
1503 }
1504 } else {
1505 /* Check to see if the layout for the given range
1506 * already exists
1507 */
1508 lseg = pnfs_find_lseg(lo, &arg);
1509 if (lseg)
1510 goto out_unlock;
1511 }
1512
1513 /*
1514 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1515 * for LAYOUTRETURN even if first is true.
1516 */
1517 if (!lseg && pnfs_should_retry_layoutget(lo) &&
1518 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1519 spin_unlock(&ino->i_lock);
1520 dprintk("%s wait for layoutreturn\n", __func__);
1521 if (pnfs_prepare_to_retry_layoutget(lo)) {
1522 if (first)
1523 pnfs_clear_first_layoutget(lo);
1524 pnfs_put_layout_hdr(lo);
1525 dprintk("%s retrying\n", __func__);
1526 goto lookup_again;
1527 }
1528 goto out_put_layout_hdr;
1529 }
1320 1530
1321 if (pnfs_layoutgets_blocked(lo, 0)) 1531 if (pnfs_layoutgets_blocked(lo, &arg, 0))
1322 goto out_unlock; 1532 goto out_unlock;
1323 atomic_inc(&lo->plh_outstanding); 1533 atomic_inc(&lo->plh_outstanding);
1324
1325 first = list_empty(&lo->plh_layouts) ? true : false;
1326 spin_unlock(&ino->i_lock); 1534 spin_unlock(&ino->i_lock);
1327 1535
1328 if (first) { 1536 if (list_empty(&lo->plh_layouts)) {
1329 /* The lo must be on the clp list if there is any 1537 /* The lo must be on the clp list if there is any
1330 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1538 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1331 */ 1539 */
1332 spin_lock(&clp->cl_lock); 1540 spin_lock(&clp->cl_lock);
1333 list_add_tail(&lo->plh_layouts, &server->layouts); 1541 if (list_empty(&lo->plh_layouts))
1542 list_add_tail(&lo->plh_layouts, &server->layouts);
1334 spin_unlock(&clp->cl_lock); 1543 spin_unlock(&clp->cl_lock);
1335 } 1544 }
1336 1545
@@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino,
1343 arg.length = PAGE_CACHE_ALIGN(arg.length); 1552 arg.length = PAGE_CACHE_ALIGN(arg.length);
1344 1553
1345 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1554 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1555 pnfs_clear_retry_layoutget(lo);
1346 atomic_dec(&lo->plh_outstanding); 1556 atomic_dec(&lo->plh_outstanding);
1347out_put_layout_hdr: 1557out_put_layout_hdr:
1558 if (first)
1559 pnfs_clear_first_layoutget(lo);
1348 pnfs_put_layout_hdr(lo); 1560 pnfs_put_layout_hdr(lo);
1349out: 1561out:
1350 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1562 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1393 goto out_forget_reply; 1605 goto out_forget_reply;
1394 } 1606 }
1395 1607
1396 if (pnfs_layoutgets_blocked(lo, 1)) { 1608 if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
1397 dprintk("%s forget reply due to state\n", __func__); 1609 dprintk("%s forget reply due to state\n", __func__);
1398 goto out_forget_reply; 1610 goto out_forget_reply;
1399 } 1611 }
@@ -1440,24 +1652,79 @@ out_forget_reply:
1440 goto out; 1652 goto out;
1441} 1653}
1442 1654
1655static void
1656pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1657 struct list_head *tmp_list,
1658 struct pnfs_layout_range *return_range)
1659{
1660 struct pnfs_layout_segment *lseg, *next;
1661
1662 dprintk("%s:Begin lo %p\n", __func__, lo);
1663
1664 if (list_empty(&lo->plh_segs))
1665 return;
1666
1667 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1668 if (should_free_lseg(&lseg->pls_range, return_range)) {
1669 dprintk("%s: marking lseg %p iomode %d "
1670 "offset %llu length %llu\n", __func__,
1671 lseg, lseg->pls_range.iomode,
1672 lseg->pls_range.offset,
1673 lseg->pls_range.length);
1674 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1675 mark_lseg_invalid(lseg, tmp_list);
1676 }
1677}
1678
1679void pnfs_error_mark_layout_for_return(struct inode *inode,
1680 struct pnfs_layout_segment *lseg)
1681{
1682 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1683 int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
1684 struct pnfs_layout_range range = {
1685 .iomode = lseg->pls_range.iomode,
1686 .offset = 0,
1687 .length = NFS4_MAX_UINT64,
1688 };
1689 LIST_HEAD(free_me);
1690
1691 spin_lock(&inode->i_lock);
1692 /* set failure bit so that pnfs path will be retried later */
1693 pnfs_layout_set_fail_bit(lo, iomode);
1694 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
1695 if (lo->plh_return_iomode == 0)
1696 lo->plh_return_iomode = range.iomode;
1697 else if (lo->plh_return_iomode != range.iomode)
1698 lo->plh_return_iomode = IOMODE_ANY;
1699 /*
1700 * mark all matching lsegs so that we are sure to have no live
1701 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1702 * for how it works.
1703 */
1704 pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
1705 spin_unlock(&inode->i_lock);
1706 pnfs_free_lseg_list(&free_me);
1707}
1708EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1709
1443void 1710void
1444pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1711pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1445{ 1712{
1446 u64 rd_size = req->wb_bytes; 1713 u64 rd_size = req->wb_bytes;
1447 1714
1448 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1715 if (pgio->pg_lseg == NULL) {
1449 1716 if (pgio->pg_dreq == NULL)
1450 if (pgio->pg_dreq == NULL) 1717 rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1451 rd_size = i_size_read(pgio->pg_inode) - req_offset(req); 1718 else
1452 else 1719 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1453 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); 1720
1454 1721 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1455 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1722 req->wb_context,
1456 req->wb_context, 1723 req_offset(req),
1457 req_offset(req), 1724 rd_size,
1458 rd_size, 1725 IOMODE_READ,
1459 IOMODE_READ, 1726 GFP_KERNEL);
1460 GFP_KERNEL); 1727 }
1461 /* If no lseg, fall back to read through mds */ 1728 /* If no lseg, fall back to read through mds */
1462 if (pgio->pg_lseg == NULL) 1729 if (pgio->pg_lseg == NULL)
1463 nfs_pageio_reset_read_mds(pgio); 1730 nfs_pageio_reset_read_mds(pgio);
@@ -1469,27 +1736,36 @@ void
1469pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1736pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1470 struct nfs_page *req, u64 wb_size) 1737 struct nfs_page *req, u64 wb_size)
1471{ 1738{
1472 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1739 if (pgio->pg_lseg == NULL)
1473 1740 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1474 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1741 req->wb_context,
1475 req->wb_context, 1742 req_offset(req),
1476 req_offset(req), 1743 wb_size,
1477 wb_size, 1744 IOMODE_RW,
1478 IOMODE_RW, 1745 GFP_NOFS);
1479 GFP_NOFS);
1480 /* If no lseg, fall back to write through mds */ 1746 /* If no lseg, fall back to write through mds */
1481 if (pgio->pg_lseg == NULL) 1747 if (pgio->pg_lseg == NULL)
1482 nfs_pageio_reset_write_mds(pgio); 1748 nfs_pageio_reset_write_mds(pgio);
1483} 1749}
1484EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1750EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1485 1751
1752void
1753pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1754{
1755 if (desc->pg_lseg) {
1756 pnfs_put_lseg(desc->pg_lseg);
1757 desc->pg_lseg = NULL;
1758 }
1759}
1760EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1761
1486/* 1762/*
1487 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 1763 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1488 * of bytes (maximum @req->wb_bytes) that can be coalesced. 1764 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1489 */ 1765 */
1490size_t 1766size_t
1491pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1767pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
1492 struct nfs_page *req) 1768 struct nfs_page *prev, struct nfs_page *req)
1493{ 1769{
1494 unsigned int size; 1770 unsigned int size;
1495 u64 seg_end, req_start, seg_left; 1771 u64 seg_end, req_start, seg_left;
@@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1513 seg_end = end_offset(pgio->pg_lseg->pls_range.offset, 1789 seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1514 pgio->pg_lseg->pls_range.length); 1790 pgio->pg_lseg->pls_range.length);
1515 req_start = req_offset(req); 1791 req_start = req_offset(req);
1516 WARN_ON_ONCE(req_start > seg_end); 1792 WARN_ON_ONCE(req_start >= seg_end);
1517 /* start of request is past the last byte of this segment */ 1793 /* start of request is past the last byte of this segment */
1518 if (req_start >= seg_end) 1794 if (req_start >= seg_end) {
1795 /* reference the new lseg */
1796 if (pgio->pg_ops->pg_cleanup)
1797 pgio->pg_ops->pg_cleanup(pgio);
1798 if (pgio->pg_ops->pg_init)
1799 pgio->pg_ops->pg_init(pgio, req);
1519 return 0; 1800 return 0;
1801 }
1520 1802
1521 /* adjust 'size' iff there are fewer bytes left in the 1803 /* adjust 'size' iff there are fewer bytes left in the
1522 * segment than what nfs_generic_pg_test returned */ 1804 * segment than what nfs_generic_pg_test returned */
@@ -1571,10 +1853,12 @@ static void
1571pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1853pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1572 struct nfs_pgio_header *hdr) 1854 struct nfs_pgio_header *hdr)
1573{ 1855{
1856 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1857
1574 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1858 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1575 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1859 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1576 nfs_pageio_reset_write_mds(desc); 1860 nfs_pageio_reset_write_mds(desc);
1577 desc->pg_recoalesce = 1; 1861 mirror->pg_recoalesce = 1;
1578 } 1862 }
1579 nfs_pgio_data_destroy(hdr); 1863 nfs_pgio_data_destroy(hdr);
1580} 1864}
@@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
1608 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1892 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1609 enum pnfs_try_status trypnfs; 1893 enum pnfs_try_status trypnfs;
1610 1894
1611 desc->pg_lseg = NULL;
1612 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 1895 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1613 if (trypnfs == PNFS_NOT_ATTEMPTED) 1896 if (trypnfs == PNFS_NOT_ATTEMPTED)
1614 pnfs_write_through_mds(desc, hdr); 1897 pnfs_write_through_mds(desc, hdr);
1615 pnfs_put_lseg(lseg);
1616} 1898}
1617 1899
1618static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1900static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1625int 1907int
1626pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1908pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1627{ 1909{
1910 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1911
1628 struct nfs_pgio_header *hdr; 1912 struct nfs_pgio_header *hdr;
1629 int ret; 1913 int ret;
1630 1914
1631 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 1915 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1632 if (!hdr) { 1916 if (!hdr) {
1633 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1917 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1634 pnfs_put_lseg(desc->pg_lseg);
1635 desc->pg_lseg = NULL;
1636 return -ENOMEM; 1918 return -ENOMEM;
1637 } 1919 }
1638 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1920 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1921
1639 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1922 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1640 ret = nfs_generic_pgio(desc, hdr); 1923 ret = nfs_generic_pgio(desc, hdr);
1641 if (ret != 0) { 1924 if (!ret)
1642 pnfs_put_lseg(desc->pg_lseg);
1643 desc->pg_lseg = NULL;
1644 } else
1645 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1925 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1926
1646 return ret; 1927 return ret;
1647} 1928}
1648EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1929EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
@@ -1687,10 +1968,12 @@ static void
1687pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1968pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1688 struct nfs_pgio_header *hdr) 1969 struct nfs_pgio_header *hdr)
1689{ 1970{
1971 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1972
1690 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1973 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1691 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1974 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1692 nfs_pageio_reset_read_mds(desc); 1975 nfs_pageio_reset_read_mds(desc);
1693 desc->pg_recoalesce = 1; 1976 mirror->pg_recoalesce = 1;
1694 } 1977 }
1695 nfs_pgio_data_destroy(hdr); 1978 nfs_pgio_data_destroy(hdr);
1696} 1979}
@@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1719 return trypnfs; 2002 return trypnfs;
1720} 2003}
1721 2004
2005/* Resend all requests through pnfs. */
2006int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2007{
2008 struct nfs_pageio_descriptor pgio;
2009
2010 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2011 return nfs_pageio_resend(&pgio, hdr);
2012}
2013EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2014
1722static void 2015static void
1723pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 2016pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1724{ 2017{
1725 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2018 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1726 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2019 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1727 enum pnfs_try_status trypnfs; 2020 enum pnfs_try_status trypnfs;
2021 int err = 0;
1728 2022
1729 desc->pg_lseg = NULL;
1730 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2023 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1731 if (trypnfs == PNFS_NOT_ATTEMPTED) 2024 if (trypnfs == PNFS_TRY_AGAIN)
2025 err = pnfs_read_resend_pnfs(hdr);
2026 if (trypnfs == PNFS_NOT_ATTEMPTED || err)
1732 pnfs_read_through_mds(desc, hdr); 2027 pnfs_read_through_mds(desc, hdr);
1733 pnfs_put_lseg(lseg);
1734} 2028}
1735 2029
1736static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 2030static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1743int 2037int
1744pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 2038pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1745{ 2039{
2040 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2041
1746 struct nfs_pgio_header *hdr; 2042 struct nfs_pgio_header *hdr;
1747 int ret; 2043 int ret;
1748 2044
1749 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2045 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1750 if (!hdr) { 2046 if (!hdr) {
1751 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 2047 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1752 ret = -ENOMEM; 2048 return -ENOMEM;
1753 pnfs_put_lseg(desc->pg_lseg);
1754 desc->pg_lseg = NULL;
1755 return ret;
1756 } 2049 }
1757 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 2050 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1758 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2051 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1759 ret = nfs_generic_pgio(desc, hdr); 2052 ret = nfs_generic_pgio(desc, hdr);
1760 if (ret != 0) { 2053 if (!ret)
1761 pnfs_put_lseg(desc->pg_lseg);
1762 desc->pg_lseg = NULL;
1763 } else
1764 pnfs_do_read(desc, hdr); 2054 pnfs_do_read(desc, hdr);
1765 return ret; 2055 return ret;
1766} 2056}
@@ -1966,6 +2256,7 @@ clear_layoutcommitting:
1966 pnfs_clear_layoutcommitting(inode); 2256 pnfs_clear_layoutcommitting(inode);
1967 goto out; 2257 goto out;
1968} 2258}
2259EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
1969 2260
1970struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 2261struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1971{ 2262{
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9ae5b765b073..797cd6253adf 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -38,6 +38,25 @@ enum {
38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
39 NFS_LSEG_ROC, /* roc bit received from server */ 39 NFS_LSEG_ROC, /* roc bit received from server */
40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ 40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
41 NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
42};
43
44/* Individual ip address */
45struct nfs4_pnfs_ds_addr {
46 struct sockaddr_storage da_addr;
47 size_t da_addrlen;
48 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
49 char *da_remotestr; /* human readable addr+port */
50};
51
52struct nfs4_pnfs_ds {
53 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
54 char *ds_remotestr; /* comma sep list of addrs */
55 struct list_head ds_addrs;
56 struct nfs_client *ds_clp;
57 atomic_t ds_count;
58 unsigned long ds_state;
59#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
41}; 60};
42 61
43struct pnfs_layout_segment { 62struct pnfs_layout_segment {
@@ -53,19 +72,34 @@ struct pnfs_layout_segment {
53enum pnfs_try_status { 72enum pnfs_try_status {
54 PNFS_ATTEMPTED = 0, 73 PNFS_ATTEMPTED = 0,
55 PNFS_NOT_ATTEMPTED = 1, 74 PNFS_NOT_ATTEMPTED = 1,
75 PNFS_TRY_AGAIN = 2,
56}; 76};
57 77
58#ifdef CONFIG_NFS_V4_1 78#ifdef CONFIG_NFS_V4_1
59 79
60#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 80#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
61 81
82/*
83 * Default data server connection timeout and retrans values.
84 * Set by module parameters dataserver_timeo and dataserver_retrans.
85 */
86#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
87#define NFS4_DEF_DS_RETRANS 5
88
89/* error codes for internal use */
90#define NFS4ERR_RESET_TO_MDS 12001
91#define NFS4ERR_RESET_TO_PNFS 12002
92
62enum { 93enum {
63 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 94 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
64 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 95 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 96 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 97 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 98 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
99 NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ 100 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
101 NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
102 NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
69}; 103};
70 104
71enum layoutdriver_policy_flags { 105enum layoutdriver_policy_flags {
@@ -106,7 +140,8 @@ struct pnfs_layoutdriver_type {
106 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); 140 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
107 void (*mark_request_commit) (struct nfs_page *req, 141 void (*mark_request_commit) (struct nfs_page *req,
108 struct pnfs_layout_segment *lseg, 142 struct pnfs_layout_segment *lseg,
109 struct nfs_commit_info *cinfo); 143 struct nfs_commit_info *cinfo,
144 u32 ds_commit_idx);
110 void (*clear_request_commit) (struct nfs_page *req, 145 void (*clear_request_commit) (struct nfs_page *req,
111 struct nfs_commit_info *cinfo); 146 struct nfs_commit_info *cinfo);
112 int (*scan_commit_lists) (struct nfs_commit_info *cinfo, 147 int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
@@ -154,6 +189,7 @@ struct pnfs_layout_hdr {
154 u32 plh_barrier; /* ignore lower seqids */ 189 u32 plh_barrier; /* ignore lower seqids */
155 unsigned long plh_retry_timestamp; 190 unsigned long plh_retry_timestamp;
156 unsigned long plh_flags; 191 unsigned long plh_flags;
192 enum pnfs_iomode plh_return_iomode;
157 loff_t plh_lwb; /* last write byte for layoutcommit */ 193 loff_t plh_lwb; /* last write byte for layoutcommit */
158 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 194 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
159 struct inode *plh_inode; 195 struct inode *plh_inode;
@@ -185,7 +221,7 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
185 struct pnfs_device *dev, 221 struct pnfs_device *dev,
186 struct rpc_cred *cred); 222 struct rpc_cred *cred);
187extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 223extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
188extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); 224extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
189 225
190/* pnfs.c */ 226/* pnfs.c */
191void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 227void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -198,6 +234,7 @@ void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *
198int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); 234int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
199void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 235void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
200 struct nfs_page *req, u64 wb_size); 236 struct nfs_page *req, u64 wb_size);
237void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
201int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); 238int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
202size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, 239size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
203 struct nfs_page *prev, struct nfs_page *req); 240 struct nfs_page *prev, struct nfs_page *req);
@@ -217,6 +254,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
217 bool update_barrier); 254 bool update_barrier);
218int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, 255int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
219 struct pnfs_layout_hdr *lo, 256 struct pnfs_layout_hdr *lo,
257 struct pnfs_layout_range *range,
220 struct nfs4_state *open_state); 258 struct nfs4_state *open_state);
221int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 259int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
222 struct list_head *tmp_list, 260 struct list_head *tmp_list,
@@ -233,17 +271,21 @@ int _pnfs_return_layout(struct inode *);
233int pnfs_commit_and_return_layout(struct inode *); 271int pnfs_commit_and_return_layout(struct inode *);
234void pnfs_ld_write_done(struct nfs_pgio_header *); 272void pnfs_ld_write_done(struct nfs_pgio_header *);
235void pnfs_ld_read_done(struct nfs_pgio_header *); 273void pnfs_ld_read_done(struct nfs_pgio_header *);
274int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
236struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 275struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
237 struct nfs_open_context *ctx, 276 struct nfs_open_context *ctx,
238 loff_t pos, 277 loff_t pos,
239 u64 count, 278 u64 count,
240 enum pnfs_iomode iomode, 279 enum pnfs_iomode iomode,
241 gfp_t gfp_flags); 280 gfp_t gfp_flags);
281void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
242 282
243void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 283void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
244int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); 284int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
245int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); 285int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
246struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); 286struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
287void pnfs_error_mark_layout_for_return(struct inode *inode,
288 struct pnfs_layout_segment *lseg);
247 289
248/* nfs4_deviceid_flags */ 290/* nfs4_deviceid_flags */
249enum { 291enum {
@@ -275,6 +317,39 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
275bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 317bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
276void nfs4_deviceid_purge_client(const struct nfs_client *); 318void nfs4_deviceid_purge_client(const struct nfs_client *);
277 319
320/* pnfs_nfs.c */
321void pnfs_generic_clear_request_commit(struct nfs_page *req,
322 struct nfs_commit_info *cinfo);
323void pnfs_generic_commit_release(void *calldata);
324void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
325void pnfs_generic_rw_release(void *data);
326void pnfs_generic_recover_commit_reqs(struct list_head *dst,
327 struct nfs_commit_info *cinfo);
328int pnfs_generic_commit_pagelist(struct inode *inode,
329 struct list_head *mds_pages,
330 int how,
331 struct nfs_commit_info *cinfo,
332 int (*initiate_commit)(struct nfs_commit_data *data,
333 int how));
334int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
335void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
336void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
337struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
338 gfp_t gfp_flags);
339void nfs4_pnfs_v3_ds_connect_unload(void);
340void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
341 struct nfs4_deviceid_node *devid, unsigned int timeo,
342 unsigned int retrans, u32 version, u32 minor_version,
343 rpc_authflavor_t au_flavor);
344struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
345 struct xdr_stream *xdr,
346 gfp_t gfp_flags);
347
348static inline bool nfs_have_layout(struct inode *inode)
349{
350 return NFS_I(inode)->layout != NULL;
351}
352
278static inline struct nfs4_deviceid_node * 353static inline struct nfs4_deviceid_node *
279nfs4_get_deviceid(struct nfs4_deviceid_node *d) 354nfs4_get_deviceid(struct nfs4_deviceid_node *d)
280{ 355{
@@ -282,6 +357,26 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
282 return d; 357 return d;
283} 358}
284 359
360static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
361{
362 if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
363 atomic_inc(&lo->plh_refcount);
364}
365
366static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
367{
368 if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
369 atomic_dec(&lo->plh_refcount);
370 /* wake up waiters for LAYOUTRETURN as that is not needed */
371 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
372 }
373}
374
375static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
376{
377 return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
378}
379
285static inline struct pnfs_layout_segment * 380static inline struct pnfs_layout_segment *
286pnfs_get_lseg(struct pnfs_layout_segment *lseg) 381pnfs_get_lseg(struct pnfs_layout_segment *lseg)
287{ 382{
@@ -317,16 +412,22 @@ pnfs_get_ds_info(struct inode *inode)
317 return ld->get_ds_info(inode); 412 return ld->get_ds_info(inode);
318} 413}
319 414
415static inline void
416pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
417{
418 set_bit(NFS_DEVICEID_INVALID, &node->flags);
419}
420
320static inline bool 421static inline bool
321pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 422pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
322 struct nfs_commit_info *cinfo) 423 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
323{ 424{
324 struct inode *inode = req->wb_context->dentry->d_inode; 425 struct inode *inode = req->wb_context->dentry->d_inode;
325 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 426 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
326 427
327 if (lseg == NULL || ld->mark_request_commit == NULL) 428 if (lseg == NULL || ld->mark_request_commit == NULL)
328 return false; 429 return false;
329 ld->mark_request_commit(req, lseg, cinfo); 430 ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
330 return true; 431 return true;
331} 432}
332 433
@@ -352,15 +453,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
352 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); 453 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
353} 454}
354 455
355static inline void
356pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
357 struct nfs_commit_info *cinfo)
358{
359 if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
360 return;
361 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
362}
363
364static inline struct nfs_page * 456static inline struct nfs_page *
365pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 457pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
366 struct page *page) 458 struct page *page)
@@ -427,6 +519,11 @@ static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
427#endif /* NFS_DEBUG */ 519#endif /* NFS_DEBUG */
428#else /* CONFIG_NFS_V4_1 */ 520#else /* CONFIG_NFS_V4_1 */
429 521
522static inline bool nfs_have_layout(struct inode *inode)
523{
524 return false;
525}
526
430static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 527static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
431{ 528{
432} 529}
@@ -513,7 +610,7 @@ pnfs_get_ds_info(struct inode *inode)
513 610
514static inline bool 611static inline bool
515pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 612pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
516 struct nfs_commit_info *cinfo) 613 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
517{ 614{
518 return false; 615 return false;
519} 616}
@@ -531,12 +628,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
531 return 0; 628 return 0;
532} 629}
533 630
534static inline void
535pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
536 struct nfs_commit_info *cinfo)
537{
538}
539
540static inline struct nfs_page * 631static inline struct nfs_page *
541pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 632pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
542 struct page *page) 633 struct page *page)
@@ -568,6 +659,10 @@ static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
568 return NULL; 659 return NULL;
569} 660}
570 661
662static inline void nfs4_pnfs_v3_ds_connect_unload(void)
663{
664}
665
571#endif /* CONFIG_NFS_V4_1 */ 666#endif /* CONFIG_NFS_V4_1 */
572 667
573#endif /* FS_NFS_PNFS_H */ 668#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000000..fdc4f6562bb7
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,840 @@
1/*
2 * Common NFS I/O operations for the pnfs file based
3 * layout drivers.
4 *
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
6 *
7 * Tom Haynes <loghyr@primarydata.com>
8 */
9
10#include <linux/nfs_fs.h>
11#include <linux/nfs_page.h>
12#include <linux/sunrpc/addr.h>
13#include <linux/module.h>
14
15#include "nfs4session.h"
16#include "internal.h"
17#include "pnfs.h"
18
19#define NFSDBG_FACILITY NFSDBG_PNFS
20
21void pnfs_generic_rw_release(void *data)
22{
23 struct nfs_pgio_header *hdr = data;
24
25 nfs_put_client(hdr->ds_clp);
26 hdr->mds_ops->rpc_release(data);
27}
28EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
29
30/* Fake up some data that will cause nfs_commit_release to retry the writes. */
31void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
32{
33 struct nfs_page *first = nfs_list_entry(data->pages.next);
34
35 data->task.tk_status = 0;
36 memcpy(&data->verf.verifier, &first->wb_verf,
37 sizeof(data->verf.verifier));
38 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
39}
40EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
41
42void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
43{
44 struct nfs_commit_data *wdata = data;
45
46 /* Note this may cause RPC to be resent */
47 wdata->mds_ops->rpc_call_done(task, data);
48}
49EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
50
51void pnfs_generic_commit_release(void *calldata)
52{
53 struct nfs_commit_data *data = calldata;
54
55 data->completion_ops->completion(data);
56 pnfs_put_lseg(data->lseg);
57 nfs_put_client(data->ds_clp);
58 nfs_commitdata_release(data);
59}
60EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
61
62/* The generic layer is about to remove the req from the commit list.
63 * If this will make the bucket empty, it will need to put the lseg reference.
64 * Note this must be called holding the inode (/cinfo) lock
65 */
66void
67pnfs_generic_clear_request_commit(struct nfs_page *req,
68 struct nfs_commit_info *cinfo)
69{
70 struct pnfs_layout_segment *freeme = NULL;
71
72 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
73 goto out;
74 cinfo->ds->nwritten--;
75 if (list_is_singular(&req->wb_list)) {
76 struct pnfs_commit_bucket *bucket;
77
78 bucket = list_first_entry(&req->wb_list,
79 struct pnfs_commit_bucket,
80 written);
81 freeme = bucket->wlseg;
82 bucket->wlseg = NULL;
83 }
84out:
85 nfs_request_remove_commit_list(req, cinfo);
86 pnfs_put_lseg_locked(freeme);
87}
88EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
89
90static int
91pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
92 struct nfs_commit_info *cinfo, int max)
93{
94 struct nfs_page *req, *tmp;
95 int ret = 0;
96
97 list_for_each_entry_safe(req, tmp, src, wb_list) {
98 if (!nfs_lock_request(req))
99 continue;
100 kref_get(&req->wb_kref);
101 if (cond_resched_lock(cinfo->lock))
102 list_safe_reset_next(req, tmp, wb_list);
103 nfs_request_remove_commit_list(req, cinfo);
104 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
105 nfs_list_add_request(req, dst);
106 ret++;
107 if ((ret == max) && !cinfo->dreq)
108 break;
109 }
110 return ret;
111}
112
113static int
114pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
115 struct nfs_commit_info *cinfo,
116 int max)
117{
118 struct list_head *src = &bucket->written;
119 struct list_head *dst = &bucket->committing;
120 int ret;
121
122 lockdep_assert_held(cinfo->lock);
123 ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
124 if (ret) {
125 cinfo->ds->nwritten -= ret;
126 cinfo->ds->ncommitting += ret;
127 bucket->clseg = bucket->wlseg;
128 if (list_empty(src))
129 bucket->wlseg = NULL;
130 else
131 pnfs_get_lseg(bucket->clseg);
132 }
133 return ret;
134}
135
136/* Move reqs from written to committing lists, returning count
137 * of number moved.
138 */
139int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
140 int max)
141{
142 int i, rv = 0, cnt;
143
144 lockdep_assert_held(cinfo->lock);
145 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
146 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
147 cinfo, max);
148 max -= cnt;
149 rv += cnt;
150 }
151 return rv;
152}
153EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
154
155/* Pull everything off the written lists and dump into @dst. */
156void pnfs_generic_recover_commit_reqs(struct list_head *dst,
157 struct nfs_commit_info *cinfo)
158{
159 struct pnfs_commit_bucket *b;
160 struct pnfs_layout_segment *freeme;
161 int i;
162
163 lockdep_assert_held(cinfo->lock);
164restart:
165 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
166 if (pnfs_generic_transfer_commit_list(&b->written, dst,
167 cinfo, 0)) {
168 freeme = b->wlseg;
169 b->wlseg = NULL;
170 spin_unlock(cinfo->lock);
171 pnfs_put_lseg(freeme);
172 spin_lock(cinfo->lock);
173 goto restart;
174 }
175 }
176 cinfo->ds->nwritten = 0;
177}
178EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
179
180static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
181{
182 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
183 struct pnfs_commit_bucket *bucket;
184 struct pnfs_layout_segment *freeme;
185 int i;
186
187 for (i = idx; i < fl_cinfo->nbuckets; i++) {
188 bucket = &fl_cinfo->buckets[i];
189 if (list_empty(&bucket->committing))
190 continue;
191 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
192 spin_lock(cinfo->lock);
193 freeme = bucket->clseg;
194 bucket->clseg = NULL;
195 spin_unlock(cinfo->lock);
196 pnfs_put_lseg(freeme);
197 }
198}
199
200static unsigned int
201pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
202 struct list_head *list)
203{
204 struct pnfs_ds_commit_info *fl_cinfo;
205 struct pnfs_commit_bucket *bucket;
206 struct nfs_commit_data *data;
207 int i;
208 unsigned int nreq = 0;
209
210 fl_cinfo = cinfo->ds;
211 bucket = fl_cinfo->buckets;
212 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
213 if (list_empty(&bucket->committing))
214 continue;
215 data = nfs_commitdata_alloc();
216 if (!data)
217 break;
218 data->ds_commit_index = i;
219 spin_lock(cinfo->lock);
220 data->lseg = bucket->clseg;
221 bucket->clseg = NULL;
222 spin_unlock(cinfo->lock);
223 list_add(&data->pages, list);
224 nreq++;
225 }
226
227 /* Clean up on error */
228 pnfs_generic_retry_commit(cinfo, i);
229 return nreq;
230}
231
232/* This follows nfs_commit_list pretty closely */
233int
234pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
235 int how, struct nfs_commit_info *cinfo,
236 int (*initiate_commit)(struct nfs_commit_data *data,
237 int how))
238{
239 struct nfs_commit_data *data, *tmp;
240 LIST_HEAD(list);
241 unsigned int nreq = 0;
242
243 if (!list_empty(mds_pages)) {
244 data = nfs_commitdata_alloc();
245 if (data != NULL) {
246 data->lseg = NULL;
247 list_add(&data->pages, &list);
248 nreq++;
249 } else {
250 nfs_retry_commit(mds_pages, NULL, cinfo, 0);
251 pnfs_generic_retry_commit(cinfo, 0);
252 cinfo->completion_ops->error_cleanup(NFS_I(inode));
253 return -ENOMEM;
254 }
255 }
256
257 nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
258
259 if (nreq == 0) {
260 cinfo->completion_ops->error_cleanup(NFS_I(inode));
261 goto out;
262 }
263
264 atomic_add(nreq, &cinfo->mds->rpcs_out);
265
266 list_for_each_entry_safe(data, tmp, &list, pages) {
267 list_del_init(&data->pages);
268 if (!data->lseg) {
269 nfs_init_commit(data, mds_pages, NULL, cinfo);
270 nfs_initiate_commit(NFS_CLIENT(inode), data,
271 NFS_PROTO(data->inode),
272 data->mds_ops, how, 0);
273 } else {
274 struct pnfs_commit_bucket *buckets;
275
276 buckets = cinfo->ds->buckets;
277 nfs_init_commit(data,
278 &buckets[data->ds_commit_index].committing,
279 data->lseg,
280 cinfo);
281 initiate_commit(data, how);
282 }
283 }
284out:
285 cinfo->ds->ncommitting = 0;
286 return PNFS_ATTEMPTED;
287}
288EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
289
/*
 * Data server cache
 *
 * Several device ids may map to the same data server, so cached
 * nfs4_pnfs_ds entries are reference counted:
 *  - the count is set to 1 when the entry is allocated,
 *  - it is incremented when a device id maps a data server that is already
 *    in the cache,
 *  - it is decremented when a deviceid is removed from the cache.
 *
 * nfs4_ds_cache_lock is taken around lookups, insertions and removals on
 * the nfs4_data_server_cache list in this file.
 */
static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
static LIST_HEAD(nfs4_data_server_cache);
301
302/* Debug routines */
303static void
304print_ds(struct nfs4_pnfs_ds *ds)
305{
306 if (ds == NULL) {
307 printk(KERN_WARNING "%s NULL device\n", __func__);
308 return;
309 }
310 printk(KERN_WARNING " ds %s\n"
311 " ref count %d\n"
312 " client %p\n"
313 " cl_exchange_flags %x\n",
314 ds->ds_remotestr,
315 atomic_read(&ds->ds_count), ds->ds_clp,
316 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
317}
318
319static bool
320same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
321{
322 struct sockaddr_in *a, *b;
323 struct sockaddr_in6 *a6, *b6;
324
325 if (addr1->sa_family != addr2->sa_family)
326 return false;
327
328 switch (addr1->sa_family) {
329 case AF_INET:
330 a = (struct sockaddr_in *)addr1;
331 b = (struct sockaddr_in *)addr2;
332
333 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
334 a->sin_port == b->sin_port)
335 return true;
336 break;
337
338 case AF_INET6:
339 a6 = (struct sockaddr_in6 *)addr1;
340 b6 = (struct sockaddr_in6 *)addr2;
341
342 /* LINKLOCAL addresses must have matching scope_id */
343 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
344 IPV6_ADDR_SCOPE_LINKLOCAL &&
345 a6->sin6_scope_id != b6->sin6_scope_id)
346 return false;
347
348 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
349 a6->sin6_port == b6->sin6_port)
350 return true;
351 break;
352
353 default:
354 dprintk("%s: unhandled address family: %u\n",
355 __func__, addr1->sa_family);
356 return false;
357 }
358
359 return false;
360}
361
362static bool
363_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
364 const struct list_head *dsaddrs2)
365{
366 struct nfs4_pnfs_ds_addr *da1, *da2;
367
368 /* step through both lists, comparing as we go */
369 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
370 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
371 da1 != NULL && da2 != NULL;
372 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
373 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
374 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
375 (struct sockaddr *)&da2->da_addr))
376 return false;
377 }
378 if (da1 == NULL && da2 == NULL)
379 return true;
380
381 return false;
382}
383
384/*
385 * Lookup DS by addresses. nfs4_ds_cache_lock is held
386 */
387static struct nfs4_pnfs_ds *
388_data_server_lookup_locked(const struct list_head *dsaddrs)
389{
390 struct nfs4_pnfs_ds *ds;
391
392 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
393 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
394 return ds;
395 return NULL;
396}
397
398static void destroy_ds(struct nfs4_pnfs_ds *ds)
399{
400 struct nfs4_pnfs_ds_addr *da;
401
402 dprintk("--> %s\n", __func__);
403 ifdebug(FACILITY)
404 print_ds(ds);
405
406 nfs_put_client(ds->ds_clp);
407
408 while (!list_empty(&ds->ds_addrs)) {
409 da = list_first_entry(&ds->ds_addrs,
410 struct nfs4_pnfs_ds_addr,
411 da_node);
412 list_del_init(&da->da_node);
413 kfree(da->da_remotestr);
414 kfree(da);
415 }
416
417 kfree(ds->ds_remotestr);
418 kfree(ds);
419}
420
421void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
422{
423 if (atomic_dec_and_lock(&ds->ds_count,
424 &nfs4_ds_cache_lock)) {
425 list_del_init(&ds->ds_node);
426 spin_unlock(&nfs4_ds_cache_lock);
427 destroy_ds(ds);
428 }
429}
430EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
431
432/*
433 * Create a string with a human readable address and port to avoid
434 * complicated setup around many dprinks.
435 */
436static char *
437nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
438{
439 struct nfs4_pnfs_ds_addr *da;
440 char *remotestr;
441 size_t len;
442 char *p;
443
444 len = 3; /* '{', '}' and eol */
445 list_for_each_entry(da, dsaddrs, da_node) {
446 len += strlen(da->da_remotestr) + 1; /* string plus comma */
447 }
448
449 remotestr = kzalloc(len, gfp_flags);
450 if (!remotestr)
451 return NULL;
452
453 p = remotestr;
454 *(p++) = '{';
455 len--;
456 list_for_each_entry(da, dsaddrs, da_node) {
457 size_t ll = strlen(da->da_remotestr);
458
459 if (ll > len)
460 goto out_err;
461
462 memcpy(p, da->da_remotestr, ll);
463 p += ll;
464 len -= ll;
465
466 if (len < 1)
467 goto out_err;
468 (*p++) = ',';
469 len--;
470 }
471 if (len < 2)
472 goto out_err;
473 *(p++) = '}';
474 *p = '\0';
475 return remotestr;
476out_err:
477 kfree(remotestr);
478 return NULL;
479}
480
481/*
482 * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
483 * uncached and return cached struct nfs4_pnfs_ds.
484 */
485struct nfs4_pnfs_ds *
486nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
487{
488 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
489 char *remotestr;
490
491 if (list_empty(dsaddrs)) {
492 dprintk("%s: no addresses defined\n", __func__);
493 goto out;
494 }
495
496 ds = kzalloc(sizeof(*ds), gfp_flags);
497 if (!ds)
498 goto out;
499
500 /* this is only used for debugging, so it's ok if its NULL */
501 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
502
503 spin_lock(&nfs4_ds_cache_lock);
504 tmp_ds = _data_server_lookup_locked(dsaddrs);
505 if (tmp_ds == NULL) {
506 INIT_LIST_HEAD(&ds->ds_addrs);
507 list_splice_init(dsaddrs, &ds->ds_addrs);
508 ds->ds_remotestr = remotestr;
509 atomic_set(&ds->ds_count, 1);
510 INIT_LIST_HEAD(&ds->ds_node);
511 ds->ds_clp = NULL;
512 list_add(&ds->ds_node, &nfs4_data_server_cache);
513 dprintk("%s add new data server %s\n", __func__,
514 ds->ds_remotestr);
515 } else {
516 kfree(remotestr);
517 kfree(ds);
518 atomic_inc(&tmp_ds->ds_count);
519 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
520 __func__, tmp_ds->ds_remotestr,
521 atomic_read(&tmp_ds->ds_count));
522 ds = tmp_ds;
523 }
524 spin_unlock(&nfs4_ds_cache_lock);
525out:
526 return ds;
527}
528EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
529
530static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
531{
532 might_sleep();
533 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
534 TASK_KILLABLE);
535}
536
537static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
538{
539 smp_mb__before_atomic();
540 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
541 smp_mb__after_atomic();
542 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
543}
544
545static struct nfs_client *(*get_v3_ds_connect)(
546 struct nfs_client *mds_clp,
547 const struct sockaddr *ds_addr,
548 int ds_addrlen,
549 int ds_proto,
550 unsigned int ds_timeo,
551 unsigned int ds_retrans,
552 rpc_authflavor_t au_flavor);
553
554static bool load_v3_ds_connect(void)
555{
556 if (!get_v3_ds_connect) {
557 get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
558 WARN_ON_ONCE(!get_v3_ds_connect);
559 }
560
561 return(get_v3_ds_connect != NULL);
562}
563
564void __exit nfs4_pnfs_v3_ds_connect_unload(void)
565{
566 if (get_v3_ds_connect) {
567 symbol_put(nfs3_set_ds_client);
568 get_v3_ds_connect = NULL;
569 }
570}
571EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
572
573static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
574 struct nfs4_pnfs_ds *ds,
575 unsigned int timeo,
576 unsigned int retrans,
577 rpc_authflavor_t au_flavor)
578{
579 struct nfs_client *clp = ERR_PTR(-EIO);
580 struct nfs4_pnfs_ds_addr *da;
581 int status = 0;
582
583 dprintk("--> %s DS %s au_flavor %d\n", __func__,
584 ds->ds_remotestr, au_flavor);
585
586 if (!load_v3_ds_connect())
587 goto out;
588
589 list_for_each_entry(da, &ds->ds_addrs, da_node) {
590 dprintk("%s: DS %s: trying address %s\n",
591 __func__, ds->ds_remotestr, da->da_remotestr);
592
593 clp = get_v3_ds_connect(mds_srv->nfs_client,
594 (struct sockaddr *)&da->da_addr,
595 da->da_addrlen, IPPROTO_TCP,
596 timeo, retrans, au_flavor);
597 if (!IS_ERR(clp))
598 break;
599 }
600
601 if (IS_ERR(clp)) {
602 status = PTR_ERR(clp);
603 goto out;
604 }
605
606 smp_wmb();
607 ds->ds_clp = clp;
608 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
609out:
610 return status;
611}
612
613static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
614 struct nfs4_pnfs_ds *ds,
615 unsigned int timeo,
616 unsigned int retrans,
617 u32 minor_version,
618 rpc_authflavor_t au_flavor)
619{
620 struct nfs_client *clp = ERR_PTR(-EIO);
621 struct nfs4_pnfs_ds_addr *da;
622 int status = 0;
623
624 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
625 au_flavor);
626
627 list_for_each_entry(da, &ds->ds_addrs, da_node) {
628 dprintk("%s: DS %s: trying address %s\n",
629 __func__, ds->ds_remotestr, da->da_remotestr);
630
631 clp = nfs4_set_ds_client(mds_srv->nfs_client,
632 (struct sockaddr *)&da->da_addr,
633 da->da_addrlen, IPPROTO_TCP,
634 timeo, retrans, minor_version,
635 au_flavor);
636 if (!IS_ERR(clp))
637 break;
638 }
639
640 if (IS_ERR(clp)) {
641 status = PTR_ERR(clp);
642 goto out;
643 }
644
645 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
646 if (status)
647 goto out_put;
648
649 smp_wmb();
650 ds->ds_clp = clp;
651 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
652out:
653 return status;
654out_put:
655 nfs_put_client(clp);
656 goto out;
657}
658
659/*
660 * Create an rpc connection to the nfs4_pnfs_ds data server.
661 * Currently only supports IPv4 and IPv6 addresses.
662 * If connection fails, make devid unavailable.
663 */
664void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
665 struct nfs4_deviceid_node *devid, unsigned int timeo,
666 unsigned int retrans, u32 version,
667 u32 minor_version, rpc_authflavor_t au_flavor)
668{
669 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
670 int err = 0;
671
672 if (version == 3) {
673 err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
674 retrans, au_flavor);
675 } else if (version == 4) {
676 err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
677 retrans, minor_version,
678 au_flavor);
679 } else {
680 dprintk("%s: unsupported DS version %d\n", __func__,
681 version);
682 err = -EPROTONOSUPPORT;
683 }
684
685 if (err)
686 nfs4_mark_deviceid_unavailable(devid);
687 nfs4_clear_ds_conn_bit(ds);
688 } else {
689 nfs4_wait_ds_connect(ds);
690 }
691}
692EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
693
694/*
695 * Currently only supports ipv4, ipv6 and one multi-path address.
696 */
697struct nfs4_pnfs_ds_addr *
698nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
699{
700 struct nfs4_pnfs_ds_addr *da = NULL;
701 char *buf, *portstr;
702 __be16 port;
703 int nlen, rlen;
704 int tmp[2];
705 __be32 *p;
706 char *netid, *match_netid;
707 size_t len, match_netid_len;
708 char *startsep = "";
709 char *endsep = "";
710
711
712 /* r_netid */
713 p = xdr_inline_decode(xdr, 4);
714 if (unlikely(!p))
715 goto out_err;
716 nlen = be32_to_cpup(p++);
717
718 p = xdr_inline_decode(xdr, nlen);
719 if (unlikely(!p))
720 goto out_err;
721
722 netid = kmalloc(nlen+1, gfp_flags);
723 if (unlikely(!netid))
724 goto out_err;
725
726 netid[nlen] = '\0';
727 memcpy(netid, p, nlen);
728
729 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
730 p = xdr_inline_decode(xdr, 4);
731 if (unlikely(!p))
732 goto out_free_netid;
733 rlen = be32_to_cpup(p);
734
735 p = xdr_inline_decode(xdr, rlen);
736 if (unlikely(!p))
737 goto out_free_netid;
738
739 /* port is ".ABC.DEF", 8 chars max */
740 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
741 dprintk("%s: Invalid address, length %d\n", __func__,
742 rlen);
743 goto out_free_netid;
744 }
745 buf = kmalloc(rlen + 1, gfp_flags);
746 if (!buf) {
747 dprintk("%s: Not enough memory\n", __func__);
748 goto out_free_netid;
749 }
750 buf[rlen] = '\0';
751 memcpy(buf, p, rlen);
752
753 /* replace port '.' with '-' */
754 portstr = strrchr(buf, '.');
755 if (!portstr) {
756 dprintk("%s: Failed finding expected dot in port\n",
757 __func__);
758 goto out_free_buf;
759 }
760 *portstr = '-';
761
762 /* find '.' between address and port */
763 portstr = strrchr(buf, '.');
764 if (!portstr) {
765 dprintk("%s: Failed finding expected dot between address and "
766 "port\n", __func__);
767 goto out_free_buf;
768 }
769 *portstr = '\0';
770
771 da = kzalloc(sizeof(*da), gfp_flags);
772 if (unlikely(!da))
773 goto out_free_buf;
774
775 INIT_LIST_HEAD(&da->da_node);
776
777 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
778 sizeof(da->da_addr))) {
779 dprintk("%s: error parsing address %s\n", __func__, buf);
780 goto out_free_da;
781 }
782
783 portstr++;
784 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
785 port = htons((tmp[0] << 8) | (tmp[1]));
786
787 switch (da->da_addr.ss_family) {
788 case AF_INET:
789 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
790 da->da_addrlen = sizeof(struct sockaddr_in);
791 match_netid = "tcp";
792 match_netid_len = 3;
793 break;
794
795 case AF_INET6:
796 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
797 da->da_addrlen = sizeof(struct sockaddr_in6);
798 match_netid = "tcp6";
799 match_netid_len = 4;
800 startsep = "[";
801 endsep = "]";
802 break;
803
804 default:
805 dprintk("%s: unsupported address family: %u\n",
806 __func__, da->da_addr.ss_family);
807 goto out_free_da;
808 }
809
810 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
811 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
812 __func__, netid, match_netid);
813 goto out_free_da;
814 }
815
816 /* save human readable address */
817 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
818 da->da_remotestr = kzalloc(len, gfp_flags);
819
820 /* NULL is ok, only used for dprintk */
821 if (da->da_remotestr)
822 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
823 buf, endsep, ntohs(port));
824
825 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
826 kfree(buf);
827 kfree(netid);
828 return da;
829
830out_free_da:
831 kfree(da);
832out_free_buf:
833 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
834 kfree(buf);
835out_free_netid:
836 kfree(netid);
837out_err:
838 return NULL;
839}
840EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c91a4799c562..568ecf0a880f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
70 70
71void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) 71void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
72{ 72{
73 struct nfs_pgio_mirror *mirror;
74
73 pgio->pg_ops = &nfs_pgio_rw_ops; 75 pgio->pg_ops = &nfs_pgio_rw_ops;
74 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; 76
77 /* read path should never have more than one mirror */
78 WARN_ON_ONCE(pgio->pg_mirror_count != 1);
79
80 mirror = &pgio->pg_mirrors[0];
81 mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
75} 82}
76EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); 83EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
77 84
@@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
81 struct nfs_page *new; 88 struct nfs_page *new;
82 unsigned int len; 89 unsigned int len;
83 struct nfs_pageio_descriptor pgio; 90 struct nfs_pageio_descriptor pgio;
91 struct nfs_pgio_mirror *pgm;
84 92
85 len = nfs_page_length(page); 93 len = nfs_page_length(page);
86 if (len == 0) 94 if (len == 0)
@@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
97 &nfs_async_read_completion_ops); 105 &nfs_async_read_completion_ops);
98 nfs_pageio_add_request(&pgio, new); 106 nfs_pageio_add_request(&pgio, new);
99 nfs_pageio_complete(&pgio); 107 nfs_pageio_complete(&pgio);
100 NFS_I(inode)->read_io += pgio.pg_bytes_written; 108
109 /* It doesn't make sense to do mirrored reads! */
110 WARN_ON_ONCE(pgio.pg_mirror_count != 1);
111
112 pgm = &pgio.pg_mirrors[0];
113 NFS_I(inode)->read_io += pgm->pg_bytes_written;
114
101 return 0; 115 return 0;
102} 116}
103 117
@@ -168,13 +182,14 @@ out:
168 182
169static void nfs_initiate_read(struct nfs_pgio_header *hdr, 183static void nfs_initiate_read(struct nfs_pgio_header *hdr,
170 struct rpc_message *msg, 184 struct rpc_message *msg,
185 const struct nfs_rpc_ops *rpc_ops,
171 struct rpc_task_setup *task_setup_data, int how) 186 struct rpc_task_setup *task_setup_data, int how)
172{ 187{
173 struct inode *inode = hdr->inode; 188 struct inode *inode = hdr->inode;
174 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 189 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
175 190
176 task_setup_data->flags |= swap_flags; 191 task_setup_data->flags |= swap_flags;
177 NFS_PROTO(inode)->read_setup(hdr, msg); 192 rpc_ops->read_setup(hdr, msg);
178} 193}
179 194
180static void 195static void
@@ -351,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
351 struct list_head *pages, unsigned nr_pages) 366 struct list_head *pages, unsigned nr_pages)
352{ 367{
353 struct nfs_pageio_descriptor pgio; 368 struct nfs_pageio_descriptor pgio;
369 struct nfs_pgio_mirror *pgm;
354 struct nfs_readdesc desc = { 370 struct nfs_readdesc desc = {
355 .pgio = &pgio, 371 .pgio = &pgio,
356 }; 372 };
@@ -386,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
386 &nfs_async_read_completion_ops); 402 &nfs_async_read_completion_ops);
387 403
388 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 404 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
389
390 nfs_pageio_complete(&pgio); 405 nfs_pageio_complete(&pgio);
391 NFS_I(inode)->read_io += pgio.pg_bytes_written; 406
392 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 407 /* It doesn't make sense to do mirrored reads! */
408 WARN_ON_ONCE(pgio.pg_mirror_count != 1);
409
410 pgm = &pgio.pg_mirrors[0];
411 NFS_I(inode)->read_io += pgm->pg_bytes_written;
412 npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
413 PAGE_CACHE_SHIFT;
393 nfs_add_stats(inode, NFSIOS_READPAGES, npages); 414 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
394read_complete: 415read_complete:
395 put_nfs_open_context(desc.ctx); 416 put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 31a11b0e885d..368d9395d2e7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -405,12 +405,15 @@ void __exit unregister_nfs_fs(void)
405 unregister_filesystem(&nfs_fs_type); 405 unregister_filesystem(&nfs_fs_type);
406} 406}
407 407
408void nfs_sb_active(struct super_block *sb) 408bool nfs_sb_active(struct super_block *sb)
409{ 409{
410 struct nfs_server *server = NFS_SB(sb); 410 struct nfs_server *server = NFS_SB(sb);
411 411
412 if (atomic_inc_return(&server->active) == 1) 412 if (!atomic_inc_not_zero(&sb->s_active))
413 atomic_inc(&sb->s_active); 413 return false;
414 if (atomic_inc_return(&server->active) != 1)
415 atomic_dec(&sb->s_active);
416 return true;
414} 417}
415EXPORT_SYMBOL_GPL(nfs_sb_active); 418EXPORT_SYMBOL_GPL(nfs_sb_active);
416 419
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4ae66f416eb9..bcf83e535f29 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -473,13 +473,18 @@ try_again:
473 do { 473 do {
474 /* 474 /*
475 * Subrequests are always contiguous, non overlapping 475 * Subrequests are always contiguous, non overlapping
476 * and in order. If not, it's a programming error. 476 * and in order - but may be repeated (mirrored writes).
477 */ 477 */
478 WARN_ON_ONCE(subreq->wb_offset != 478 if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
479 (head->wb_offset + total_bytes)); 479 /* keep track of how many bytes this group covers */
480 480 total_bytes += subreq->wb_bytes;
481 /* keep track of how many bytes this group covers */ 481 } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
482 total_bytes += subreq->wb_bytes; 482 ((subreq->wb_offset + subreq->wb_bytes) >
483 (head->wb_offset + total_bytes)))) {
484 nfs_page_group_unlock(head);
485 spin_unlock(&inode->i_lock);
486 return ERR_PTR(-EIO);
487 }
483 488
484 if (!nfs_lock_request(subreq)) { 489 if (!nfs_lock_request(subreq)) {
485 /* releases page group bit lock and 490 /* releases page group bit lock and
@@ -842,9 +847,9 @@ EXPORT_SYMBOL_GPL(nfs_init_cinfo);
842 */ 847 */
843void 848void
844nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 849nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
845 struct nfs_commit_info *cinfo) 850 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
846{ 851{
847 if (pnfs_mark_request_commit(req, lseg, cinfo)) 852 if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
848 return; 853 return;
849 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); 854 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
850} 855}
@@ -900,7 +905,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
900 } 905 }
901 if (nfs_write_need_commit(hdr)) { 906 if (nfs_write_need_commit(hdr)) {
902 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); 907 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
903 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 908 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
909 hdr->pgio_mirror_idx);
904 goto next; 910 goto next;
905 } 911 }
906remove_req: 912remove_req:
@@ -1269,15 +1275,15 @@ static int flush_task_priority(int how)
1269 1275
1270static void nfs_initiate_write(struct nfs_pgio_header *hdr, 1276static void nfs_initiate_write(struct nfs_pgio_header *hdr,
1271 struct rpc_message *msg, 1277 struct rpc_message *msg,
1278 const struct nfs_rpc_ops *rpc_ops,
1272 struct rpc_task_setup *task_setup_data, int how) 1279 struct rpc_task_setup *task_setup_data, int how)
1273{ 1280{
1274 struct inode *inode = hdr->inode;
1275 int priority = flush_task_priority(how); 1281 int priority = flush_task_priority(how);
1276 1282
1277 task_setup_data->priority = priority; 1283 task_setup_data->priority = priority;
1278 NFS_PROTO(inode)->write_setup(hdr, msg); 1284 rpc_ops->write_setup(hdr, msg);
1279 1285
1280 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, 1286 nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
1281 &task_setup_data->rpc_client, msg, hdr); 1287 &task_setup_data->rpc_client, msg, hdr);
1282} 1288}
1283 1289
@@ -1327,8 +1333,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
1327 1333
1328void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) 1334void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
1329{ 1335{
1336 struct nfs_pgio_mirror *mirror;
1337
1330 pgio->pg_ops = &nfs_pgio_rw_ops; 1338 pgio->pg_ops = &nfs_pgio_rw_ops;
1331 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; 1339
1340 nfs_pageio_stop_mirroring(pgio);
1341
1342 mirror = &pgio->pg_mirrors[0];
1343 mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
1332} 1344}
1333EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); 1345EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1334 1346
@@ -1494,6 +1506,7 @@ void nfs_commitdata_release(struct nfs_commit_data *data)
1494EXPORT_SYMBOL_GPL(nfs_commitdata_release); 1506EXPORT_SYMBOL_GPL(nfs_commitdata_release);
1495 1507
1496int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, 1508int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1509 const struct nfs_rpc_ops *nfs_ops,
1497 const struct rpc_call_ops *call_ops, 1510 const struct rpc_call_ops *call_ops,
1498 int how, int flags) 1511 int how, int flags)
1499{ 1512{
@@ -1515,7 +1528,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1515 .priority = priority, 1528 .priority = priority,
1516 }; 1529 };
1517 /* Set up the initial task struct. */ 1530 /* Set up the initial task struct. */
1518 NFS_PROTO(data->inode)->commit_setup(data, &msg); 1531 nfs_ops->commit_setup(data, &msg);
1519 1532
1520 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1533 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1521 1534
@@ -1583,14 +1596,15 @@ EXPORT_SYMBOL_GPL(nfs_init_commit);
1583 1596
1584void nfs_retry_commit(struct list_head *page_list, 1597void nfs_retry_commit(struct list_head *page_list,
1585 struct pnfs_layout_segment *lseg, 1598 struct pnfs_layout_segment *lseg,
1586 struct nfs_commit_info *cinfo) 1599 struct nfs_commit_info *cinfo,
1600 u32 ds_commit_idx)
1587{ 1601{
1588 struct nfs_page *req; 1602 struct nfs_page *req;
1589 1603
1590 while (!list_empty(page_list)) { 1604 while (!list_empty(page_list)) {
1591 req = nfs_list_entry(page_list->next); 1605 req = nfs_list_entry(page_list->next);
1592 nfs_list_remove_request(req); 1606 nfs_list_remove_request(req);
1593 nfs_mark_request_commit(req, lseg, cinfo); 1607 nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
1594 if (!cinfo->dreq) { 1608 if (!cinfo->dreq) {
1595 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1609 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1596 dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, 1610 dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
@@ -1618,10 +1632,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
1618 /* Set up the argument struct */ 1632 /* Set up the argument struct */
1619 nfs_init_commit(data, head, NULL, cinfo); 1633 nfs_init_commit(data, head, NULL, cinfo);
1620 atomic_inc(&cinfo->mds->rpcs_out); 1634 atomic_inc(&cinfo->mds->rpcs_out);
1621 return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, 1635 return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
1622 how, 0); 1636 data->mds_ops, how, 0);
1623 out_bad: 1637 out_bad:
1624 nfs_retry_commit(head, NULL, cinfo); 1638 nfs_retry_commit(head, NULL, cinfo, 0);
1625 cinfo->completion_ops->error_cleanup(NFS_I(inode)); 1639 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1626 return -ENOMEM; 1640 return -ENOMEM;
1627} 1641}
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 022b761dbf0a..de7c91ca427e 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -516,6 +516,7 @@ enum pnfs_layouttype {
516 LAYOUT_NFSV4_1_FILES = 1, 516 LAYOUT_NFSV4_1_FILES = 1,
517 LAYOUT_OSD2_OBJECTS = 2, 517 LAYOUT_OSD2_OBJECTS = 2,
518 LAYOUT_BLOCK_VOLUME = 3, 518 LAYOUT_BLOCK_VOLUME = 3,
519 LAYOUT_FLEX_FILES = 4,
519}; 520};
520 521
521/* used for both layout return and recall */ 522/* used for both layout return and recall */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index ddea982355f3..5e1273d4de14 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -77,10 +77,6 @@ struct nfs_client {
77 /* Client owner identifier */ 77 /* Client owner identifier */
78 const char * cl_owner_id; 78 const char * cl_owner_id;
79 79
80 /* Our own IP address, as a null-terminated string.
81 * This is used to generate the mv0 callback address.
82 */
83 char cl_ipaddr[48];
84 u32 cl_cb_ident; /* v4.0 callback identifier */ 80 u32 cl_cb_ident; /* v4.0 callback identifier */
85 const struct nfs4_minor_version_ops *cl_mvops; 81 const struct nfs4_minor_version_ops *cl_mvops;
86 unsigned long cl_mig_gen; 82 unsigned long cl_mig_gen;
@@ -108,6 +104,11 @@ struct nfs_client {
108#define NFS_SP4_MACH_CRED_COMMIT 6 /* COMMIT */ 104#define NFS_SP4_MACH_CRED_COMMIT 6 /* COMMIT */
109#endif /* CONFIG_NFS_V4 */ 105#endif /* CONFIG_NFS_V4 */
110 106
107 /* Our own IP address, as a null-terminated string.
108 * This is used to generate the mv0 callback address.
109 */
110 char cl_ipaddr[48];
111
111#ifdef CONFIG_NFS_FSCACHE 112#ifdef CONFIG_NFS_FSCACHE
112 struct fscache_cookie *fscache; /* client index cache cookie */ 113 struct fscache_cookie *fscache; /* client index cache cookie */
113#endif 114#endif
diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h
index 0f4b79da6584..333844e38f66 100644
--- a/include/linux/nfs_idmap.h
+++ b/include/linux/nfs_idmap.h
@@ -73,5 +73,7 @@ int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t
73int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t); 73int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t);
74int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t); 74int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t);
75 75
76int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res);
77
76extern unsigned int nfs_idmap_cache_timeout; 78extern unsigned int nfs_idmap_cache_timeout;
77#endif /* NFS_IDMAP_H */ 79#endif /* NFS_IDMAP_H */
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 6c3e06ee2fb7..3eb072dbce83 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -58,6 +58,9 @@ struct nfs_pageio_ops {
58 size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, 58 size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *,
59 struct nfs_page *); 59 struct nfs_page *);
60 int (*pg_doio)(struct nfs_pageio_descriptor *); 60 int (*pg_doio)(struct nfs_pageio_descriptor *);
61 unsigned int (*pg_get_mirror_count)(struct nfs_pageio_descriptor *,
62 struct nfs_page *);
63 void (*pg_cleanup)(struct nfs_pageio_descriptor *);
61}; 64};
62 65
63struct nfs_rw_ops { 66struct nfs_rw_ops {
@@ -69,18 +72,21 @@ struct nfs_rw_ops {
69 struct inode *); 72 struct inode *);
70 void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *); 73 void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
71 void (*rw_initiate)(struct nfs_pgio_header *, struct rpc_message *, 74 void (*rw_initiate)(struct nfs_pgio_header *, struct rpc_message *,
75 const struct nfs_rpc_ops *,
72 struct rpc_task_setup *, int); 76 struct rpc_task_setup *, int);
73}; 77};
74 78
75struct nfs_pageio_descriptor { 79struct nfs_pgio_mirror {
76 struct list_head pg_list; 80 struct list_head pg_list;
77 unsigned long pg_bytes_written; 81 unsigned long pg_bytes_written;
78 size_t pg_count; 82 size_t pg_count;
79 size_t pg_bsize; 83 size_t pg_bsize;
80 unsigned int pg_base; 84 unsigned int pg_base;
81 unsigned char pg_moreio : 1, 85 unsigned char pg_recoalesce : 1;
82 pg_recoalesce : 1; 86};
83 87
88struct nfs_pageio_descriptor {
89 unsigned char pg_moreio : 1;
84 struct inode *pg_inode; 90 struct inode *pg_inode;
85 const struct nfs_pageio_ops *pg_ops; 91 const struct nfs_pageio_ops *pg_ops;
86 const struct nfs_rw_ops *pg_rw_ops; 92 const struct nfs_rw_ops *pg_rw_ops;
@@ -91,8 +97,18 @@ struct nfs_pageio_descriptor {
91 struct pnfs_layout_segment *pg_lseg; 97 struct pnfs_layout_segment *pg_lseg;
92 struct nfs_direct_req *pg_dreq; 98 struct nfs_direct_req *pg_dreq;
93 void *pg_layout_private; 99 void *pg_layout_private;
100 unsigned int pg_bsize; /* default bsize for mirrors */
101
102 u32 pg_mirror_count;
103 struct nfs_pgio_mirror *pg_mirrors;
104 struct nfs_pgio_mirror pg_mirrors_static[1];
105 struct nfs_pgio_mirror *pg_mirrors_dynamic;
106 u32 pg_mirror_idx; /* current mirror */
94}; 107};
95 108
109/* arbitrarily selected limit to number of mirrors */
110#define NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX 16
111
96#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) 112#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
97 113
98extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, 114extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 467c84efb596..38d96ba935c2 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -285,6 +285,7 @@ struct nfs4_layoutcommit_data {
285 struct nfs_fattr fattr; 285 struct nfs_fattr fattr;
286 struct list_head lseg_list; 286 struct list_head lseg_list;
287 struct rpc_cred *cred; 287 struct rpc_cred *cred;
288 struct inode *inode;
288 struct nfs4_layoutcommit_args args; 289 struct nfs4_layoutcommit_args args;
289 struct nfs4_layoutcommit_res res; 290 struct nfs4_layoutcommit_res res;
290}; 291};
@@ -293,6 +294,7 @@ struct nfs4_layoutreturn_args {
293 struct nfs4_sequence_args seq_args; 294 struct nfs4_sequence_args seq_args;
294 struct pnfs_layout_hdr *layout; 295 struct pnfs_layout_hdr *layout;
295 struct inode *inode; 296 struct inode *inode;
297 struct pnfs_layout_range range;
296 nfs4_stateid stateid; 298 nfs4_stateid stateid;
297 __u32 layout_type; 299 __u32 layout_type;
298}; 300};
@@ -308,6 +310,7 @@ struct nfs4_layoutreturn {
308 struct nfs4_layoutreturn_res res; 310 struct nfs4_layoutreturn_res res;
309 struct rpc_cred *cred; 311 struct rpc_cred *cred;
310 struct nfs_client *clp; 312 struct nfs_client *clp;
313 struct inode *inode;
311 int rpc_status; 314 int rpc_status;
312}; 315};
313 316
@@ -325,6 +328,7 @@ struct nfs_openargs {
325 struct nfs_seqid * seqid; 328 struct nfs_seqid * seqid;
326 int open_flags; 329 int open_flags;
327 fmode_t fmode; 330 fmode_t fmode;
331 u32 share_access;
328 u32 access; 332 u32 access;
329 __u64 clientid; 333 __u64 clientid;
330 struct stateowner_id id; 334 struct stateowner_id id;
@@ -389,9 +393,10 @@ struct nfs_open_confirmres {
389struct nfs_closeargs { 393struct nfs_closeargs {
390 struct nfs4_sequence_args seq_args; 394 struct nfs4_sequence_args seq_args;
391 struct nfs_fh * fh; 395 struct nfs_fh * fh;
392 nfs4_stateid * stateid; 396 nfs4_stateid stateid;
393 struct nfs_seqid * seqid; 397 struct nfs_seqid * seqid;
394 fmode_t fmode; 398 fmode_t fmode;
399 u32 share_access;
395 const u32 * bitmask; 400 const u32 * bitmask;
396}; 401};
397 402
@@ -416,12 +421,13 @@ struct nfs_lock_args {
416 struct nfs_fh * fh; 421 struct nfs_fh * fh;
417 struct file_lock * fl; 422 struct file_lock * fl;
418 struct nfs_seqid * lock_seqid; 423 struct nfs_seqid * lock_seqid;
419 nfs4_stateid * lock_stateid; 424 nfs4_stateid lock_stateid;
420 struct nfs_seqid * open_seqid; 425 struct nfs_seqid * open_seqid;
421 nfs4_stateid * open_stateid; 426 nfs4_stateid open_stateid;
422 struct nfs_lowner lock_owner; 427 struct nfs_lowner lock_owner;
423 unsigned char block : 1; 428 unsigned char block : 1;
424 unsigned char reclaim : 1; 429 unsigned char reclaim : 1;
430 unsigned char new_lock : 1;
425 unsigned char new_lock_owner : 1; 431 unsigned char new_lock_owner : 1;
426}; 432};
427 433
@@ -437,7 +443,7 @@ struct nfs_locku_args {
437 struct nfs_fh * fh; 443 struct nfs_fh * fh;
438 struct file_lock * fl; 444 struct file_lock * fl;
439 struct nfs_seqid * seqid; 445 struct nfs_seqid * seqid;
440 nfs4_stateid * stateid; 446 nfs4_stateid stateid;
441}; 447};
442 448
443struct nfs_locku_res { 449struct nfs_locku_res {
@@ -513,6 +519,7 @@ struct nfs_pgio_res {
513 struct nfs4_sequence_res seq_res; 519 struct nfs4_sequence_res seq_res;
514 struct nfs_fattr * fattr; 520 struct nfs_fattr * fattr;
515 __u32 count; 521 __u32 count;
522 __u32 op_status;
516 int eof; /* used by read */ 523 int eof; /* used by read */
517 struct nfs_writeverf * verf; /* used by write */ 524 struct nfs_writeverf * verf; /* used by write */
518 const struct nfs_server *server; /* used by write */ 525 const struct nfs_server *server; /* used by write */
@@ -532,6 +539,7 @@ struct nfs_commitargs {
532 539
533struct nfs_commitres { 540struct nfs_commitres {
534 struct nfs4_sequence_res seq_res; 541 struct nfs4_sequence_res seq_res;
542 __u32 op_status;
535 struct nfs_fattr *fattr; 543 struct nfs_fattr *fattr;
536 struct nfs_writeverf *verf; 544 struct nfs_writeverf *verf;
537 const struct nfs_server *server; 545 const struct nfs_server *server;
@@ -1325,7 +1333,8 @@ struct nfs_pgio_header {
1325 __u64 mds_offset; /* Filelayout dense stripe */ 1333 __u64 mds_offset; /* Filelayout dense stripe */
1326 struct nfs_page_array page_array; 1334 struct nfs_page_array page_array;
1327 struct nfs_client *ds_clp; /* pNFS data server */ 1335 struct nfs_client *ds_clp; /* pNFS data server */
1328 int ds_idx; /* ds index if ds_clp is set */ 1336 int ds_commit_idx; /* ds index if ds_clp is set */
1337 int pgio_mirror_idx;/* mirror index in pgio layer */
1329}; 1338};
1330 1339
1331struct nfs_mds_commit_info { 1340struct nfs_mds_commit_info {
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index d86acc63b25f..598ba80ec30c 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -57,7 +57,7 @@ struct rpc_clnt {
57 const struct rpc_timeout *cl_timeout; /* Timeout strategy */ 57 const struct rpc_timeout *cl_timeout; /* Timeout strategy */
58 58
59 int cl_nodelen; /* nodename length */ 59 int cl_nodelen; /* nodename length */
60 char cl_nodename[UNX_MAXNODENAME]; 60 char cl_nodename[UNX_MAXNODENAME+1];
61 struct rpc_pipe_dir_head cl_pipedir_objects; 61 struct rpc_pipe_dir_head cl_pipedir_objects;
62 struct rpc_clnt * cl_parent; /* Points to parent of clones */ 62 struct rpc_clnt * cl_parent; /* Points to parent of clones */
63 struct rpc_rtt cl_rtt_default; 63 struct rpc_rtt cl_rtt_default;
@@ -112,6 +112,7 @@ struct rpc_create_args {
112 struct sockaddr *saddress; 112 struct sockaddr *saddress;
113 const struct rpc_timeout *timeout; 113 const struct rpc_timeout *timeout;
114 const char *servername; 114 const char *servername;
115 const char *nodename;
115 const struct rpc_program *program; 116 const struct rpc_program *program;
116 u32 prognumber; /* overrides program->number */ 117 u32 prognumber; /* overrides program->number */
117 u32 version; 118 u32 version;
diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h
index eecb5a71e6c0..7e61a17030a4 100644
--- a/include/linux/sunrpc/metrics.h
+++ b/include/linux/sunrpc/metrics.h
@@ -79,6 +79,8 @@ struct rpc_clnt;
79struct rpc_iostats * rpc_alloc_iostats(struct rpc_clnt *); 79struct rpc_iostats * rpc_alloc_iostats(struct rpc_clnt *);
80void rpc_count_iostats(const struct rpc_task *, 80void rpc_count_iostats(const struct rpc_task *,
81 struct rpc_iostats *); 81 struct rpc_iostats *);
82void rpc_count_iostats_metrics(const struct rpc_task *,
83 struct rpc_iostats *);
82void rpc_print_iostats(struct seq_file *, struct rpc_clnt *); 84void rpc_print_iostats(struct seq_file *, struct rpc_clnt *);
83void rpc_free_iostats(struct rpc_iostats *); 85void rpc_free_iostats(struct rpc_iostats *);
84 86
@@ -87,6 +89,8 @@ void rpc_free_iostats(struct rpc_iostats *);
87static inline struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { return NULL; } 89static inline struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { return NULL; }
88static inline void rpc_count_iostats(const struct rpc_task *task, 90static inline void rpc_count_iostats(const struct rpc_task *task,
89 struct rpc_iostats *stats) {} 91 struct rpc_iostats *stats) {}
92static inline void rpc_count_iostats_metrics(const struct rpc_task *,
93 struct rpc_iostats *) {}
90static inline void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) {} 94static inline void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) {}
91static inline void rpc_free_iostats(struct rpc_iostats *stats) {} 95static inline void rpc_free_iostats(struct rpc_iostats *stats) {}
92 96
diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
index b78f16b1dea3..f33c5a4d6fe4 100644
--- a/include/linux/sunrpc/rpc_rdma.h
+++ b/include/linux/sunrpc/rpc_rdma.h
@@ -42,6 +42,9 @@
42 42
43#include <linux/types.h> 43#include <linux/types.h>
44 44
45#define RPCRDMA_VERSION 1
46#define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION)
47
45struct rpcrdma_segment { 48struct rpcrdma_segment {
46 __be32 rs_handle; /* Registered memory handle */ 49 __be32 rs_handle; /* Registered memory handle */
47 __be32 rs_length; /* Length of the chunk in bytes */ 50 __be32 rs_length; /* Length of the chunk in bytes */
@@ -95,7 +98,10 @@ struct rpcrdma_msg {
95 } rm_body; 98 } rm_body;
96}; 99};
97 100
98#define RPCRDMA_HDRLEN_MIN 28 101/*
102 * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks
103 */
104#define RPCRDMA_HDRLEN_MIN (sizeof(__be32) * 7)
99 105
100enum rpcrdma_errcode { 106enum rpcrdma_errcode {
101 ERR_VERS = 1, 107 ERR_VERS = 1,
@@ -115,4 +121,10 @@ enum rpcrdma_proc {
115 RDMA_ERROR = 4 /* An RPC RDMA encoding error */ 121 RDMA_ERROR = 4 /* An RPC RDMA encoding error */
116}; 122};
117 123
124#define rdma_msg cpu_to_be32(RDMA_MSG)
125#define rdma_nomsg cpu_to_be32(RDMA_NOMSG)
126#define rdma_msgp cpu_to_be32(RDMA_MSGP)
127#define rdma_done cpu_to_be32(RDMA_DONE)
128#define rdma_error cpu_to_be32(RDMA_ERROR)
129
118#endif /* _LINUX_SUNRPC_RPC_RDMA_H */ 130#endif /* _LINUX_SUNRPC_RPC_RDMA_H */
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 975da754c778..ddfe88f52219 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -63,8 +63,6 @@ extern atomic_t rdma_stat_rq_prod;
63extern atomic_t rdma_stat_sq_poll; 63extern atomic_t rdma_stat_sq_poll;
64extern atomic_t rdma_stat_sq_prod; 64extern atomic_t rdma_stat_sq_prod;
65 65
66#define RPCRDMA_VERSION 1
67
68/* 66/*
69 * Contexts are built when an RDMA request is created and are a 67 * Contexts are built when an RDMA request is created and are a
70 * record of the resources that can be recovered when the request 68 * record of the resources that can be recovered when the request
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 9d27ac45b909..8b93ef53df3c 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -347,6 +347,9 @@ void xprt_force_disconnect(struct rpc_xprt *xprt);
347void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); 347void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
348int xs_swapper(struct rpc_xprt *xprt, int enable); 348int xs_swapper(struct rpc_xprt *xprt, int enable);
349 349
350bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
351void xprt_unlock_connect(struct rpc_xprt *, void *);
352
350/* 353/*
351 * Reserved bit positions in xprt->state 354 * Reserved bit positions in xprt->state
352 */ 355 */
@@ -357,10 +360,7 @@ int xs_swapper(struct rpc_xprt *xprt, int enable);
357#define XPRT_BOUND (4) 360#define XPRT_BOUND (4)
358#define XPRT_BINDING (5) 361#define XPRT_BINDING (5)
359#define XPRT_CLOSING (6) 362#define XPRT_CLOSING (6)
360#define XPRT_CONNECTION_ABORT (7)
361#define XPRT_CONNECTION_CLOSE (8)
362#define XPRT_CONGESTED (9) 363#define XPRT_CONGESTED (9)
363#define XPRT_CONNECTION_REUSE (10)
364 364
365static inline void xprt_set_connected(struct rpc_xprt *xprt) 365static inline void xprt_set_connected(struct rpc_xprt *xprt)
366{ 366{
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 05da12a33945..612aa73bbc60 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -286,10 +286,8 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt,
286 286
287static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) 287static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename)
288{ 288{
289 clnt->cl_nodelen = strlen(nodename); 289 clnt->cl_nodelen = strlcpy(clnt->cl_nodename,
290 if (clnt->cl_nodelen > UNX_MAXNODENAME) 290 nodename, sizeof(clnt->cl_nodename));
291 clnt->cl_nodelen = UNX_MAXNODENAME;
292 memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen);
293} 291}
294 292
295static int rpc_client_register(struct rpc_clnt *clnt, 293static int rpc_client_register(struct rpc_clnt *clnt,
@@ -365,6 +363,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
365 const struct rpc_version *version; 363 const struct rpc_version *version;
366 struct rpc_clnt *clnt = NULL; 364 struct rpc_clnt *clnt = NULL;
367 const struct rpc_timeout *timeout; 365 const struct rpc_timeout *timeout;
366 const char *nodename = args->nodename;
368 int err; 367 int err;
369 368
370 /* sanity check the name before trying to print it */ 369 /* sanity check the name before trying to print it */
@@ -420,8 +419,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
420 419
421 atomic_set(&clnt->cl_count, 1); 420 atomic_set(&clnt->cl_count, 1);
422 421
422 if (nodename == NULL)
423 nodename = utsname()->nodename;
423 /* save the nodename */ 424 /* save the nodename */
424 rpc_clnt_set_nodename(clnt, utsname()->nodename); 425 rpc_clnt_set_nodename(clnt, nodename);
425 426
426 err = rpc_client_register(clnt, args->authflavor, args->client_name); 427 err = rpc_client_register(clnt, args->authflavor, args->client_name);
427 if (err) 428 if (err)
@@ -576,6 +577,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
576 if (xprt == NULL) 577 if (xprt == NULL)
577 goto out_err; 578 goto out_err;
578 args->servername = xprt->servername; 579 args->servername = xprt->servername;
580 args->nodename = clnt->cl_nodename;
579 581
580 new = rpc_new_client(args, xprt, clnt); 582 new = rpc_new_client(args, xprt, clnt);
581 if (IS_ERR(new)) { 583 if (IS_ERR(new)) {
@@ -1824,6 +1826,7 @@ call_connect_status(struct rpc_task *task)
1824 case -ECONNABORTED: 1826 case -ECONNABORTED:
1825 case -ENETUNREACH: 1827 case -ENETUNREACH:
1826 case -EHOSTUNREACH: 1828 case -EHOSTUNREACH:
1829 case -EADDRINUSE:
1827 case -ENOBUFS: 1830 case -ENOBUFS:
1828 case -EPIPE: 1831 case -EPIPE:
1829 if (RPC_IS_SOFTCONN(task)) 1832 if (RPC_IS_SOFTCONN(task))
@@ -1932,6 +1935,7 @@ call_transmit_status(struct rpc_task *task)
1932 } 1935 }
1933 case -ECONNRESET: 1936 case -ECONNRESET:
1934 case -ECONNABORTED: 1937 case -ECONNABORTED:
1938 case -EADDRINUSE:
1935 case -ENOTCONN: 1939 case -ENOTCONN:
1936 case -ENOBUFS: 1940 case -ENOBUFS:
1937 case -EPIPE: 1941 case -EPIPE:
@@ -2051,6 +2055,7 @@ call_status(struct rpc_task *task)
2051 case -ECONNRESET: 2055 case -ECONNRESET:
2052 case -ECONNABORTED: 2056 case -ECONNABORTED:
2053 rpc_force_rebind(clnt); 2057 rpc_force_rebind(clnt);
2058 case -EADDRINUSE:
2054 case -ENOBUFS: 2059 case -ENOBUFS:
2055 rpc_delay(task, 3*HZ); 2060 rpc_delay(task, 3*HZ);
2056 case -EPIPE: 2061 case -EPIPE:
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 05202012bcfc..cf5770d8f49a 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -355,7 +355,8 @@ out:
355 return result; 355 return result;
356} 356}
357 357
358static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname, 358static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename,
359 const char *hostname,
359 struct sockaddr *srvaddr, size_t salen, 360 struct sockaddr *srvaddr, size_t salen,
360 int proto, u32 version) 361 int proto, u32 version)
361{ 362{
@@ -365,6 +366,7 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname,
365 .address = srvaddr, 366 .address = srvaddr,
366 .addrsize = salen, 367 .addrsize = salen,
367 .servername = hostname, 368 .servername = hostname,
369 .nodename = nodename,
368 .program = &rpcb_program, 370 .program = &rpcb_program,
369 .version = version, 371 .version = version,
370 .authflavor = RPC_AUTH_UNIX, 372 .authflavor = RPC_AUTH_UNIX,
@@ -740,7 +742,9 @@ void rpcb_getport_async(struct rpc_task *task)
740 dprintk("RPC: %5u %s: trying rpcbind version %u\n", 742 dprintk("RPC: %5u %s: trying rpcbind version %u\n",
741 task->tk_pid, __func__, bind_version); 743 task->tk_pid, __func__, bind_version);
742 744
743 rpcb_clnt = rpcb_create(xprt->xprt_net, xprt->servername, sap, salen, 745 rpcb_clnt = rpcb_create(xprt->xprt_net,
746 clnt->cl_nodename,
747 xprt->servername, sap, salen,
744 xprt->prot, bind_version); 748 xprt->prot, bind_version);
745 if (IS_ERR(rpcb_clnt)) { 749 if (IS_ERR(rpcb_clnt)) {
746 status = PTR_ERR(rpcb_clnt); 750 status = PTR_ERR(rpcb_clnt);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index d20f2329eea3..b91fd9c597b4 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -844,10 +844,10 @@ static void rpc_async_schedule(struct work_struct *work)
844void *rpc_malloc(struct rpc_task *task, size_t size) 844void *rpc_malloc(struct rpc_task *task, size_t size)
845{ 845{
846 struct rpc_buffer *buf; 846 struct rpc_buffer *buf;
847 gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; 847 gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
848 848
849 if (RPC_IS_SWAPPER(task)) 849 if (RPC_IS_SWAPPER(task))
850 gfp |= __GFP_MEMALLOC; 850 gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
851 851
852 size += sizeof(struct rpc_buffer); 852 size += sizeof(struct rpc_buffer);
853 if (size <= RPC_BUFFER_MAXSIZE) 853 if (size <= RPC_BUFFER_MAXSIZE)
@@ -1069,7 +1069,8 @@ static int rpciod_start(void)
1069 * Create the rpciod thread and wait for it to start. 1069 * Create the rpciod thread and wait for it to start.
1070 */ 1070 */
1071 dprintk("RPC: creating workqueue rpciod\n"); 1071 dprintk("RPC: creating workqueue rpciod\n");
1072 wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 1); 1072 /* Note: highpri because network receive is latency sensitive */
1073 wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1073 rpciod_workqueue = wq; 1074 rpciod_workqueue = wq;
1074 return rpciod_workqueue != NULL; 1075 return rpciod_workqueue != NULL;
1075} 1076}
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 9711a155bc50..2ecb994314c1 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -140,22 +140,20 @@ void rpc_free_iostats(struct rpc_iostats *stats)
140EXPORT_SYMBOL_GPL(rpc_free_iostats); 140EXPORT_SYMBOL_GPL(rpc_free_iostats);
141 141
142/** 142/**
143 * rpc_count_iostats - tally up per-task stats 143 * rpc_count_iostats_metrics - tally up per-task stats
144 * @task: completed rpc_task 144 * @task: completed rpc_task
145 * @stats: array of stat structures 145 * @op_metrics: stat structure for OP that will accumulate stats from @task
146 */ 146 */
147void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) 147void rpc_count_iostats_metrics(const struct rpc_task *task,
148 struct rpc_iostats *op_metrics)
148{ 149{
149 struct rpc_rqst *req = task->tk_rqstp; 150 struct rpc_rqst *req = task->tk_rqstp;
150 struct rpc_iostats *op_metrics;
151 ktime_t delta, now; 151 ktime_t delta, now;
152 152
153 if (!stats || !req) 153 if (!op_metrics || !req)
154 return; 154 return;
155 155
156 now = ktime_get(); 156 now = ktime_get();
157 op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx];
158
159 spin_lock(&op_metrics->om_lock); 157 spin_lock(&op_metrics->om_lock);
160 158
161 op_metrics->om_ops++; 159 op_metrics->om_ops++;
@@ -175,6 +173,20 @@ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats)
175 173
176 spin_unlock(&op_metrics->om_lock); 174 spin_unlock(&op_metrics->om_lock);
177} 175}
176EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics);
177
178/**
179 * rpc_count_iostats - tally up per-task stats
180 * @task: completed rpc_task
181 * @stats: array of stat structures
182 *
183 * Uses the statidx from @task
184 */
185void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats)
186{
187 rpc_count_iostats_metrics(task,
188 &stats[task->tk_msg.rpc_proc->p_statidx]);
189}
178EXPORT_SYMBOL_GPL(rpc_count_iostats); 190EXPORT_SYMBOL_GPL(rpc_count_iostats);
179 191
180static void _print_name(struct seq_file *seq, unsigned int op, 192static void _print_name(struct seq_file *seq, unsigned int op,
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ebbefad21a37..e3015aede0d9 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -683,13 +683,43 @@ xprt_init_autodisconnect(unsigned long data)
683 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) 683 if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
684 goto out_abort; 684 goto out_abort;
685 spin_unlock(&xprt->transport_lock); 685 spin_unlock(&xprt->transport_lock);
686 set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
687 queue_work(rpciod_workqueue, &xprt->task_cleanup); 686 queue_work(rpciod_workqueue, &xprt->task_cleanup);
688 return; 687 return;
689out_abort: 688out_abort:
690 spin_unlock(&xprt->transport_lock); 689 spin_unlock(&xprt->transport_lock);
691} 690}
692 691
692bool xprt_lock_connect(struct rpc_xprt *xprt,
693 struct rpc_task *task,
694 void *cookie)
695{
696 bool ret = false;
697
698 spin_lock_bh(&xprt->transport_lock);
699 if (!test_bit(XPRT_LOCKED, &xprt->state))
700 goto out;
701 if (xprt->snd_task != task)
702 goto out;
703 xprt->snd_task = cookie;
704 ret = true;
705out:
706 spin_unlock_bh(&xprt->transport_lock);
707 return ret;
708}
709
710void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
711{
712 spin_lock_bh(&xprt->transport_lock);
713 if (xprt->snd_task != cookie)
714 goto out;
715 if (!test_bit(XPRT_LOCKED, &xprt->state))
716 goto out;
717 xprt->snd_task =NULL;
718 xprt->ops->release_xprt(xprt, NULL);
719out:
720 spin_unlock_bh(&xprt->transport_lock);
721}
722
693/** 723/**
694 * xprt_connect - schedule a transport connect operation 724 * xprt_connect - schedule a transport connect operation
695 * @task: RPC task that is requesting the connect 725 * @task: RPC task that is requesting the connect
@@ -712,9 +742,7 @@ void xprt_connect(struct rpc_task *task)
712 if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) 742 if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state))
713 xprt->ops->close(xprt); 743 xprt->ops->close(xprt);
714 744
715 if (xprt_connected(xprt)) 745 if (!xprt_connected(xprt)) {
716 xprt_release_write(xprt, task);
717 else {
718 task->tk_rqstp->rq_bytes_sent = 0; 746 task->tk_rqstp->rq_bytes_sent = 0;
719 task->tk_timeout = task->tk_rqstp->rq_timeout; 747 task->tk_timeout = task->tk_rqstp->rq_timeout;
720 rpc_sleep_on(&xprt->pending, task, xprt_connect_status); 748 rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
@@ -726,6 +754,7 @@ void xprt_connect(struct rpc_task *task)
726 xprt->stat.connect_start = jiffies; 754 xprt->stat.connect_start = jiffies;
727 xprt->ops->connect(xprt, task); 755 xprt->ops->connect(xprt, task);
728 } 756 }
757 xprt_release_write(xprt, task);
729} 758}
730 759
731static void xprt_connect_status(struct rpc_task *task) 760static void xprt_connect_status(struct rpc_task *task)
@@ -758,7 +787,6 @@ static void xprt_connect_status(struct rpc_task *task)
758 dprintk("RPC: %5u xprt_connect_status: error %d connecting to " 787 dprintk("RPC: %5u xprt_connect_status: error %d connecting to "
759 "server %s\n", task->tk_pid, -task->tk_status, 788 "server %s\n", task->tk_pid, -task->tk_status,
760 xprt->servername); 789 xprt->servername);
761 xprt_release_write(xprt, task);
762 task->tk_status = -EIO; 790 task->tk_status = -EIO;
763 } 791 }
764} 792}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index df01d124936c..7e9acd9361c5 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -209,9 +209,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
209 if (cur_rchunk) { /* read */ 209 if (cur_rchunk) { /* read */
210 cur_rchunk->rc_discrim = xdr_one; 210 cur_rchunk->rc_discrim = xdr_one;
211 /* all read chunks have the same "position" */ 211 /* all read chunks have the same "position" */
212 cur_rchunk->rc_position = htonl(pos); 212 cur_rchunk->rc_position = cpu_to_be32(pos);
213 cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); 213 cur_rchunk->rc_target.rs_handle =
214 cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); 214 cpu_to_be32(seg->mr_rkey);
215 cur_rchunk->rc_target.rs_length =
216 cpu_to_be32(seg->mr_len);
215 xdr_encode_hyper( 217 xdr_encode_hyper(
216 (__be32 *)&cur_rchunk->rc_target.rs_offset, 218 (__be32 *)&cur_rchunk->rc_target.rs_offset,
217 seg->mr_base); 219 seg->mr_base);
@@ -222,8 +224,10 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
222 cur_rchunk++; 224 cur_rchunk++;
223 r_xprt->rx_stats.read_chunk_count++; 225 r_xprt->rx_stats.read_chunk_count++;
224 } else { /* write/reply */ 226 } else { /* write/reply */
225 cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); 227 cur_wchunk->wc_target.rs_handle =
226 cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); 228 cpu_to_be32(seg->mr_rkey);
229 cur_wchunk->wc_target.rs_length =
230 cpu_to_be32(seg->mr_len);
227 xdr_encode_hyper( 231 xdr_encode_hyper(
228 (__be32 *)&cur_wchunk->wc_target.rs_offset, 232 (__be32 *)&cur_wchunk->wc_target.rs_offset,
229 seg->mr_base); 233 seg->mr_base);
@@ -257,7 +261,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
257 *iptr++ = xdr_zero; /* encode a NULL reply chunk */ 261 *iptr++ = xdr_zero; /* encode a NULL reply chunk */
258 } else { 262 } else {
259 warray->wc_discrim = xdr_one; 263 warray->wc_discrim = xdr_one;
260 warray->wc_nchunks = htonl(nchunks); 264 warray->wc_nchunks = cpu_to_be32(nchunks);
261 iptr = (__be32 *) cur_wchunk; 265 iptr = (__be32 *) cur_wchunk;
262 if (type == rpcrdma_writech) { 266 if (type == rpcrdma_writech) {
263 *iptr++ = xdr_zero; /* finish the write chunk list */ 267 *iptr++ = xdr_zero; /* finish the write chunk list */
@@ -290,7 +294,7 @@ ssize_t
290rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) 294rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
291{ 295{
292 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 296 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
293 struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; 297 struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
294 298
295 if (req->rl_rtype != rpcrdma_noch) 299 if (req->rl_rtype != rpcrdma_noch)
296 result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, 300 result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
@@ -402,13 +406,12 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
402 base = rqst->rq_svec[0].iov_base; 406 base = rqst->rq_svec[0].iov_base;
403 rpclen = rqst->rq_svec[0].iov_len; 407 rpclen = rqst->rq_svec[0].iov_len;
404 408
405 /* build RDMA header in private area at front */ 409 headerp = rdmab_to_msg(req->rl_rdmabuf);
406 headerp = (struct rpcrdma_msg *) req->rl_base; 410 /* don't byte-swap XID, it's already done in request */
407 /* don't htonl XID, it's already done in request */
408 headerp->rm_xid = rqst->rq_xid; 411 headerp->rm_xid = rqst->rq_xid;
409 headerp->rm_vers = xdr_one; 412 headerp->rm_vers = rpcrdma_version;
410 headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); 413 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
411 headerp->rm_type = htonl(RDMA_MSG); 414 headerp->rm_type = rdma_msg;
412 415
413 /* 416 /*
414 * Chunks needed for results? 417 * Chunks needed for results?
@@ -468,7 +471,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
468 return -EIO; 471 return -EIO;
469 } 472 }
470 473
471 hdrlen = 28; /*sizeof *headerp;*/ 474 hdrlen = RPCRDMA_HDRLEN_MIN;
472 padlen = 0; 475 padlen = 0;
473 476
474 /* 477 /*
@@ -482,11 +485,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
482 RPCRDMA_INLINE_PAD_VALUE(rqst)); 485 RPCRDMA_INLINE_PAD_VALUE(rqst));
483 486
484 if (padlen) { 487 if (padlen) {
485 headerp->rm_type = htonl(RDMA_MSGP); 488 headerp->rm_type = rdma_msgp;
486 headerp->rm_body.rm_padded.rm_align = 489 headerp->rm_body.rm_padded.rm_align =
487 htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); 490 cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
488 headerp->rm_body.rm_padded.rm_thresh = 491 headerp->rm_body.rm_padded.rm_thresh =
489 htonl(RPCRDMA_INLINE_PAD_THRESH); 492 cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
490 headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; 493 headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
491 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 494 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
492 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 495 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
@@ -524,7 +527,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
524 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 527 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
525 " headerp 0x%p base 0x%p lkey 0x%x\n", 528 " headerp 0x%p base 0x%p lkey 0x%x\n",
526 __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, 529 __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
527 headerp, base, req->rl_iov.lkey); 530 headerp, base, rdmab_lkey(req->rl_rdmabuf));
528 531
529 /* 532 /*
530 * initialize send_iov's - normally only two: rdma chunk header and 533 * initialize send_iov's - normally only two: rdma chunk header and
@@ -533,26 +536,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
533 * header and any write data. In all non-rdma cases, any following 536 * header and any write data. In all non-rdma cases, any following
534 * data has been copied into the RPC header buffer. 537 * data has been copied into the RPC header buffer.
535 */ 538 */
536 req->rl_send_iov[0].addr = req->rl_iov.addr; 539 req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
537 req->rl_send_iov[0].length = hdrlen; 540 req->rl_send_iov[0].length = hdrlen;
538 req->rl_send_iov[0].lkey = req->rl_iov.lkey; 541 req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
539 542
540 req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); 543 req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
541 req->rl_send_iov[1].length = rpclen; 544 req->rl_send_iov[1].length = rpclen;
542 req->rl_send_iov[1].lkey = req->rl_iov.lkey; 545 req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
543 546
544 req->rl_niovs = 2; 547 req->rl_niovs = 2;
545 548
546 if (padlen) { 549 if (padlen) {
547 struct rpcrdma_ep *ep = &r_xprt->rx_ep; 550 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
548 551
549 req->rl_send_iov[2].addr = ep->rep_pad.addr; 552 req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
550 req->rl_send_iov[2].length = padlen; 553 req->rl_send_iov[2].length = padlen;
551 req->rl_send_iov[2].lkey = ep->rep_pad.lkey; 554 req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
552 555
553 req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; 556 req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
554 req->rl_send_iov[3].length = rqst->rq_slen - rpclen; 557 req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
555 req->rl_send_iov[3].lkey = req->rl_iov.lkey; 558 req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
556 559
557 req->rl_niovs = 4; 560 req->rl_niovs = 4;
558 } 561 }
@@ -569,8 +572,9 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
569{ 572{
570 unsigned int i, total_len; 573 unsigned int i, total_len;
571 struct rpcrdma_write_chunk *cur_wchunk; 574 struct rpcrdma_write_chunk *cur_wchunk;
575 char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
572 576
573 i = ntohl(**iptrp); /* get array count */ 577 i = be32_to_cpu(**iptrp);
574 if (i > max) 578 if (i > max)
575 return -1; 579 return -1;
576 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); 580 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
@@ -582,11 +586,11 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
582 xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); 586 xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
583 dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", 587 dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
584 __func__, 588 __func__,
585 ntohl(seg->rs_length), 589 be32_to_cpu(seg->rs_length),
586 (unsigned long long)off, 590 (unsigned long long)off,
587 ntohl(seg->rs_handle)); 591 be32_to_cpu(seg->rs_handle));
588 } 592 }
589 total_len += ntohl(seg->rs_length); 593 total_len += be32_to_cpu(seg->rs_length);
590 ++cur_wchunk; 594 ++cur_wchunk;
591 } 595 }
592 /* check and adjust for properly terminated write chunk */ 596 /* check and adjust for properly terminated write chunk */
@@ -596,7 +600,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
596 return -1; 600 return -1;
597 cur_wchunk = (struct rpcrdma_write_chunk *) w; 601 cur_wchunk = (struct rpcrdma_write_chunk *) w;
598 } 602 }
599 if ((char *) cur_wchunk > rep->rr_base + rep->rr_len) 603 if ((char *)cur_wchunk > base + rep->rr_len)
600 return -1; 604 return -1;
601 605
602 *iptrp = (__be32 *) cur_wchunk; 606 *iptrp = (__be32 *) cur_wchunk;
@@ -691,7 +695,9 @@ rpcrdma_connect_worker(struct work_struct *work)
691{ 695{
692 struct rpcrdma_ep *ep = 696 struct rpcrdma_ep *ep =
693 container_of(work, struct rpcrdma_ep, rep_connect_worker.work); 697 container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
694 struct rpc_xprt *xprt = ep->rep_xprt; 698 struct rpcrdma_xprt *r_xprt =
699 container_of(ep, struct rpcrdma_xprt, rx_ep);
700 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
695 701
696 spin_lock_bh(&xprt->transport_lock); 702 spin_lock_bh(&xprt->transport_lock);
697 if (++xprt->connect_cookie == 0) /* maintain a reserved value */ 703 if (++xprt->connect_cookie == 0) /* maintain a reserved value */
@@ -732,7 +738,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
732 struct rpc_xprt *xprt = rep->rr_xprt; 738 struct rpc_xprt *xprt = rep->rr_xprt;
733 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 739 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
734 __be32 *iptr; 740 __be32 *iptr;
735 int rdmalen, status; 741 int credits, rdmalen, status;
736 unsigned long cwnd; 742 unsigned long cwnd;
737 743
738 /* Check status. If bad, signal disconnect and return rep to pool */ 744 /* Check status. If bad, signal disconnect and return rep to pool */
@@ -744,14 +750,14 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
744 } 750 }
745 return; 751 return;
746 } 752 }
747 if (rep->rr_len < 28) { 753 if (rep->rr_len < RPCRDMA_HDRLEN_MIN) {
748 dprintk("RPC: %s: short/invalid reply\n", __func__); 754 dprintk("RPC: %s: short/invalid reply\n", __func__);
749 goto repost; 755 goto repost;
750 } 756 }
751 headerp = (struct rpcrdma_msg *) rep->rr_base; 757 headerp = rdmab_to_msg(rep->rr_rdmabuf);
752 if (headerp->rm_vers != xdr_one) { 758 if (headerp->rm_vers != rpcrdma_version) {
753 dprintk("RPC: %s: invalid version %d\n", 759 dprintk("RPC: %s: invalid version %d\n",
754 __func__, ntohl(headerp->rm_vers)); 760 __func__, be32_to_cpu(headerp->rm_vers));
755 goto repost; 761 goto repost;
756 } 762 }
757 763
@@ -762,7 +768,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
762 spin_unlock(&xprt->transport_lock); 768 spin_unlock(&xprt->transport_lock);
763 dprintk("RPC: %s: reply 0x%p failed " 769 dprintk("RPC: %s: reply 0x%p failed "
764 "to match any request xid 0x%08x len %d\n", 770 "to match any request xid 0x%08x len %d\n",
765 __func__, rep, headerp->rm_xid, rep->rr_len); 771 __func__, rep, be32_to_cpu(headerp->rm_xid),
772 rep->rr_len);
766repost: 773repost:
767 r_xprt->rx_stats.bad_reply_count++; 774 r_xprt->rx_stats.bad_reply_count++;
768 rep->rr_func = rpcrdma_reply_handler; 775 rep->rr_func = rpcrdma_reply_handler;
@@ -778,13 +785,14 @@ repost:
778 spin_unlock(&xprt->transport_lock); 785 spin_unlock(&xprt->transport_lock);
779 dprintk("RPC: %s: duplicate reply 0x%p to RPC " 786 dprintk("RPC: %s: duplicate reply 0x%p to RPC "
780 "request 0x%p: xid 0x%08x\n", __func__, rep, req, 787 "request 0x%p: xid 0x%08x\n", __func__, rep, req,
781 headerp->rm_xid); 788 be32_to_cpu(headerp->rm_xid));
782 goto repost; 789 goto repost;
783 } 790 }
784 791
785 dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" 792 dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
786 " RPC request 0x%p xid 0x%08x\n", 793 " RPC request 0x%p xid 0x%08x\n",
787 __func__, rep, req, rqst, headerp->rm_xid); 794 __func__, rep, req, rqst,
795 be32_to_cpu(headerp->rm_xid));
788 796
789 /* from here on, the reply is no longer an orphan */ 797 /* from here on, the reply is no longer an orphan */
790 req->rl_reply = rep; 798 req->rl_reply = rep;
@@ -793,7 +801,7 @@ repost:
793 /* check for expected message types */ 801 /* check for expected message types */
794 /* The order of some of these tests is important. */ 802 /* The order of some of these tests is important. */
795 switch (headerp->rm_type) { 803 switch (headerp->rm_type) {
796 case htonl(RDMA_MSG): 804 case rdma_msg:
797 /* never expect read chunks */ 805 /* never expect read chunks */
798 /* never expect reply chunks (two ways to check) */ 806 /* never expect reply chunks (two ways to check) */
799 /* never expect write chunks without having offered RDMA */ 807 /* never expect write chunks without having offered RDMA */
@@ -824,22 +832,24 @@ repost:
824 } else { 832 } else {
825 /* else ordinary inline */ 833 /* else ordinary inline */
826 rdmalen = 0; 834 rdmalen = 0;
827 iptr = (__be32 *)((unsigned char *)headerp + 28); 835 iptr = (__be32 *)((unsigned char *)headerp +
828 rep->rr_len -= 28; /*sizeof *headerp;*/ 836 RPCRDMA_HDRLEN_MIN);
837 rep->rr_len -= RPCRDMA_HDRLEN_MIN;
829 status = rep->rr_len; 838 status = rep->rr_len;
830 } 839 }
831 /* Fix up the rpc results for upper layer */ 840 /* Fix up the rpc results for upper layer */
832 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); 841 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
833 break; 842 break;
834 843
835 case htonl(RDMA_NOMSG): 844 case rdma_nomsg:
836 /* never expect read or write chunks, always reply chunks */ 845 /* never expect read or write chunks, always reply chunks */
837 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 846 if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
838 headerp->rm_body.rm_chunks[1] != xdr_zero || 847 headerp->rm_body.rm_chunks[1] != xdr_zero ||
839 headerp->rm_body.rm_chunks[2] != xdr_one || 848 headerp->rm_body.rm_chunks[2] != xdr_one ||
840 req->rl_nchunks == 0) 849 req->rl_nchunks == 0)
841 goto badheader; 850 goto badheader;
842 iptr = (__be32 *)((unsigned char *)headerp + 28); 851 iptr = (__be32 *)((unsigned char *)headerp +
852 RPCRDMA_HDRLEN_MIN);
843 rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); 853 rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
844 if (rdmalen < 0) 854 if (rdmalen < 0)
845 goto badheader; 855 goto badheader;
@@ -853,7 +863,7 @@ badheader:
853 dprintk("%s: invalid rpcrdma reply header (type %d):" 863 dprintk("%s: invalid rpcrdma reply header (type %d):"
854 " chunks[012] == %d %d %d" 864 " chunks[012] == %d %d %d"
855 " expected chunks <= %d\n", 865 " expected chunks <= %d\n",
856 __func__, ntohl(headerp->rm_type), 866 __func__, be32_to_cpu(headerp->rm_type),
857 headerp->rm_body.rm_chunks[0], 867 headerp->rm_body.rm_chunks[0],
858 headerp->rm_body.rm_chunks[1], 868 headerp->rm_body.rm_chunks[1],
859 headerp->rm_body.rm_chunks[2], 869 headerp->rm_body.rm_chunks[2],
@@ -863,8 +873,14 @@ badheader:
863 break; 873 break;
864 } 874 }
865 875
876 credits = be32_to_cpu(headerp->rm_credit);
877 if (credits == 0)
878 credits = 1; /* don't deadlock */
879 else if (credits > r_xprt->rx_buf.rb_max_requests)
880 credits = r_xprt->rx_buf.rb_max_requests;
881
866 cwnd = xprt->cwnd; 882 cwnd = xprt->cwnd;
867 xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; 883 xprt->cwnd = credits << RPC_CWNDSHIFT;
868 if (xprt->cwnd > cwnd) 884 if (xprt->cwnd > cwnd)
869 xprt_release_rqst_cong(rqst->rq_task); 885 xprt_release_rqst_cong(rqst->rq_task);
870 886
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index bbd6155d3e34..2e192baa59f3 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -200,9 +200,9 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt)
200static void 200static void
201xprt_rdma_connect_worker(struct work_struct *work) 201xprt_rdma_connect_worker(struct work_struct *work)
202{ 202{
203 struct rpcrdma_xprt *r_xprt = 203 struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
204 container_of(work, struct rpcrdma_xprt, rdma_connect.work); 204 rx_connect_worker.work);
205 struct rpc_xprt *xprt = &r_xprt->xprt; 205 struct rpc_xprt *xprt = &r_xprt->rx_xprt;
206 int rc = 0; 206 int rc = 0;
207 207
208 xprt_clear_connected(xprt); 208 xprt_clear_connected(xprt);
@@ -235,7 +235,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
235 235
236 dprintk("RPC: %s: called\n", __func__); 236 dprintk("RPC: %s: called\n", __func__);
237 237
238 cancel_delayed_work_sync(&r_xprt->rdma_connect); 238 cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
239 239
240 xprt_clear_connected(xprt); 240 xprt_clear_connected(xprt);
241 241
@@ -364,8 +364,7 @@ xprt_setup_rdma(struct xprt_create *args)
364 * any inline data. Also specify any padding which will be provided 364 * any inline data. Also specify any padding which will be provided
365 * from a preregistered zero buffer. 365 * from a preregistered zero buffer.
366 */ 366 */
367 rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia, 367 rc = rpcrdma_buffer_create(new_xprt);
368 &new_xprt->rx_data);
369 if (rc) 368 if (rc)
370 goto out3; 369 goto out3;
371 370
@@ -374,9 +373,8 @@ xprt_setup_rdma(struct xprt_create *args)
374 * connection loss notification is async. We also catch connection loss 373 * connection loss notification is async. We also catch connection loss
375 * when reaping receives. 374 * when reaping receives.
376 */ 375 */
377 INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker); 376 INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
378 new_ep->rep_func = rpcrdma_conn_func; 377 xprt_rdma_connect_worker);
379 new_ep->rep_xprt = xprt;
380 378
381 xprt_rdma_format_addresses(xprt); 379 xprt_rdma_format_addresses(xprt);
382 xprt->max_payload = rpcrdma_max_payload(new_xprt); 380 xprt->max_payload = rpcrdma_max_payload(new_xprt);
@@ -434,94 +432,101 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
434 432
435 if (r_xprt->rx_ep.rep_connected != 0) { 433 if (r_xprt->rx_ep.rep_connected != 0) {
436 /* Reconnect */ 434 /* Reconnect */
437 schedule_delayed_work(&r_xprt->rdma_connect, 435 schedule_delayed_work(&r_xprt->rx_connect_worker,
438 xprt->reestablish_timeout); 436 xprt->reestablish_timeout);
439 xprt->reestablish_timeout <<= 1; 437 xprt->reestablish_timeout <<= 1;
440 if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) 438 if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
441 xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; 439 xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
442 else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) 440 else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
443 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; 441 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
444 } else { 442 } else {
445 schedule_delayed_work(&r_xprt->rdma_connect, 0); 443 schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
446 if (!RPC_IS_ASYNC(task)) 444 if (!RPC_IS_ASYNC(task))
447 flush_delayed_work(&r_xprt->rdma_connect); 445 flush_delayed_work(&r_xprt->rx_connect_worker);
448 } 446 }
449} 447}
450 448
451/* 449/*
452 * The RDMA allocate/free functions need the task structure as a place 450 * The RDMA allocate/free functions need the task structure as a place
453 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv 451 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
454 * sequence. For this reason, the recv buffers are attached to send 452 * sequence.
455 * buffers for portions of the RPC. Note that the RPC layer allocates 453 *
456 * both send and receive buffers in the same call. We may register 454 * The RPC layer allocates both send and receive buffers in the same call
457 * the receive buffer portion when using reply chunks. 455 * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
456 * We may register rq_rcv_buf when using reply chunks.
458 */ 457 */
459static void * 458static void *
460xprt_rdma_allocate(struct rpc_task *task, size_t size) 459xprt_rdma_allocate(struct rpc_task *task, size_t size)
461{ 460{
462 struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; 461 struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
463 struct rpcrdma_req *req, *nreq; 462 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
463 struct rpcrdma_regbuf *rb;
464 struct rpcrdma_req *req;
465 size_t min_size;
466 gfp_t flags;
464 467
465 req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); 468 req = rpcrdma_buffer_get(&r_xprt->rx_buf);
466 if (req == NULL) 469 if (req == NULL)
467 return NULL; 470 return NULL;
468 471
469 if (size > req->rl_size) { 472 flags = GFP_NOIO | __GFP_NOWARN;
470 dprintk("RPC: %s: size %zd too large for buffer[%zd]: " 473 if (RPC_IS_SWAPPER(task))
471 "prog %d vers %d proc %d\n", 474 flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
472 __func__, size, req->rl_size, 475
473 task->tk_client->cl_prog, task->tk_client->cl_vers, 476 if (req->rl_rdmabuf == NULL)
474 task->tk_msg.rpc_proc->p_proc); 477 goto out_rdmabuf;
475 /* 478 if (req->rl_sendbuf == NULL)
476 * Outgoing length shortage. Our inline write max must have 479 goto out_sendbuf;
477 * been configured to perform direct i/o. 480 if (size > req->rl_sendbuf->rg_size)
478 * 481 goto out_sendbuf;
479 * This is therefore a large metadata operation, and the 482
480 * allocate call was made on the maximum possible message, 483out:
481 * e.g. containing long filename(s) or symlink data. In
482 * fact, while these metadata operations *might* carry
483 * large outgoing payloads, they rarely *do*. However, we
484 * have to commit to the request here, so reallocate and
485 * register it now. The data path will never require this
486 * reallocation.
487 *
488 * If the allocation or registration fails, the RPC framework
489 * will (doggedly) retry.
490 */
491 if (task->tk_flags & RPC_TASK_SWAPPER)
492 nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
493 else
494 nreq = kmalloc(sizeof *req + size, GFP_NOFS);
495 if (nreq == NULL)
496 goto outfail;
497
498 if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
499 nreq->rl_base, size + sizeof(struct rpcrdma_req)
500 - offsetof(struct rpcrdma_req, rl_base),
501 &nreq->rl_handle, &nreq->rl_iov)) {
502 kfree(nreq);
503 goto outfail;
504 }
505 rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
506 nreq->rl_size = size;
507 nreq->rl_niovs = 0;
508 nreq->rl_nchunks = 0;
509 nreq->rl_buffer = (struct rpcrdma_buffer *)req;
510 nreq->rl_reply = req->rl_reply;
511 memcpy(nreq->rl_segments,
512 req->rl_segments, sizeof nreq->rl_segments);
513 /* flag the swap with an unused field */
514 nreq->rl_iov.length = 0;
515 req->rl_reply = NULL;
516 req = nreq;
517 }
518 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); 484 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
519 req->rl_connect_cookie = 0; /* our reserved value */ 485 req->rl_connect_cookie = 0; /* our reserved value */
520 return req->rl_xdr_buf; 486 return req->rl_sendbuf->rg_base;
521 487
522outfail: 488out_rdmabuf:
489 min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
490 rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
491 if (IS_ERR(rb))
492 goto out_fail;
493 req->rl_rdmabuf = rb;
494
495out_sendbuf:
496 /* XDR encoding and RPC/RDMA marshaling of this request has not
497 * yet occurred. Thus a lower bound is needed to prevent buffer
498 * overrun during marshaling.
499 *
500 * RPC/RDMA marshaling may choose to send payload bearing ops
501 * inline, if the result is smaller than the inline threshold.
502 * The value of the "size" argument accounts for header
503 * requirements but not for the payload in these cases.
504 *
505 * Likewise, allocate enough space to receive a reply up to the
506 * size of the inline threshold.
507 *
508 * It's unlikely that both the send header and the received
509 * reply will be large, but slush is provided here to allow
510 * flexibility when marshaling.
511 */
512 min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
513 min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
514 if (size < min_size)
515 size = min_size;
516
517 rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
518 if (IS_ERR(rb))
519 goto out_fail;
520 rb->rg_owner = req;
521
522 r_xprt->rx_stats.hardway_register_count += size;
523 rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
524 req->rl_sendbuf = rb;
525 goto out;
526
527out_fail:
523 rpcrdma_buffer_put(req); 528 rpcrdma_buffer_put(req);
524 rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; 529 r_xprt->rx_stats.failed_marshal_count++;
525 return NULL; 530 return NULL;
526} 531}
527 532
@@ -533,47 +538,24 @@ xprt_rdma_free(void *buffer)
533{ 538{
534 struct rpcrdma_req *req; 539 struct rpcrdma_req *req;
535 struct rpcrdma_xprt *r_xprt; 540 struct rpcrdma_xprt *r_xprt;
536 struct rpcrdma_rep *rep; 541 struct rpcrdma_regbuf *rb;
537 int i; 542 int i;
538 543
539 if (buffer == NULL) 544 if (buffer == NULL)
540 return; 545 return;
541 546
542 req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]); 547 rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
543 if (req->rl_iov.length == 0) { /* see allocate above */ 548 req = rb->rg_owner;
544 r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer, 549 r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
545 struct rpcrdma_xprt, rx_buf);
546 } else
547 r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
548 rep = req->rl_reply;
549 550
550 dprintk("RPC: %s: called on 0x%p%s\n", 551 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
551 __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
552 552
553 /*
554 * Finish the deregistration. The process is considered
555 * complete when the rr_func vector becomes NULL - this
556 * was put in place during rpcrdma_reply_handler() - the wait
557 * call below will not block if the dereg is "done". If
558 * interrupted, our framework will clean up.
559 */
560 for (i = 0; req->rl_nchunks;) { 553 for (i = 0; req->rl_nchunks;) {
561 --req->rl_nchunks; 554 --req->rl_nchunks;
562 i += rpcrdma_deregister_external( 555 i += rpcrdma_deregister_external(
563 &req->rl_segments[i], r_xprt); 556 &req->rl_segments[i], r_xprt);
564 } 557 }
565 558
566 if (req->rl_iov.length == 0) { /* see allocate above */
567 struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
568 oreq->rl_reply = req->rl_reply;
569 (void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
570 req->rl_handle,
571 &req->rl_iov);
572 kfree(req);
573 req = oreq;
574 }
575
576 /* Put back request+reply buffers */
577 rpcrdma_buffer_put(req); 559 rpcrdma_buffer_put(req);
578} 560}
579 561
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index c98e40643910..124676c13780 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -49,6 +49,7 @@
49 49
50#include <linux/interrupt.h> 50#include <linux/interrupt.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/prefetch.h>
52#include <asm/bitops.h> 53#include <asm/bitops.h>
53 54
54#include "xprt_rdma.h" 55#include "xprt_rdma.h"
@@ -153,7 +154,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
153 event->device->name, context); 154 event->device->name, context);
154 if (ep->rep_connected == 1) { 155 if (ep->rep_connected == 1) {
155 ep->rep_connected = -EIO; 156 ep->rep_connected = -EIO;
156 ep->rep_func(ep); 157 rpcrdma_conn_func(ep);
157 wake_up_all(&ep->rep_connect_wait); 158 wake_up_all(&ep->rep_connect_wait);
158 } 159 }
159} 160}
@@ -168,23 +169,59 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
168 event->device->name, context); 169 event->device->name, context);
169 if (ep->rep_connected == 1) { 170 if (ep->rep_connected == 1) {
170 ep->rep_connected = -EIO; 171 ep->rep_connected = -EIO;
171 ep->rep_func(ep); 172 rpcrdma_conn_func(ep);
172 wake_up_all(&ep->rep_connect_wait); 173 wake_up_all(&ep->rep_connect_wait);
173 } 174 }
174} 175}
175 176
177static const char * const wc_status[] = {
178 "success",
179 "local length error",
180 "local QP operation error",
181 "local EE context operation error",
182 "local protection error",
183 "WR flushed",
184 "memory management operation error",
185 "bad response error",
186 "local access error",
187 "remote invalid request error",
188 "remote access error",
189 "remote operation error",
190 "transport retry counter exceeded",
191 "RNR retrycounter exceeded",
192 "local RDD violation error",
193 "remove invalid RD request",
194 "operation aborted",
195 "invalid EE context number",
196 "invalid EE context state",
197 "fatal error",
198 "response timeout error",
199 "general error",
200};
201
202#define COMPLETION_MSG(status) \
203 ((status) < ARRAY_SIZE(wc_status) ? \
204 wc_status[(status)] : "unexpected completion error")
205
176static void 206static void
177rpcrdma_sendcq_process_wc(struct ib_wc *wc) 207rpcrdma_sendcq_process_wc(struct ib_wc *wc)
178{ 208{
179 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; 209 if (likely(wc->status == IB_WC_SUCCESS))
210 return;
180 211
181 dprintk("RPC: %s: frmr %p status %X opcode %d\n", 212 /* WARNING: Only wr_id and status are reliable at this point */
182 __func__, frmr, wc->status, wc->opcode); 213 if (wc->wr_id == 0ULL) {
214 if (wc->status != IB_WC_WR_FLUSH_ERR)
215 pr_err("RPC: %s: SEND: %s\n",
216 __func__, COMPLETION_MSG(wc->status));
217 } else {
218 struct rpcrdma_mw *r;
183 219
184 if (wc->wr_id == 0ULL) 220 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
185 return; 221 r->r.frmr.fr_state = FRMR_IS_STALE;
186 if (wc->status != IB_WC_SUCCESS) 222 pr_err("RPC: %s: frmr %p (stale): %s\n",
187 frmr->r.frmr.fr_state = FRMR_IS_STALE; 223 __func__, r, COMPLETION_MSG(wc->status));
224 }
188} 225}
189 226
190static int 227static int
@@ -248,33 +285,32 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
248 struct rpcrdma_rep *rep = 285 struct rpcrdma_rep *rep =
249 (struct rpcrdma_rep *)(unsigned long)wc->wr_id; 286 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
250 287
251 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", 288 /* WARNING: Only wr_id and status are reliable at this point */
252 __func__, rep, wc->status, wc->opcode, wc->byte_len); 289 if (wc->status != IB_WC_SUCCESS)
290 goto out_fail;
253 291
254 if (wc->status != IB_WC_SUCCESS) { 292 /* status == SUCCESS means all fields in wc are trustworthy */
255 rep->rr_len = ~0U;
256 goto out_schedule;
257 }
258 if (wc->opcode != IB_WC_RECV) 293 if (wc->opcode != IB_WC_RECV)
259 return; 294 return;
260 295
296 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
297 __func__, rep, wc->byte_len);
298
261 rep->rr_len = wc->byte_len; 299 rep->rr_len = wc->byte_len;
262 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, 300 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
263 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); 301 rdmab_addr(rep->rr_rdmabuf),
264 302 rep->rr_len, DMA_FROM_DEVICE);
265 if (rep->rr_len >= 16) { 303 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
266 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
267 unsigned int credits = ntohl(p->rm_credit);
268
269 if (credits == 0)
270 credits = 1; /* don't deadlock */
271 else if (credits > rep->rr_buffer->rb_max_requests)
272 credits = rep->rr_buffer->rb_max_requests;
273 atomic_set(&rep->rr_buffer->rb_credits, credits);
274 }
275 304
276out_schedule: 305out_schedule:
277 list_add_tail(&rep->rr_list, sched_list); 306 list_add_tail(&rep->rr_list, sched_list);
307 return;
308out_fail:
309 if (wc->status != IB_WC_WR_FLUSH_ERR)
310 pr_err("RPC: %s: rep %p: %s\n",
311 __func__, rep, COMPLETION_MSG(wc->status));
312 rep->rr_len = ~0U;
313 goto out_schedule;
278} 314}
279 315
280static int 316static int
@@ -390,8 +426,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
390#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 426#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
391 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; 427 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
392#endif 428#endif
393 struct ib_qp_attr attr; 429 struct ib_qp_attr *attr = &ia->ri_qp_attr;
394 struct ib_qp_init_attr iattr; 430 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
395 int connstate = 0; 431 int connstate = 0;
396 432
397 switch (event->event) { 433 switch (event->event) {
@@ -414,12 +450,13 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
414 break; 450 break;
415 case RDMA_CM_EVENT_ESTABLISHED: 451 case RDMA_CM_EVENT_ESTABLISHED:
416 connstate = 1; 452 connstate = 1;
417 ib_query_qp(ia->ri_id->qp, &attr, 453 ib_query_qp(ia->ri_id->qp, attr,
418 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, 454 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
419 &iattr); 455 iattr);
420 dprintk("RPC: %s: %d responder resources" 456 dprintk("RPC: %s: %d responder resources"
421 " (%d initiator)\n", 457 " (%d initiator)\n",
422 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic); 458 __func__, attr->max_dest_rd_atomic,
459 attr->max_rd_atomic);
423 goto connected; 460 goto connected;
424 case RDMA_CM_EVENT_CONNECT_ERROR: 461 case RDMA_CM_EVENT_CONNECT_ERROR:
425 connstate = -ENOTCONN; 462 connstate = -ENOTCONN;
@@ -436,11 +473,10 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
436 case RDMA_CM_EVENT_DEVICE_REMOVAL: 473 case RDMA_CM_EVENT_DEVICE_REMOVAL:
437 connstate = -ENODEV; 474 connstate = -ENODEV;
438connected: 475connected:
439 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
440 dprintk("RPC: %s: %sconnected\n", 476 dprintk("RPC: %s: %sconnected\n",
441 __func__, connstate > 0 ? "" : "dis"); 477 __func__, connstate > 0 ? "" : "dis");
442 ep->rep_connected = connstate; 478 ep->rep_connected = connstate;
443 ep->rep_func(ep); 479 rpcrdma_conn_func(ep);
444 wake_up_all(&ep->rep_connect_wait); 480 wake_up_all(&ep->rep_connect_wait);
445 /*FALLTHROUGH*/ 481 /*FALLTHROUGH*/
446 default: 482 default:
@@ -453,7 +489,7 @@ connected:
453 489
454#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 490#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
455 if (connstate == 1) { 491 if (connstate == 1) {
456 int ird = attr.max_dest_rd_atomic; 492 int ird = attr->max_dest_rd_atomic;
457 int tird = ep->rep_remote_cma.responder_resources; 493 int tird = ep->rep_remote_cma.responder_resources;
458 printk(KERN_INFO "rpcrdma: connection to %pI4:%u " 494 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
459 "on %s, memreg %d slots %d ird %d%s\n", 495 "on %s, memreg %d slots %d ird %d%s\n",
@@ -554,8 +590,8 @@ int
554rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) 590rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
555{ 591{
556 int rc, mem_priv; 592 int rc, mem_priv;
557 struct ib_device_attr devattr;
558 struct rpcrdma_ia *ia = &xprt->rx_ia; 593 struct rpcrdma_ia *ia = &xprt->rx_ia;
594 struct ib_device_attr *devattr = &ia->ri_devattr;
559 595
560 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 596 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
561 if (IS_ERR(ia->ri_id)) { 597 if (IS_ERR(ia->ri_id)) {
@@ -571,26 +607,21 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
571 goto out2; 607 goto out2;
572 } 608 }
573 609
574 /* 610 rc = ib_query_device(ia->ri_id->device, devattr);
575 * Query the device to determine if the requested memory
576 * registration strategy is supported. If it isn't, set the
577 * strategy to a globally supported model.
578 */
579 rc = ib_query_device(ia->ri_id->device, &devattr);
580 if (rc) { 611 if (rc) {
581 dprintk("RPC: %s: ib_query_device failed %d\n", 612 dprintk("RPC: %s: ib_query_device failed %d\n",
582 __func__, rc); 613 __func__, rc);
583 goto out2; 614 goto out3;
584 } 615 }
585 616
586 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { 617 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
587 ia->ri_have_dma_lkey = 1; 618 ia->ri_have_dma_lkey = 1;
588 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; 619 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
589 } 620 }
590 621
591 if (memreg == RPCRDMA_FRMR) { 622 if (memreg == RPCRDMA_FRMR) {
592 /* Requires both frmr reg and local dma lkey */ 623 /* Requires both frmr reg and local dma lkey */
593 if ((devattr.device_cap_flags & 624 if ((devattr->device_cap_flags &
594 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != 625 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
595 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { 626 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
596 dprintk("RPC: %s: FRMR registration " 627 dprintk("RPC: %s: FRMR registration "
@@ -600,7 +631,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
600 /* Mind the ia limit on FRMR page list depth */ 631 /* Mind the ia limit on FRMR page list depth */
601 ia->ri_max_frmr_depth = min_t(unsigned int, 632 ia->ri_max_frmr_depth = min_t(unsigned int,
602 RPCRDMA_MAX_DATA_SEGS, 633 RPCRDMA_MAX_DATA_SEGS,
603 devattr.max_fast_reg_page_list_len); 634 devattr->max_fast_reg_page_list_len);
604 } 635 }
605 } 636 }
606 if (memreg == RPCRDMA_MTHCAFMR) { 637 if (memreg == RPCRDMA_MTHCAFMR) {
@@ -638,14 +669,14 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
638 "phys register failed with %lX\n", 669 "phys register failed with %lX\n",
639 __func__, PTR_ERR(ia->ri_bind_mem)); 670 __func__, PTR_ERR(ia->ri_bind_mem));
640 rc = -ENOMEM; 671 rc = -ENOMEM;
641 goto out2; 672 goto out3;
642 } 673 }
643 break; 674 break;
644 default: 675 default:
645 printk(KERN_ERR "RPC: Unsupported memory " 676 printk(KERN_ERR "RPC: Unsupported memory "
646 "registration mode: %d\n", memreg); 677 "registration mode: %d\n", memreg);
647 rc = -ENOMEM; 678 rc = -ENOMEM;
648 goto out2; 679 goto out3;
649 } 680 }
650 dprintk("RPC: %s: memory registration strategy is %d\n", 681 dprintk("RPC: %s: memory registration strategy is %d\n",
651 __func__, memreg); 682 __func__, memreg);
@@ -655,6 +686,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
655 686
656 rwlock_init(&ia->ri_qplock); 687 rwlock_init(&ia->ri_qplock);
657 return 0; 688 return 0;
689
690out3:
691 ib_dealloc_pd(ia->ri_pd);
692 ia->ri_pd = NULL;
658out2: 693out2:
659 rdma_destroy_id(ia->ri_id); 694 rdma_destroy_id(ia->ri_id);
660 ia->ri_id = NULL; 695 ia->ri_id = NULL;
@@ -698,20 +733,13 @@ int
698rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 733rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
699 struct rpcrdma_create_data_internal *cdata) 734 struct rpcrdma_create_data_internal *cdata)
700{ 735{
701 struct ib_device_attr devattr; 736 struct ib_device_attr *devattr = &ia->ri_devattr;
702 struct ib_cq *sendcq, *recvcq; 737 struct ib_cq *sendcq, *recvcq;
703 int rc, err; 738 int rc, err;
704 739
705 rc = ib_query_device(ia->ri_id->device, &devattr);
706 if (rc) {
707 dprintk("RPC: %s: ib_query_device failed %d\n",
708 __func__, rc);
709 return rc;
710 }
711
712 /* check provider's send/recv wr limits */ 740 /* check provider's send/recv wr limits */
713 if (cdata->max_requests > devattr.max_qp_wr) 741 if (cdata->max_requests > devattr->max_qp_wr)
714 cdata->max_requests = devattr.max_qp_wr; 742 cdata->max_requests = devattr->max_qp_wr;
715 743
716 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 744 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
717 ep->rep_attr.qp_context = ep; 745 ep->rep_attr.qp_context = ep;
@@ -746,8 +774,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
746 774
747 } 775 }
748 ep->rep_attr.cap.max_send_wr *= depth; 776 ep->rep_attr.cap.max_send_wr *= depth;
749 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { 777 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
750 cdata->max_requests = devattr.max_qp_wr / depth; 778 cdata->max_requests = devattr->max_qp_wr / depth;
751 if (!cdata->max_requests) 779 if (!cdata->max_requests)
752 return -EINVAL; 780 return -EINVAL;
753 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 781 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
@@ -766,6 +794,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
766 ep->rep_attr.qp_type = IB_QPT_RC; 794 ep->rep_attr.qp_type = IB_QPT_RC;
767 ep->rep_attr.port_num = ~0; 795 ep->rep_attr.port_num = ~0;
768 796
797 if (cdata->padding) {
798 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
799 GFP_KERNEL);
800 if (IS_ERR(ep->rep_padbuf))
801 return PTR_ERR(ep->rep_padbuf);
802 } else
803 ep->rep_padbuf = NULL;
804
769 dprintk("RPC: %s: requested max: dtos: send %d recv %d; " 805 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
770 "iovs: send %d recv %d\n", 806 "iovs: send %d recv %d\n",
771 __func__, 807 __func__,
@@ -781,7 +817,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
781 else if (ep->rep_cqinit <= 2) 817 else if (ep->rep_cqinit <= 2)
782 ep->rep_cqinit = 0; 818 ep->rep_cqinit = 0;
783 INIT_CQCOUNT(ep); 819 INIT_CQCOUNT(ep);
784 ep->rep_ia = ia;
785 init_waitqueue_head(&ep->rep_connect_wait); 820 init_waitqueue_head(&ep->rep_connect_wait);
786 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); 821 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
787 822
@@ -831,10 +866,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
831 866
832 /* Client offers RDMA Read but does not initiate */ 867 /* Client offers RDMA Read but does not initiate */
833 ep->rep_remote_cma.initiator_depth = 0; 868 ep->rep_remote_cma.initiator_depth = 0;
834 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ 869 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
835 ep->rep_remote_cma.responder_resources = 32; 870 ep->rep_remote_cma.responder_resources = 32;
836 else 871 else
837 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; 872 ep->rep_remote_cma.responder_resources =
873 devattr->max_qp_rd_atom;
838 874
839 ep->rep_remote_cma.retry_count = 7; 875 ep->rep_remote_cma.retry_count = 7;
840 ep->rep_remote_cma.flow_control = 0; 876 ep->rep_remote_cma.flow_control = 0;
@@ -848,6 +884,7 @@ out2:
848 dprintk("RPC: %s: ib_destroy_cq returned %i\n", 884 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
849 __func__, err); 885 __func__, err);
850out1: 886out1:
887 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
851 return rc; 888 return rc;
852} 889}
853 890
@@ -874,11 +911,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
874 ia->ri_id->qp = NULL; 911 ia->ri_id->qp = NULL;
875 } 912 }
876 913
877 /* padding - could be done in rpcrdma_buffer_destroy... */ 914 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
878 if (ep->rep_pad_mr) {
879 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
880 ep->rep_pad_mr = NULL;
881 }
882 915
883 rpcrdma_clean_cq(ep->rep_attr.recv_cq); 916 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
884 rc = ib_destroy_cq(ep->rep_attr.recv_cq); 917 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
@@ -1048,6 +1081,48 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1048 } 1081 }
1049} 1082}
1050 1083
1084static struct rpcrdma_req *
1085rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1086{
1087 struct rpcrdma_req *req;
1088
1089 req = kzalloc(sizeof(*req), GFP_KERNEL);
1090 if (req == NULL)
1091 return ERR_PTR(-ENOMEM);
1092
1093 req->rl_buffer = &r_xprt->rx_buf;
1094 return req;
1095}
1096
1097static struct rpcrdma_rep *
1098rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1099{
1100 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1101 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1102 struct rpcrdma_rep *rep;
1103 int rc;
1104
1105 rc = -ENOMEM;
1106 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1107 if (rep == NULL)
1108 goto out;
1109
1110 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1111 GFP_KERNEL);
1112 if (IS_ERR(rep->rr_rdmabuf)) {
1113 rc = PTR_ERR(rep->rr_rdmabuf);
1114 goto out_free;
1115 }
1116
1117 rep->rr_buffer = &r_xprt->rx_buf;
1118 return rep;
1119
1120out_free:
1121 kfree(rep);
1122out:
1123 return ERR_PTR(rc);
1124}
1125
1051static int 1126static int
1052rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) 1127rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1053{ 1128{
@@ -1134,27 +1209,26 @@ out_free:
1134} 1209}
1135 1210
1136int 1211int
1137rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, 1212rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1138 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1139{ 1213{
1214 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1215 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1216 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1140 char *p; 1217 char *p;
1141 size_t len, rlen, wlen; 1218 size_t len;
1142 int i, rc; 1219 int i, rc;
1143 1220
1144 buf->rb_max_requests = cdata->max_requests; 1221 buf->rb_max_requests = cdata->max_requests;
1145 spin_lock_init(&buf->rb_lock); 1222 spin_lock_init(&buf->rb_lock);
1146 atomic_set(&buf->rb_credits, 1);
1147 1223
1148 /* Need to allocate: 1224 /* Need to allocate:
1149 * 1. arrays for send and recv pointers 1225 * 1. arrays for send and recv pointers
1150 * 2. arrays of struct rpcrdma_req to fill in pointers 1226 * 2. arrays of struct rpcrdma_req to fill in pointers
1151 * 3. array of struct rpcrdma_rep for replies 1227 * 3. array of struct rpcrdma_rep for replies
1152 * 4. padding, if any
1153 * Send/recv buffers in req/rep need to be registered 1228 * Send/recv buffers in req/rep need to be registered
1154 */ 1229 */
1155 len = buf->rb_max_requests * 1230 len = buf->rb_max_requests *
1156 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); 1231 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1157 len += cdata->padding;
1158 1232
1159 p = kzalloc(len, GFP_KERNEL); 1233 p = kzalloc(len, GFP_KERNEL);
1160 if (p == NULL) { 1234 if (p == NULL) {
@@ -1170,17 +1244,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1170 buf->rb_recv_bufs = (struct rpcrdma_rep **) p; 1244 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1171 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; 1245 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1172 1246
1173 /*
1174 * Register the zeroed pad buffer, if any.
1175 */
1176 if (cdata->padding) {
1177 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1178 &ep->rep_pad_mr, &ep->rep_pad);
1179 if (rc)
1180 goto out;
1181 }
1182 p += cdata->padding;
1183
1184 INIT_LIST_HEAD(&buf->rb_mws); 1247 INIT_LIST_HEAD(&buf->rb_mws);
1185 INIT_LIST_HEAD(&buf->rb_all); 1248 INIT_LIST_HEAD(&buf->rb_all);
1186 switch (ia->ri_memreg_strategy) { 1249 switch (ia->ri_memreg_strategy) {
@@ -1198,62 +1261,29 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1198 break; 1261 break;
1199 } 1262 }
1200 1263
1201 /*
1202 * Allocate/init the request/reply buffers. Doing this
1203 * using kmalloc for now -- one for each buf.
1204 */
1205 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1206 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1207 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1208 __func__, wlen, rlen);
1209
1210 for (i = 0; i < buf->rb_max_requests; i++) { 1264 for (i = 0; i < buf->rb_max_requests; i++) {
1211 struct rpcrdma_req *req; 1265 struct rpcrdma_req *req;
1212 struct rpcrdma_rep *rep; 1266 struct rpcrdma_rep *rep;
1213 1267
1214 req = kmalloc(wlen, GFP_KERNEL); 1268 req = rpcrdma_create_req(r_xprt);
1215 if (req == NULL) { 1269 if (IS_ERR(req)) {
1216 dprintk("RPC: %s: request buffer %d alloc" 1270 dprintk("RPC: %s: request buffer %d alloc"
1217 " failed\n", __func__, i); 1271 " failed\n", __func__, i);
1218 rc = -ENOMEM; 1272 rc = PTR_ERR(req);
1219 goto out; 1273 goto out;
1220 } 1274 }
1221 memset(req, 0, sizeof(struct rpcrdma_req));
1222 buf->rb_send_bufs[i] = req; 1275 buf->rb_send_bufs[i] = req;
1223 buf->rb_send_bufs[i]->rl_buffer = buf;
1224 1276
1225 rc = rpcrdma_register_internal(ia, req->rl_base, 1277 rep = rpcrdma_create_rep(r_xprt);
1226 wlen - offsetof(struct rpcrdma_req, rl_base), 1278 if (IS_ERR(rep)) {
1227 &buf->rb_send_bufs[i]->rl_handle,
1228 &buf->rb_send_bufs[i]->rl_iov);
1229 if (rc)
1230 goto out;
1231
1232 buf->rb_send_bufs[i]->rl_size = wlen -
1233 sizeof(struct rpcrdma_req);
1234
1235 rep = kmalloc(rlen, GFP_KERNEL);
1236 if (rep == NULL) {
1237 dprintk("RPC: %s: reply buffer %d alloc failed\n", 1279 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1238 __func__, i); 1280 __func__, i);
1239 rc = -ENOMEM; 1281 rc = PTR_ERR(rep);
1240 goto out; 1282 goto out;
1241 } 1283 }
1242 memset(rep, 0, sizeof(struct rpcrdma_rep));
1243 buf->rb_recv_bufs[i] = rep; 1284 buf->rb_recv_bufs[i] = rep;
1244 buf->rb_recv_bufs[i]->rr_buffer = buf;
1245
1246 rc = rpcrdma_register_internal(ia, rep->rr_base,
1247 rlen - offsetof(struct rpcrdma_rep, rr_base),
1248 &buf->rb_recv_bufs[i]->rr_handle,
1249 &buf->rb_recv_bufs[i]->rr_iov);
1250 if (rc)
1251 goto out;
1252
1253 } 1285 }
1254 dprintk("RPC: %s: max_requests %d\n", 1286
1255 __func__, buf->rb_max_requests);
1256 /* done */
1257 return 0; 1287 return 0;
1258out: 1288out:
1259 rpcrdma_buffer_destroy(buf); 1289 rpcrdma_buffer_destroy(buf);
@@ -1261,6 +1291,27 @@ out:
1261} 1291}
1262 1292
1263static void 1293static void
1294rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1295{
1296 if (!rep)
1297 return;
1298
1299 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1300 kfree(rep);
1301}
1302
1303static void
1304rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1305{
1306 if (!req)
1307 return;
1308
1309 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
1310 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1311 kfree(req);
1312}
1313
1314static void
1264rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) 1315rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1265{ 1316{
1266 struct rpcrdma_mw *r; 1317 struct rpcrdma_mw *r;
@@ -1315,18 +1366,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1315 dprintk("RPC: %s: entering\n", __func__); 1366 dprintk("RPC: %s: entering\n", __func__);
1316 1367
1317 for (i = 0; i < buf->rb_max_requests; i++) { 1368 for (i = 0; i < buf->rb_max_requests; i++) {
1318 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { 1369 if (buf->rb_recv_bufs)
1319 rpcrdma_deregister_internal(ia, 1370 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1320 buf->rb_recv_bufs[i]->rr_handle, 1371 if (buf->rb_send_bufs)
1321 &buf->rb_recv_bufs[i]->rr_iov); 1372 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1322 kfree(buf->rb_recv_bufs[i]);
1323 }
1324 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1325 rpcrdma_deregister_internal(ia,
1326 buf->rb_send_bufs[i]->rl_handle,
1327 &buf->rb_send_bufs[i]->rl_iov);
1328 kfree(buf->rb_send_bufs[i]);
1329 }
1330 } 1373 }
1331 1374
1332 switch (ia->ri_memreg_strategy) { 1375 switch (ia->ri_memreg_strategy) {
@@ -1450,8 +1493,8 @@ rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1450 int i; 1493 int i;
1451 1494
1452 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) 1495 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1453 rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); 1496 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1454 rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); 1497 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
1455} 1498}
1456 1499
1457static void 1500static void
@@ -1537,7 +1580,7 @@ rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1537 list_add(&r->mw_list, stale); 1580 list_add(&r->mw_list, stale);
1538 continue; 1581 continue;
1539 } 1582 }
1540 req->rl_segments[i].mr_chunk.rl_mw = r; 1583 req->rl_segments[i].rl_mw = r;
1541 if (unlikely(i-- == 0)) 1584 if (unlikely(i-- == 0))
1542 return req; /* Success */ 1585 return req; /* Success */
1543 } 1586 }
@@ -1559,7 +1602,7 @@ rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1559 r = list_entry(buf->rb_mws.next, 1602 r = list_entry(buf->rb_mws.next,
1560 struct rpcrdma_mw, mw_list); 1603 struct rpcrdma_mw, mw_list);
1561 list_del(&r->mw_list); 1604 list_del(&r->mw_list);
1562 req->rl_segments[i].mr_chunk.rl_mw = r; 1605 req->rl_segments[i].rl_mw = r;
1563 if (unlikely(i-- == 0)) 1606 if (unlikely(i-- == 0))
1564 return req; /* Success */ 1607 return req; /* Success */
1565 } 1608 }
@@ -1658,8 +1701,6 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1658 struct rpcrdma_buffer *buffers = req->rl_buffer; 1701 struct rpcrdma_buffer *buffers = req->rl_buffer;
1659 unsigned long flags; 1702 unsigned long flags;
1660 1703
1661 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1662 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1663 spin_lock_irqsave(&buffers->rb_lock, flags); 1704 spin_lock_irqsave(&buffers->rb_lock, flags);
1664 if (buffers->rb_recv_index < buffers->rb_max_requests) { 1705 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1665 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; 1706 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
@@ -1688,7 +1729,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1688 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1729 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1689 */ 1730 */
1690 1731
1691int 1732static int
1692rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, 1733rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1693 struct ib_mr **mrp, struct ib_sge *iov) 1734 struct ib_mr **mrp, struct ib_sge *iov)
1694{ 1735{
@@ -1739,7 +1780,7 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1739 return rc; 1780 return rc;
1740} 1781}
1741 1782
1742int 1783static int
1743rpcrdma_deregister_internal(struct rpcrdma_ia *ia, 1784rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1744 struct ib_mr *mr, struct ib_sge *iov) 1785 struct ib_mr *mr, struct ib_sge *iov)
1745{ 1786{
@@ -1757,6 +1798,61 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1757 return rc; 1798 return rc;
1758} 1799}
1759 1800
1801/**
1802 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1803 * @ia: controlling rpcrdma_ia
1804 * @size: size of buffer to be allocated, in bytes
1805 * @flags: GFP flags
1806 *
1807 * Returns pointer to private header of an area of internally
1808 * registered memory, or an ERR_PTR. The registered buffer follows
1809 * the end of the private header.
1810 *
1811 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1812 * receiving the payload of RDMA RECV operations. regbufs are not
1813 * used for RDMA READ/WRITE operations, thus are registered only for
1814 * LOCAL access.
1815 */
1816struct rpcrdma_regbuf *
1817rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1818{
1819 struct rpcrdma_regbuf *rb;
1820 int rc;
1821
1822 rc = -ENOMEM;
1823 rb = kmalloc(sizeof(*rb) + size, flags);
1824 if (rb == NULL)
1825 goto out;
1826
1827 rb->rg_size = size;
1828 rb->rg_owner = NULL;
1829 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1830 &rb->rg_mr, &rb->rg_iov);
1831 if (rc)
1832 goto out_free;
1833
1834 return rb;
1835
1836out_free:
1837 kfree(rb);
1838out:
1839 return ERR_PTR(rc);
1840}
1841
1842/**
1843 * rpcrdma_free_regbuf - deregister and free registered buffer
1844 * @ia: controlling rpcrdma_ia
1845 * @rb: regbuf to be deregistered and freed
1846 */
1847void
1848rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1849{
1850 if (rb) {
1851 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1852 kfree(rb);
1853 }
1854}
1855
1760/* 1856/*
1761 * Wrappers for chunk registration, shared by read/write chunk code. 1857 * Wrappers for chunk registration, shared by read/write chunk code.
1762 */ 1858 */
@@ -1799,7 +1895,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1799 struct rpcrdma_xprt *r_xprt) 1895 struct rpcrdma_xprt *r_xprt)
1800{ 1896{
1801 struct rpcrdma_mr_seg *seg1 = seg; 1897 struct rpcrdma_mr_seg *seg1 = seg;
1802 struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; 1898 struct rpcrdma_mw *mw = seg1->rl_mw;
1803 struct rpcrdma_frmr *frmr = &mw->r.frmr; 1899 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1804 struct ib_mr *mr = frmr->fr_mr; 1900 struct ib_mr *mr = frmr->fr_mr;
1805 struct ib_send_wr fastreg_wr, *bad_wr; 1901 struct ib_send_wr fastreg_wr, *bad_wr;
@@ -1888,12 +1984,12 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1888 struct ib_send_wr invalidate_wr, *bad_wr; 1984 struct ib_send_wr invalidate_wr, *bad_wr;
1889 int rc; 1985 int rc;
1890 1986
1891 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; 1987 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1892 1988
1893 memset(&invalidate_wr, 0, sizeof invalidate_wr); 1989 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1894 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; 1990 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1895 invalidate_wr.opcode = IB_WR_LOCAL_INV; 1991 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1896 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1992 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1897 DECR_CQCOUNT(&r_xprt->rx_ep); 1993 DECR_CQCOUNT(&r_xprt->rx_ep);
1898 1994
1899 read_lock(&ia->ri_qplock); 1995 read_lock(&ia->ri_qplock);
@@ -1903,7 +1999,7 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1903 read_unlock(&ia->ri_qplock); 1999 read_unlock(&ia->ri_qplock);
1904 if (rc) { 2000 if (rc) {
1905 /* Force rpcrdma_buffer_get() to retry */ 2001 /* Force rpcrdma_buffer_get() to retry */
1906 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; 2002 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1907 dprintk("RPC: %s: failed ib_post_send for invalidate," 2003 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1908 " status %i\n", __func__, rc); 2004 " status %i\n", __func__, rc);
1909 } 2005 }
@@ -1935,8 +2031,7 @@ rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1935 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 2031 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1936 break; 2032 break;
1937 } 2033 }
1938 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, 2034 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
1939 physaddrs, i, seg1->mr_dma);
1940 if (rc) { 2035 if (rc) {
1941 dprintk("RPC: %s: failed ib_map_phys_fmr " 2036 dprintk("RPC: %s: failed ib_map_phys_fmr "
1942 "%u@0x%llx+%i (%d)... status %i\n", __func__, 2037 "%u@0x%llx+%i (%d)... status %i\n", __func__,
@@ -1945,7 +2040,7 @@ rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1945 while (i--) 2040 while (i--)
1946 rpcrdma_unmap_one(ia, --seg); 2041 rpcrdma_unmap_one(ia, --seg);
1947 } else { 2042 } else {
1948 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; 2043 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
1949 seg1->mr_base = seg1->mr_dma + pageoff; 2044 seg1->mr_base = seg1->mr_dma + pageoff;
1950 seg1->mr_nsegs = i; 2045 seg1->mr_nsegs = i;
1951 seg1->mr_len = len; 2046 seg1->mr_len = len;
@@ -1962,7 +2057,7 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1962 LIST_HEAD(l); 2057 LIST_HEAD(l);
1963 int rc; 2058 int rc;
1964 2059
1965 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); 2060 list_add(&seg1->rl_mw->r.fmr->list, &l);
1966 rc = ib_unmap_fmr(&l); 2061 rc = ib_unmap_fmr(&l);
1967 read_lock(&ia->ri_qplock); 2062 read_lock(&ia->ri_qplock);
1968 while (seg1->mr_nsegs--) 2063 while (seg1->mr_nsegs--)
@@ -2104,11 +2199,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2104 2199
2105 recv_wr.next = NULL; 2200 recv_wr.next = NULL;
2106 recv_wr.wr_id = (u64) (unsigned long) rep; 2201 recv_wr.wr_id = (u64) (unsigned long) rep;
2107 recv_wr.sg_list = &rep->rr_iov; 2202 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
2108 recv_wr.num_sge = 1; 2203 recv_wr.num_sge = 1;
2109 2204
2110 ib_dma_sync_single_for_cpu(ia->ri_id->device, 2205 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2111 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); 2206 rdmab_addr(rep->rr_rdmabuf),
2207 rdmab_length(rep->rr_rdmabuf),
2208 DMA_BIDIRECTIONAL);
2112 2209
2113 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 2210 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2114 2211
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b799041b75bf..d1b70397c60f 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,6 +70,9 @@ struct rpcrdma_ia {
70 int ri_async_rc; 70 int ri_async_rc;
71 enum rpcrdma_memreg ri_memreg_strategy; 71 enum rpcrdma_memreg ri_memreg_strategy;
72 unsigned int ri_max_frmr_depth; 72 unsigned int ri_max_frmr_depth;
73 struct ib_device_attr ri_devattr;
74 struct ib_qp_attr ri_qp_attr;
75 struct ib_qp_init_attr ri_qp_init_attr;
73}; 76};
74 77
75/* 78/*
@@ -83,13 +86,9 @@ struct rpcrdma_ep {
83 atomic_t rep_cqcount; 86 atomic_t rep_cqcount;
84 int rep_cqinit; 87 int rep_cqinit;
85 int rep_connected; 88 int rep_connected;
86 struct rpcrdma_ia *rep_ia;
87 struct ib_qp_init_attr rep_attr; 89 struct ib_qp_init_attr rep_attr;
88 wait_queue_head_t rep_connect_wait; 90 wait_queue_head_t rep_connect_wait;
89 struct ib_sge rep_pad; /* holds zeroed pad */ 91 struct rpcrdma_regbuf *rep_padbuf;
90 struct ib_mr *rep_pad_mr; /* holds zeroed pad */
91 void (*rep_func)(struct rpcrdma_ep *);
92 struct rpc_xprt *rep_xprt; /* for rep_func */
93 struct rdma_conn_param rep_remote_cma; 92 struct rdma_conn_param rep_remote_cma;
94 struct sockaddr_storage rep_remote_addr; 93 struct sockaddr_storage rep_remote_addr;
95 struct delayed_work rep_connect_worker; 94 struct delayed_work rep_connect_worker;
@@ -106,6 +105,44 @@ struct rpcrdma_ep {
106#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 105#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
107#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) 106#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
108 107
108/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
109 *
110 * The below structure appears at the front of a large region of kmalloc'd
111 * memory, which always starts on a good alignment boundary.
112 */
113
114struct rpcrdma_regbuf {
115 size_t rg_size;
116 struct rpcrdma_req *rg_owner;
117 struct ib_mr *rg_mr;
118 struct ib_sge rg_iov;
119 __be32 rg_base[0] __attribute__ ((aligned(256)));
120};
121
122static inline u64
123rdmab_addr(struct rpcrdma_regbuf *rb)
124{
125 return rb->rg_iov.addr;
126}
127
128static inline u32
129rdmab_length(struct rpcrdma_regbuf *rb)
130{
131 return rb->rg_iov.length;
132}
133
134static inline u32
135rdmab_lkey(struct rpcrdma_regbuf *rb)
136{
137 return rb->rg_iov.lkey;
138}
139
140static inline struct rpcrdma_msg *
141rdmab_to_msg(struct rpcrdma_regbuf *rb)
142{
143 return (struct rpcrdma_msg *)rb->rg_base;
144}
145
109enum rpcrdma_chunktype { 146enum rpcrdma_chunktype {
110 rpcrdma_noch = 0, 147 rpcrdma_noch = 0,
111 rpcrdma_readch, 148 rpcrdma_readch,
@@ -134,22 +171,16 @@ enum rpcrdma_chunktype {
134/* temporary static scatter/gather max */ 171/* temporary static scatter/gather max */
135#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ 172#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
136#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ 173#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
137#define MAX_RPCRDMAHDR (\
138 /* max supported RPC/RDMA header */ \
139 sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
140 (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
141 174
142struct rpcrdma_buffer; 175struct rpcrdma_buffer;
143 176
144struct rpcrdma_rep { 177struct rpcrdma_rep {
145 unsigned int rr_len; /* actual received reply length */ 178 unsigned int rr_len;
146 struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ 179 struct rpcrdma_buffer *rr_buffer;
147 struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ 180 struct rpc_xprt *rr_xprt;
148 void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ 181 void (*rr_func)(struct rpcrdma_rep *);
149 struct list_head rr_list; /* tasklet list */ 182 struct list_head rr_list;
150 struct ib_sge rr_iov; /* for posting */ 183 struct rpcrdma_regbuf *rr_rdmabuf;
151 struct ib_mr *rr_handle; /* handle for mem in rr_iov */
152 char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
153}; 184};
154 185
155/* 186/*
@@ -211,10 +242,7 @@ struct rpcrdma_mw {
211 */ 242 */
212 243
213struct rpcrdma_mr_seg { /* chunk descriptors */ 244struct rpcrdma_mr_seg { /* chunk descriptors */
214 union { /* chunk memory handles */ 245 struct rpcrdma_mw *rl_mw; /* registered MR */
215 struct ib_mr *rl_mr; /* if registered directly */
216 struct rpcrdma_mw *rl_mw; /* if registered from region */
217 } mr_chunk;
218 u64 mr_base; /* registration result */ 246 u64 mr_base; /* registration result */
219 u32 mr_rkey; /* registration result */ 247 u32 mr_rkey; /* registration result */
220 u32 mr_len; /* length of chunk or segment */ 248 u32 mr_len; /* length of chunk or segment */
@@ -227,22 +255,27 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
227}; 255};
228 256
229struct rpcrdma_req { 257struct rpcrdma_req {
230 size_t rl_size; /* actual length of buffer */
231 unsigned int rl_niovs; /* 0, 2 or 4 */ 258 unsigned int rl_niovs; /* 0, 2 or 4 */
232 unsigned int rl_nchunks; /* non-zero if chunks */ 259 unsigned int rl_nchunks; /* non-zero if chunks */
233 unsigned int rl_connect_cookie; /* retry detection */ 260 unsigned int rl_connect_cookie; /* retry detection */
234 enum rpcrdma_chunktype rl_rtype, rl_wtype; 261 enum rpcrdma_chunktype rl_rtype, rl_wtype;
235 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ 262 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
236 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 263 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
237 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
238 struct ib_sge rl_send_iov[4]; /* for active requests */ 264 struct ib_sge rl_send_iov[4]; /* for active requests */
239 struct ib_sge rl_iov; /* for posting */ 265 struct rpcrdma_regbuf *rl_rdmabuf;
240 struct ib_mr *rl_handle; /* handle for mem in rl_iov */ 266 struct rpcrdma_regbuf *rl_sendbuf;
241 char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ 267 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
242 __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */
243}; 268};
244#define rpcr_to_rdmar(r) \ 269
245 container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) 270static inline struct rpcrdma_req *
271rpcr_to_rdmar(struct rpc_rqst *rqst)
272{
273 void *buffer = rqst->rq_buffer;
274 struct rpcrdma_regbuf *rb;
275
276 rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
277 return rb->rg_owner;
278}
246 279
247/* 280/*
248 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for 281 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
@@ -252,7 +285,6 @@ struct rpcrdma_req {
252 */ 285 */
253struct rpcrdma_buffer { 286struct rpcrdma_buffer {
254 spinlock_t rb_lock; /* protects indexes */ 287 spinlock_t rb_lock; /* protects indexes */
255 atomic_t rb_credits; /* most recent server credits */
256 int rb_max_requests;/* client max requests */ 288 int rb_max_requests;/* client max requests */
257 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ 289 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
258 struct list_head rb_all; 290 struct list_head rb_all;
@@ -318,16 +350,16 @@ struct rpcrdma_stats {
318 * during unmount. 350 * during unmount.
319 */ 351 */
320struct rpcrdma_xprt { 352struct rpcrdma_xprt {
321 struct rpc_xprt xprt; 353 struct rpc_xprt rx_xprt;
322 struct rpcrdma_ia rx_ia; 354 struct rpcrdma_ia rx_ia;
323 struct rpcrdma_ep rx_ep; 355 struct rpcrdma_ep rx_ep;
324 struct rpcrdma_buffer rx_buf; 356 struct rpcrdma_buffer rx_buf;
325 struct rpcrdma_create_data_internal rx_data; 357 struct rpcrdma_create_data_internal rx_data;
326 struct delayed_work rdma_connect; 358 struct delayed_work rx_connect_worker;
327 struct rpcrdma_stats rx_stats; 359 struct rpcrdma_stats rx_stats;
328}; 360};
329 361
330#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) 362#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
331#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) 363#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
332 364
333/* Setting this to 0 ensures interoperability with early servers. 365/* Setting this to 0 ensures interoperability with early servers.
@@ -358,9 +390,7 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
358/* 390/*
359 * Buffer calls - xprtrdma/verbs.c 391 * Buffer calls - xprtrdma/verbs.c
360 */ 392 */
361int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *, 393int rpcrdma_buffer_create(struct rpcrdma_xprt *);
362 struct rpcrdma_ia *,
363 struct rpcrdma_create_data_internal *);
364void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); 394void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
365 395
366struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); 396struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
@@ -368,16 +398,16 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
368void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 398void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
369void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 399void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
370 400
371int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
372 struct ib_mr **, struct ib_sge *);
373int rpcrdma_deregister_internal(struct rpcrdma_ia *,
374 struct ib_mr *, struct ib_sge *);
375
376int rpcrdma_register_external(struct rpcrdma_mr_seg *, 401int rpcrdma_register_external(struct rpcrdma_mr_seg *,
377 int, int, struct rpcrdma_xprt *); 402 int, int, struct rpcrdma_xprt *);
378int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, 403int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
379 struct rpcrdma_xprt *); 404 struct rpcrdma_xprt *);
380 405
406struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
407 size_t, gfp_t);
408void rpcrdma_free_regbuf(struct rpcrdma_ia *,
409 struct rpcrdma_regbuf *);
410
381/* 411/*
382 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c 412 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
383 */ 413 */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 87ce7e8bb8dc..66891e32c5e3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -63,6 +63,8 @@ static unsigned int xprt_max_tcp_slot_table_entries = RPC_MAX_SLOT_TABLE;
63static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; 63static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
64static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; 64static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
65 65
66#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
67
66#define XS_TCP_LINGER_TO (15U * HZ) 68#define XS_TCP_LINGER_TO (15U * HZ)
67static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; 69static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
68 70
@@ -75,8 +77,6 @@ static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
75 * someone else's file names! 77 * someone else's file names!
76 */ 78 */
77 79
78#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
79
80static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; 80static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
81static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; 81static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
82static unsigned int max_tcp_slot_table_limit = RPC_MAX_SLOT_TABLE_LIMIT; 82static unsigned int max_tcp_slot_table_limit = RPC_MAX_SLOT_TABLE_LIMIT;
@@ -627,7 +627,7 @@ process_status:
627 * @xprt: transport 627 * @xprt: transport
628 * 628 *
629 * Initiates a graceful shutdown of the TCP socket by calling the 629 * Initiates a graceful shutdown of the TCP socket by calling the
630 * equivalent of shutdown(SHUT_WR); 630 * equivalent of shutdown(SHUT_RDWR);
631 */ 631 */
632static void xs_tcp_shutdown(struct rpc_xprt *xprt) 632static void xs_tcp_shutdown(struct rpc_xprt *xprt)
633{ 633{
@@ -635,7 +635,7 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
635 struct socket *sock = transport->sock; 635 struct socket *sock = transport->sock;
636 636
637 if (sock != NULL) { 637 if (sock != NULL) {
638 kernel_sock_shutdown(sock, SHUT_WR); 638 kernel_sock_shutdown(sock, SHUT_RDWR);
639 trace_rpc_socket_shutdown(xprt, sock); 639 trace_rpc_socket_shutdown(xprt, sock);
640 } 640 }
641} 641}
@@ -718,9 +718,9 @@ static int xs_tcp_send_request(struct rpc_task *task)
718 dprintk("RPC: sendmsg returned unrecognized error %d\n", 718 dprintk("RPC: sendmsg returned unrecognized error %d\n",
719 -status); 719 -status);
720 case -ECONNRESET: 720 case -ECONNRESET:
721 xs_tcp_shutdown(xprt);
722 case -ECONNREFUSED: 721 case -ECONNREFUSED:
723 case -ENOTCONN: 722 case -ENOTCONN:
723 case -EADDRINUSE:
724 case -EPIPE: 724 case -EPIPE:
725 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); 725 clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
726 } 726 }
@@ -773,6 +773,21 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
773 sk->sk_error_report = transport->old_error_report; 773 sk->sk_error_report = transport->old_error_report;
774} 774}
775 775
776static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
777{
778 smp_mb__before_atomic();
779 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
780 clear_bit(XPRT_CLOSING, &xprt->state);
781 smp_mb__after_atomic();
782}
783
784static void xs_sock_mark_closed(struct rpc_xprt *xprt)
785{
786 xs_sock_reset_connection_flags(xprt);
787 /* Mark transport as closed and wake up all pending tasks */
788 xprt_disconnect_done(xprt);
789}
790
776/** 791/**
777 * xs_error_report - callback to handle TCP socket state errors 792 * xs_error_report - callback to handle TCP socket state errors
778 * @sk: socket 793 * @sk: socket
@@ -792,11 +807,12 @@ static void xs_error_report(struct sock *sk)
792 err = -sk->sk_err; 807 err = -sk->sk_err;
793 if (err == 0) 808 if (err == 0)
794 goto out; 809 goto out;
810 /* Is this a reset event? */
811 if (sk->sk_state == TCP_CLOSE)
812 xs_sock_mark_closed(xprt);
795 dprintk("RPC: xs_error_report client %p, error=%d...\n", 813 dprintk("RPC: xs_error_report client %p, error=%d...\n",
796 xprt, -err); 814 xprt, -err);
797 trace_rpc_socket_error(xprt, sk->sk_socket, err); 815 trace_rpc_socket_error(xprt, sk->sk_socket, err);
798 if (test_bit(XPRT_CONNECTION_REUSE, &xprt->state))
799 goto out;
800 xprt_wake_pending_tasks(xprt, err); 816 xprt_wake_pending_tasks(xprt, err);
801 out: 817 out:
802 read_unlock_bh(&sk->sk_callback_lock); 818 read_unlock_bh(&sk->sk_callback_lock);
@@ -806,12 +822,11 @@ static void xs_reset_transport(struct sock_xprt *transport)
806{ 822{
807 struct socket *sock = transport->sock; 823 struct socket *sock = transport->sock;
808 struct sock *sk = transport->inet; 824 struct sock *sk = transport->inet;
825 struct rpc_xprt *xprt = &transport->xprt;
809 826
810 if (sk == NULL) 827 if (sk == NULL)
811 return; 828 return;
812 829
813 transport->srcport = 0;
814
815 write_lock_bh(&sk->sk_callback_lock); 830 write_lock_bh(&sk->sk_callback_lock);
816 transport->inet = NULL; 831 transport->inet = NULL;
817 transport->sock = NULL; 832 transport->sock = NULL;
@@ -820,8 +835,9 @@ static void xs_reset_transport(struct sock_xprt *transport)
820 835
821 xs_restore_old_callbacks(transport, sk); 836 xs_restore_old_callbacks(transport, sk);
822 write_unlock_bh(&sk->sk_callback_lock); 837 write_unlock_bh(&sk->sk_callback_lock);
838 xs_sock_reset_connection_flags(xprt);
823 839
824 trace_rpc_socket_close(&transport->xprt, sock); 840 trace_rpc_socket_close(xprt, sock);
825 sock_release(sock); 841 sock_release(sock);
826} 842}
827 843
@@ -841,27 +857,12 @@ static void xs_close(struct rpc_xprt *xprt)
841 857
842 dprintk("RPC: xs_close xprt %p\n", xprt); 858 dprintk("RPC: xs_close xprt %p\n", xprt);
843 859
844 cancel_delayed_work_sync(&transport->connect_worker);
845
846 xs_reset_transport(transport); 860 xs_reset_transport(transport);
847 xprt->reestablish_timeout = 0; 861 xprt->reestablish_timeout = 0;
848 862
849 smp_mb__before_atomic();
850 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
851 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
852 clear_bit(XPRT_CLOSING, &xprt->state);
853 smp_mb__after_atomic();
854 xprt_disconnect_done(xprt); 863 xprt_disconnect_done(xprt);
855} 864}
856 865
857static void xs_tcp_close(struct rpc_xprt *xprt)
858{
859 if (test_and_clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state))
860 xs_close(xprt);
861 else
862 xs_tcp_shutdown(xprt);
863}
864
865static void xs_xprt_free(struct rpc_xprt *xprt) 866static void xs_xprt_free(struct rpc_xprt *xprt)
866{ 867{
867 xs_free_peer_addresses(xprt); 868 xs_free_peer_addresses(xprt);
@@ -1032,7 +1033,6 @@ static void xs_udp_data_ready(struct sock *sk)
1032 */ 1033 */
1033static void xs_tcp_force_close(struct rpc_xprt *xprt) 1034static void xs_tcp_force_close(struct rpc_xprt *xprt)
1034{ 1035{
1035 set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
1036 xprt_force_disconnect(xprt); 1036 xprt_force_disconnect(xprt);
1037} 1037}
1038 1038
@@ -1425,54 +1425,6 @@ out:
1425 read_unlock_bh(&sk->sk_callback_lock); 1425 read_unlock_bh(&sk->sk_callback_lock);
1426} 1426}
1427 1427
1428/*
1429 * Do the equivalent of linger/linger2 handling for dealing with
1430 * broken servers that don't close the socket in a timely
1431 * fashion
1432 */
1433static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt,
1434 unsigned long timeout)
1435{
1436 struct sock_xprt *transport;
1437
1438 if (xprt_test_and_set_connecting(xprt))
1439 return;
1440 set_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1441 transport = container_of(xprt, struct sock_xprt, xprt);
1442 queue_delayed_work(rpciod_workqueue, &transport->connect_worker,
1443 timeout);
1444}
1445
1446static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt)
1447{
1448 struct sock_xprt *transport;
1449
1450 transport = container_of(xprt, struct sock_xprt, xprt);
1451
1452 if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) ||
1453 !cancel_delayed_work(&transport->connect_worker))
1454 return;
1455 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1456 xprt_clear_connecting(xprt);
1457}
1458
1459static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
1460{
1461 smp_mb__before_atomic();
1462 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1463 clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
1464 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
1465 clear_bit(XPRT_CLOSING, &xprt->state);
1466 smp_mb__after_atomic();
1467}
1468
1469static void xs_sock_mark_closed(struct rpc_xprt *xprt)
1470{
1471 xs_sock_reset_connection_flags(xprt);
1472 /* Mark transport as closed and wake up all pending tasks */
1473 xprt_disconnect_done(xprt);
1474}
1475
1476/** 1428/**
1477 * xs_tcp_state_change - callback to handle TCP socket state changes 1429 * xs_tcp_state_change - callback to handle TCP socket state changes
1478 * @sk: socket whose state has changed 1430 * @sk: socket whose state has changed
@@ -1521,7 +1473,6 @@ static void xs_tcp_state_change(struct sock *sk)
1521 clear_bit(XPRT_CONNECTED, &xprt->state); 1473 clear_bit(XPRT_CONNECTED, &xprt->state);
1522 clear_bit(XPRT_CLOSE_WAIT, &xprt->state); 1474 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
1523 smp_mb__after_atomic(); 1475 smp_mb__after_atomic();
1524 xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
1525 break; 1476 break;
1526 case TCP_CLOSE_WAIT: 1477 case TCP_CLOSE_WAIT:
1527 /* The server initiated a shutdown of the socket */ 1478 /* The server initiated a shutdown of the socket */
@@ -1538,13 +1489,11 @@ static void xs_tcp_state_change(struct sock *sk)
1538 break; 1489 break;
1539 case TCP_LAST_ACK: 1490 case TCP_LAST_ACK:
1540 set_bit(XPRT_CLOSING, &xprt->state); 1491 set_bit(XPRT_CLOSING, &xprt->state);
1541 xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
1542 smp_mb__before_atomic(); 1492 smp_mb__before_atomic();
1543 clear_bit(XPRT_CONNECTED, &xprt->state); 1493 clear_bit(XPRT_CONNECTED, &xprt->state);
1544 smp_mb__after_atomic(); 1494 smp_mb__after_atomic();
1545 break; 1495 break;
1546 case TCP_CLOSE: 1496 case TCP_CLOSE:
1547 xs_tcp_cancel_linger_timeout(xprt);
1548 xs_sock_mark_closed(xprt); 1497 xs_sock_mark_closed(xprt);
1549 } 1498 }
1550 out: 1499 out:
@@ -1667,6 +1616,40 @@ static unsigned short xs_get_random_port(void)
1667} 1616}
1668 1617
1669/** 1618/**
1619 * xs_set_reuseaddr_port - set the socket's port and address reuse options
1620 * @sock: socket
1621 *
1622 * Note that this function has to be called on all sockets that share the
1623 * same port, and it must be called before binding.
1624 */
1625static void xs_sock_set_reuseport(struct socket *sock)
1626{
1627 int opt = 1;
1628
1629 kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
1630 (char *)&opt, sizeof(opt));
1631}
1632
1633static unsigned short xs_sock_getport(struct socket *sock)
1634{
1635 struct sockaddr_storage buf;
1636 int buflen;
1637 unsigned short port = 0;
1638
1639 if (kernel_getsockname(sock, (struct sockaddr *)&buf, &buflen) < 0)
1640 goto out;
1641 switch (buf.ss_family) {
1642 case AF_INET6:
1643 port = ntohs(((struct sockaddr_in6 *)&buf)->sin6_port);
1644 break;
1645 case AF_INET:
1646 port = ntohs(((struct sockaddr_in *)&buf)->sin_port);
1647 }
1648out:
1649 return port;
1650}
1651
1652/**
1670 * xs_set_port - reset the port number in the remote endpoint address 1653 * xs_set_port - reset the port number in the remote endpoint address
1671 * @xprt: generic transport 1654 * @xprt: generic transport
1672 * @port: new port number 1655 * @port: new port number
@@ -1680,6 +1663,12 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
1680 xs_update_peer_port(xprt); 1663 xs_update_peer_port(xprt);
1681} 1664}
1682 1665
1666static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
1667{
1668 if (transport->srcport == 0)
1669 transport->srcport = xs_sock_getport(sock);
1670}
1671
1683static unsigned short xs_get_srcport(struct sock_xprt *transport) 1672static unsigned short xs_get_srcport(struct sock_xprt *transport)
1684{ 1673{
1685 unsigned short port = transport->srcport; 1674 unsigned short port = transport->srcport;
@@ -1833,7 +1822,8 @@ static void xs_dummy_setup_socket(struct work_struct *work)
1833} 1822}
1834 1823
1835static struct socket *xs_create_sock(struct rpc_xprt *xprt, 1824static struct socket *xs_create_sock(struct rpc_xprt *xprt,
1836 struct sock_xprt *transport, int family, int type, int protocol) 1825 struct sock_xprt *transport, int family, int type,
1826 int protocol, bool reuseport)
1837{ 1827{
1838 struct socket *sock; 1828 struct socket *sock;
1839 int err; 1829 int err;
@@ -1846,6 +1836,9 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt,
1846 } 1836 }
1847 xs_reclassify_socket(family, sock); 1837 xs_reclassify_socket(family, sock);
1848 1838
1839 if (reuseport)
1840 xs_sock_set_reuseport(sock);
1841
1849 err = xs_bind(transport, sock); 1842 err = xs_bind(transport, sock);
1850 if (err) { 1843 if (err) {
1851 sock_release(sock); 1844 sock_release(sock);
@@ -1903,7 +1896,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
1903 struct socket *sock; 1896 struct socket *sock;
1904 int status = -EIO; 1897 int status = -EIO;
1905 1898
1906 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
1907 status = __sock_create(xprt->xprt_net, AF_LOCAL, 1899 status = __sock_create(xprt->xprt_net, AF_LOCAL,
1908 SOCK_STREAM, 0, &sock, 1); 1900 SOCK_STREAM, 0, &sock, 1);
1909 if (status < 0) { 1901 if (status < 0) {
@@ -2044,10 +2036,9 @@ static void xs_udp_setup_socket(struct work_struct *work)
2044 struct socket *sock = transport->sock; 2036 struct socket *sock = transport->sock;
2045 int status = -EIO; 2037 int status = -EIO;
2046 2038
2047 /* Start by resetting any existing state */
2048 xs_reset_transport(transport);
2049 sock = xs_create_sock(xprt, transport, 2039 sock = xs_create_sock(xprt, transport,
2050 xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP); 2040 xs_addr(xprt)->sa_family, SOCK_DGRAM,
2041 IPPROTO_UDP, false);
2051 if (IS_ERR(sock)) 2042 if (IS_ERR(sock))
2052 goto out; 2043 goto out;
2053 2044
@@ -2061,61 +2052,11 @@ static void xs_udp_setup_socket(struct work_struct *work)
2061 trace_rpc_socket_connect(xprt, sock, 0); 2052 trace_rpc_socket_connect(xprt, sock, 0);
2062 status = 0; 2053 status = 0;
2063out: 2054out:
2055 xprt_unlock_connect(xprt, transport);
2064 xprt_clear_connecting(xprt); 2056 xprt_clear_connecting(xprt);
2065 xprt_wake_pending_tasks(xprt, status); 2057 xprt_wake_pending_tasks(xprt, status);
2066} 2058}
2067 2059
2068/*
2069 * We need to preserve the port number so the reply cache on the server can
2070 * find our cached RPC replies when we get around to reconnecting.
2071 */
2072static void xs_abort_connection(struct sock_xprt *transport)
2073{
2074 int result;
2075 struct sockaddr any;
2076
2077 dprintk("RPC: disconnecting xprt %p to reuse port\n", transport);
2078
2079 /*
2080 * Disconnect the transport socket by doing a connect operation
2081 * with AF_UNSPEC. This should return immediately...
2082 */
2083 memset(&any, 0, sizeof(any));
2084 any.sa_family = AF_UNSPEC;
2085 result = kernel_connect(transport->sock, &any, sizeof(any), 0);
2086 trace_rpc_socket_reset_connection(&transport->xprt,
2087 transport->sock, result);
2088 if (!result)
2089 xs_sock_reset_connection_flags(&transport->xprt);
2090 dprintk("RPC: AF_UNSPEC connect return code %d\n", result);
2091}
2092
2093static void xs_tcp_reuse_connection(struct sock_xprt *transport)
2094{
2095 unsigned int state = transport->inet->sk_state;
2096
2097 if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) {
2098 /* we don't need to abort the connection if the socket
2099 * hasn't undergone a shutdown
2100 */
2101 if (transport->inet->sk_shutdown == 0)
2102 return;
2103 dprintk("RPC: %s: TCP_CLOSEd and sk_shutdown set to %d\n",
2104 __func__, transport->inet->sk_shutdown);
2105 }
2106 if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) {
2107 /* we don't need to abort the connection if the socket
2108 * hasn't undergone a shutdown
2109 */
2110 if (transport->inet->sk_shutdown == 0)
2111 return;
2112 dprintk("RPC: %s: ESTABLISHED/SYN_SENT "
2113 "sk_shutdown set to %d\n",
2114 __func__, transport->inet->sk_shutdown);
2115 }
2116 xs_abort_connection(transport);
2117}
2118
2119static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 2060static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2120{ 2061{
2121 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 2062 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2149,9 +2090,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2149 sk->sk_allocation = GFP_ATOMIC; 2090 sk->sk_allocation = GFP_ATOMIC;
2150 2091
2151 /* socket options */ 2092 /* socket options */
2152 sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
2153 sock_reset_flag(sk, SOCK_LINGER); 2093 sock_reset_flag(sk, SOCK_LINGER);
2154 tcp_sk(sk)->linger2 = 0;
2155 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; 2094 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
2156 2095
2157 xprt_clear_connected(xprt); 2096 xprt_clear_connected(xprt);
@@ -2174,6 +2113,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2174 ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); 2113 ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
2175 switch (ret) { 2114 switch (ret) {
2176 case 0: 2115 case 0:
2116 xs_set_srcport(transport, sock);
2177 case -EINPROGRESS: 2117 case -EINPROGRESS:
2178 /* SYN_SENT! */ 2118 /* SYN_SENT! */
2179 if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) 2119 if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
@@ -2200,25 +2140,13 @@ static void xs_tcp_setup_socket(struct work_struct *work)
2200 int status = -EIO; 2140 int status = -EIO;
2201 2141
2202 if (!sock) { 2142 if (!sock) {
2203 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
2204 sock = xs_create_sock(xprt, transport, 2143 sock = xs_create_sock(xprt, transport,
2205 xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP); 2144 xs_addr(xprt)->sa_family, SOCK_STREAM,
2145 IPPROTO_TCP, true);
2206 if (IS_ERR(sock)) { 2146 if (IS_ERR(sock)) {
2207 status = PTR_ERR(sock); 2147 status = PTR_ERR(sock);
2208 goto out; 2148 goto out;
2209 } 2149 }
2210 } else {
2211 int abort_and_exit;
2212
2213 abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
2214 &xprt->state);
2215 /* "close" the socket, preserving the local port */
2216 set_bit(XPRT_CONNECTION_REUSE, &xprt->state);
2217 xs_tcp_reuse_connection(transport);
2218 clear_bit(XPRT_CONNECTION_REUSE, &xprt->state);
2219
2220 if (abort_and_exit)
2221 goto out_eagain;
2222 } 2150 }
2223 2151
2224 dprintk("RPC: worker connecting xprt %p via %s to " 2152 dprintk("RPC: worker connecting xprt %p via %s to "
@@ -2245,6 +2173,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
2245 case 0: 2173 case 0:
2246 case -EINPROGRESS: 2174 case -EINPROGRESS:
2247 case -EALREADY: 2175 case -EALREADY:
2176 xprt_unlock_connect(xprt, transport);
2248 xprt_clear_connecting(xprt); 2177 xprt_clear_connecting(xprt);
2249 return; 2178 return;
2250 case -EINVAL: 2179 case -EINVAL:
@@ -2254,13 +2183,15 @@ static void xs_tcp_setup_socket(struct work_struct *work)
2254 case -ECONNREFUSED: 2183 case -ECONNREFUSED:
2255 case -ECONNRESET: 2184 case -ECONNRESET:
2256 case -ENETUNREACH: 2185 case -ENETUNREACH:
2186 case -EADDRINUSE:
2257 case -ENOBUFS: 2187 case -ENOBUFS:
2258 /* retry with existing socket, after a delay */ 2188 /* retry with existing socket, after a delay */
2189 xs_tcp_force_close(xprt);
2259 goto out; 2190 goto out;
2260 } 2191 }
2261out_eagain:
2262 status = -EAGAIN; 2192 status = -EAGAIN;
2263out: 2193out:
2194 xprt_unlock_connect(xprt, transport);
2264 xprt_clear_connecting(xprt); 2195 xprt_clear_connecting(xprt);
2265 xprt_wake_pending_tasks(xprt, status); 2196 xprt_wake_pending_tasks(xprt, status);
2266} 2197}
@@ -2283,6 +2214,11 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
2283{ 2214{
2284 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 2215 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
2285 2216
2217 WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
2218
2219 /* Start by resetting any existing state */
2220 xs_reset_transport(transport);
2221
2286 if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { 2222 if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
2287 dprintk("RPC: xs_connect delayed xprt %p for %lu " 2223 dprintk("RPC: xs_connect delayed xprt %p for %lu "
2288 "seconds\n", 2224 "seconds\n",
@@ -2559,7 +2495,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
2559 .buf_free = rpc_free, 2495 .buf_free = rpc_free,
2560 .send_request = xs_tcp_send_request, 2496 .send_request = xs_tcp_send_request,
2561 .set_retrans_timeout = xprt_set_retrans_timeout_def, 2497 .set_retrans_timeout = xprt_set_retrans_timeout_def,
2562 .close = xs_tcp_close, 2498 .close = xs_tcp_shutdown,
2563 .destroy = xs_destroy, 2499 .destroy = xs_destroy,
2564 .print_stats = xs_tcp_print_stats, 2500 .print_stats = xs_tcp_print_stats,
2565}; 2501};