about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-07 17:02:24 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-07 17:02:24 -0400
commit4e4adb2f462889b9eac736dd06d60658beb091b6 (patch)
tree3582dab57d97bbb30add005b3b2f8a8d8412121e /fs
parent77a78806c7df8d414c33031a1ca5121876910c4f (diff)
parent5445b1fbd123420bffed5e629a420aa2a16bf849 (diff)
Merge tag 'nfs-for-4.3-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust: "Highlights include: Stable patches: - Fix atomicity of pNFS commit list updates - Fix NFSv4 handling of open(O_CREAT|O_EXCL|O_RDONLY) - nfs_set_pgio_error sometimes misses errors - Fix a thinko in xs_connect() - Fix borkage in _same_data_server_addrs_locked() - Fix a NULL pointer dereference of migration recovery ops for v4.2 client - Don't let the ctime override attribute barriers. - Revert "NFSv4: Remove incorrect check in can_open_delegated()" - Ensure flexfiles pNFS driver updates the inode after write finishes - flexfiles must not pollute the attribute cache with attributes from the DS - Fix a protocol error in layoutreturn - Fix a protocol issue with NFSv4.1 CLOSE stateids Bugfixes + cleanups - pNFS blocks bugfixes from Christoph - Various cleanups from Anna - More fixes for delegation corner cases - Don't fsync twice for O_SYNC/IS_SYNC files - Fix pNFS and flexfiles layoutstats bugs - pnfs/flexfiles: avoid duplicate tracking of mirror data - pnfs: Fix layoutget/layoutreturn/return-on-close serialisation issues - pnfs/flexfiles: error handling retries a layoutget before fallback to MDS Features: - Full support for the OPEN NFS4_CREATE_EXCLUSIVE4_1 mode from Kinglong - More RDMA client transport improvements from Chuck - Removal of the deprecated ib_reg_phys_mr() and ib_rereg_phys_mr() verbs from the SUNRPC, Lustre and core infiniband tree. 
- Optimise away the close-to-open getattr if there is no cached data" * tag 'nfs-for-4.3-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (108 commits) NFSv4: Respect the server imposed limit on how many changes we may cache NFSv4: Express delegation limit in units of pages Revert "NFS: Make close(2) asynchronous when closing NFS O_DIRECT files" NFS: Optimise away the close-to-open getattr if there is no cached data NFSv4.1/flexfiles: Clean up ff_layout_write_done_cb/ff_layout_commit_done_cb NFSv4.1/flexfiles: Mark the layout for return in ff_layout_io_track_ds_error() nfs: Remove unneeded checking of the return value from scnprintf nfs: Fix truncated client owner id without proto type NFSv4.1/flexfiles: Mark layout for return if the mirrors are invalid NFSv4.1/flexfiles: RW layouts are valid only if all mirrors are valid NFSv4.1/flexfiles: Fix incorrect usage of pnfs_generic_mark_devid_invalid() NFSv4.1/flexfiles: Fix freeing of mirrors NFSv4.1/pNFS: Don't request a minimal read layout beyond the end of file NFSv4.1/pnfs: Handle LAYOUTGET return values correctly NFSv4.1/pnfs: Don't ask for a read layout for an empty file. NFSv4.1: Fix a protocol issue with CLOSE stateids NFSv4.1/flexfiles: Don't mark the entire deviceid as bad for file errors SUNRPC: Prevent SYN+SYNACK+RST storms SUNRPC: xs_reset_transport must mark the connection as disconnected NFSv4.1/pnfs: Ensure layoutreturn reserves space for the opaque payload ...
Diffstat (limited to 'fs')
-rw-r--r--fs/nfs/blocklayout/blocklayout.h19
-rw-r--r--fs/nfs/blocklayout/dev.c9
-rw-r--r--fs/nfs/blocklayout/extent_tree.c19
-rw-r--r--fs/nfs/callback.c4
-rw-r--r--fs/nfs/callback_proc.c9
-rw-r--r--fs/nfs/client.c113
-rw-r--r--fs/nfs/delegation.c29
-rw-r--r--fs/nfs/delegation.h3
-rw-r--r--fs/nfs/dir.c20
-rw-r--r--fs/nfs/file.c21
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c424
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h5
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c82
-rw-r--r--fs/nfs/inode.c61
-rw-r--r--fs/nfs/internal.h20
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs42.h2
-rw-r--r--fs/nfs/nfs42xdr.c5
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c32
-rw-r--r--fs/nfs/nfs4idmap.c14
-rw-r--r--fs/nfs/nfs4proc.c136
-rw-r--r--fs/nfs/nfs4state.c12
-rw-r--r--fs/nfs/nfs4trace.h61
-rw-r--r--fs/nfs/nfs4xdr.c75
-rw-r--r--fs/nfs/pagelist.c4
-rw-r--r--fs/nfs/pnfs.c227
-rw-r--r--fs/nfs/pnfs.h48
-rw-r--r--fs/nfs/pnfs_nfs.c88
-rw-r--r--fs/nfs/super.c7
-rw-r--r--fs/nfs/write.c36
-rw-r--r--fs/nfsd/blocklayoutxdr.c2
-rw-r--r--fs/nfsd/blocklayoutxdr.h15
34 files changed, 967 insertions, 645 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 92dca9e90d8d..c556640dcf3b 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -46,13 +46,6 @@
46 46
47struct pnfs_block_dev; 47struct pnfs_block_dev;
48 48
49enum pnfs_block_volume_type {
50 PNFS_BLOCK_VOLUME_SIMPLE = 0,
51 PNFS_BLOCK_VOLUME_SLICE = 1,
52 PNFS_BLOCK_VOLUME_CONCAT = 2,
53 PNFS_BLOCK_VOLUME_STRIPE = 3,
54};
55
56#define PNFS_BLOCK_MAX_UUIDS 4 49#define PNFS_BLOCK_MAX_UUIDS 4
57#define PNFS_BLOCK_MAX_DEVICES 64 50#define PNFS_BLOCK_MAX_DEVICES 64
58 51
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
117 struct pnfs_block_dev_map *map); 110 struct pnfs_block_dev_map *map);
118}; 111};
119 112
120enum exstate4 {
121 PNFS_BLOCK_READWRITE_DATA = 0,
122 PNFS_BLOCK_READ_DATA = 1,
123 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
124 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
125};
126
127/* sector_t fields are all in 512-byte sectors */ 113/* sector_t fields are all in 512-byte sectors */
128struct pnfs_block_extent { 114struct pnfs_block_extent {
129 union { 115 union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
134 sector_t be_f_offset; /* the starting offset in the file */ 120 sector_t be_f_offset; /* the starting offset in the file */
135 sector_t be_length; /* the size of the extent */ 121 sector_t be_length; /* the size of the extent */
136 sector_t be_v_offset; /* the starting offset in the volume */ 122 sector_t be_v_offset; /* the starting offset in the volume */
137 enum exstate4 be_state; /* the state of this extent */ 123 enum pnfs_block_extent_state be_state; /* the state of this extent */
138#define EXTENT_WRITTEN 1 124#define EXTENT_WRITTEN 1
139#define EXTENT_COMMITTING 2 125#define EXTENT_COMMITTING 2
140 unsigned int be_tag; 126 unsigned int be_tag;
141}; 127};
142 128
143/* on the wire size of the extent */
144#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
145
146struct pnfs_block_layout { 129struct pnfs_block_layout {
147 struct pnfs_layout_hdr bl_layout; 130 struct pnfs_layout_hdr bl_layout;
148 struct rb_root bl_ext_rw; 131 struct rb_root bl_ext_rw;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e535599a0719..a861bbdfe577 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev)
22 kfree(dev->children); 22 kfree(dev->children);
23 } else { 23 } else {
24 if (dev->bdev) 24 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ); 25 blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
26 } 26 }
27} 27}
28 28
@@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
65 return -EIO; 65 return -EIO;
66 p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); 66 p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
67 b->simple.sigs[i].sig_len = be32_to_cpup(p++); 67 b->simple.sigs[i].sig_len = be32_to_cpup(p++);
68 if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
69 pr_info("signature too long: %d\n",
70 b->simple.sigs[i].sig_len);
71 return -EIO;
72 }
68 73
69 p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); 74 p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
70 if (!p) 75 if (!p)
@@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
195 if (!dev) 200 if (!dev)
196 return -EIO; 201 return -EIO;
197 202
198 d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); 203 d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
199 if (IS_ERR(d->bdev)) { 204 if (IS_ERR(d->bdev)) {
200 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", 205 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
201 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); 206 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 31d0b5e53dfd..c59a59c37f3d 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -462,6 +462,12 @@ out:
462 return err; 462 return err;
463} 463}
464 464
465static size_t ext_tree_layoutupdate_size(size_t count)
466{
467 return sizeof(__be32) /* number of entries */ +
468 PNFS_BLOCK_EXTENT_SIZE * count;
469}
470
465static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, 471static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
466 size_t buffer_size) 472 size_t buffer_size)
467{ 473{
@@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
489 continue; 495 continue;
490 496
491 (*count)++; 497 (*count)++;
492 if (*count * BL_EXTENT_SIZE > buffer_size) { 498 if (ext_tree_layoutupdate_size(*count) > buffer_size) {
493 /* keep counting.. */ 499 /* keep counting.. */
494 ret = -ENOSPC; 500 ret = -ENOSPC;
495 continue; 501 continue;
@@ -530,7 +536,7 @@ retry:
530 if (unlikely(ret)) { 536 if (unlikely(ret)) {
531 ext_tree_free_commitdata(arg, buffer_size); 537 ext_tree_free_commitdata(arg, buffer_size);
532 538
533 buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; 539 buffer_size = ext_tree_layoutupdate_size(count);
534 count = 0; 540 count = 0;
535 541
536 arg->layoutupdate_pages = 542 arg->layoutupdate_pages =
@@ -549,17 +555,14 @@ retry:
549 } 555 }
550 556
551 *start_p = cpu_to_be32(count); 557 *start_p = cpu_to_be32(count);
552 arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; 558 arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
553 559
554 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { 560 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
555 __be32 *p = start_p; 561 void *p = start_p, *end = p + arg->layoutupdate_len;
556 int i = 0; 562 int i = 0;
557 563
558 for (p = start_p; 564 for ( ; p < end; p += PAGE_SIZE)
559 p < start_p + arg->layoutupdate_len;
560 p += PAGE_SIZE) {
561 arg->layoutupdate_pages[i++] = vmalloc_to_page(p); 565 arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
562 }
563 } 566 }
564 567
565 dprintk("%s found %zu ranges\n", __func__, count); 568 dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 2c4a0b565d28..75f7c0a7538a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv)
162 spin_lock_init(&serv->sv_cb_lock); 162 spin_lock_init(&serv->sv_cb_lock);
163 init_waitqueue_head(&serv->sv_cb_waitq); 163 init_waitqueue_head(&serv->sv_cb_waitq);
164 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); 164 rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
165 if (IS_ERR(rqstp)) {
166 svc_xprt_put(serv->sv_bc_xprt);
167 serv->sv_bc_xprt = NULL;
168 }
169 dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); 165 dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
170 return rqstp; 166 return rqstp;
171} 167}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 29e3c1b011b7..b85cf7a30232 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
40 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 40 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
41 41
42 inode = nfs_delegation_find_inode(cps->clp, &args->fh); 42 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
43 if (inode == NULL) 43 if (inode == NULL) {
44 trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
45 -ntohl(res->status));
44 goto out; 46 goto out;
47 }
45 nfsi = NFS_I(inode); 48 nfsi = NFS_I(inode);
46 rcu_read_lock(); 49 rcu_read_lock();
47 delegation = rcu_dereference(nfsi->delegation); 50 delegation = rcu_dereference(nfsi->delegation);
@@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
60 res->status = 0; 63 res->status = 0;
61out_iput: 64out_iput:
62 rcu_read_unlock(); 65 rcu_read_unlock();
66 trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
63 iput(inode); 67 iput(inode);
64out: 68out:
65 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); 69 dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
@@ -194,6 +198,7 @@ unlock:
194 spin_unlock(&ino->i_lock); 198 spin_unlock(&ino->i_lock);
195 pnfs_free_lseg_list(&free_me_list); 199 pnfs_free_lseg_list(&free_me_list);
196 pnfs_put_layout_hdr(lo); 200 pnfs_put_layout_hdr(lo);
201 trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
197 iput(ino); 202 iput(ino);
198out: 203out:
199 return rv; 204 return rv;
@@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
554 status = htonl(NFS4_OK); 559 status = htonl(NFS4_OK);
555 560
556 nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); 561 nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
557 nfs41_server_notify_target_slotid_update(cps->clp); 562 nfs41_notify_server(cps->clp);
558out: 563out:
559 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 564 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
560 return status; 565 return status;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4a90c9bb3135..57c5a02f6213 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -20,6 +20,7 @@
20#include <linux/stat.h> 20#include <linux/stat.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/unistd.h> 22#include <linux/unistd.h>
23#include <linux/sunrpc/addr.h>
23#include <linux/sunrpc/clnt.h> 24#include <linux/sunrpc/clnt.h>
24#include <linux/sunrpc/stats.h> 25#include <linux/sunrpc/stats.h>
25#include <linux/sunrpc/metrics.h> 26#include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp)
285} 286}
286EXPORT_SYMBOL_GPL(nfs_put_client); 287EXPORT_SYMBOL_GPL(nfs_put_client);
287 288
288#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
289/*
290 * Test if two ip6 socket addresses refer to the same socket by
291 * comparing relevant fields. The padding bytes specifically, are not
292 * compared. sin6_flowinfo is not compared because it only affects QoS
293 * and sin6_scope_id is only compared if the address is "link local"
294 * because "link local" addresses need only be unique to a specific
295 * link. Conversely, ordinary unicast addresses might have different
296 * sin6_scope_id.
297 *
298 * The caller should ensure both socket addresses are AF_INET6.
299 */
300static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
301 const struct sockaddr *sa2)
302{
303 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
304 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
305
306 if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
307 return 0;
308 else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
309 return sin1->sin6_scope_id == sin2->sin6_scope_id;
310
311 return 1;
312}
313#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
314static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
315 const struct sockaddr *sa2)
316{
317 return 0;
318}
319#endif
320
321/*
322 * Test if two ip4 socket addresses refer to the same socket, by
323 * comparing relevant fields. The padding bytes specifically, are
324 * not compared.
325 *
326 * The caller should ensure both socket addresses are AF_INET.
327 */
328static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
329 const struct sockaddr *sa2)
330{
331 const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
332 const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
333
334 return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
335}
336
337static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
338 const struct sockaddr *sa2)
339{
340 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
341 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
342
343 return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
344 (sin1->sin6_port == sin2->sin6_port);
345}
346
347static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
348 const struct sockaddr *sa2)
349{
350 const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
351 const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
352
353 return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
354 (sin1->sin_port == sin2->sin_port);
355}
356
357#if defined(CONFIG_NFS_V4_1)
358/*
359 * Test if two socket addresses represent the same actual socket,
360 * by comparing (only) relevant fields, excluding the port number.
361 */
362int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
363 const struct sockaddr *sa2)
364{
365 if (sa1->sa_family != sa2->sa_family)
366 return 0;
367
368 switch (sa1->sa_family) {
369 case AF_INET:
370 return nfs_sockaddr_match_ipaddr4(sa1, sa2);
371 case AF_INET6:
372 return nfs_sockaddr_match_ipaddr6(sa1, sa2);
373 }
374 return 0;
375}
376EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
377#endif /* CONFIG_NFS_V4_1 */
378
379/*
380 * Test if two socket addresses represent the same actual socket,
381 * by comparing (only) relevant fields, including the port number.
382 */
383static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
384 const struct sockaddr *sa2)
385{
386 if (sa1->sa_family != sa2->sa_family)
387 return 0;
388
389 switch (sa1->sa_family) {
390 case AF_INET:
391 return nfs_sockaddr_cmp_ip4(sa1, sa2);
392 case AF_INET6:
393 return nfs_sockaddr_cmp_ip6(sa1, sa2);
394 }
395 return 0;
396}
397
398/* 289/*
399 * Find an nfs_client on the list that matches the initialisation data 290 * Find an nfs_client on the list that matches the initialisation data
400 * that is supplied. 291 * that is supplied.
@@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
421 if (clp->cl_minorversion != data->minorversion) 312 if (clp->cl_minorversion != data->minorversion)
422 continue; 313 continue;
423 /* Match the full socket address */ 314 /* Match the full socket address */
424 if (!nfs_sockaddr_cmp(sap, clap)) 315 if (!rpc_cmp_addr_port(sap, clap))
425 continue; 316 continue;
426 317
427 atomic_inc(&clp->cl_count); 318 atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 029d688a969f..2714ef835bdd 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
175 if (delegation->inode != NULL) { 175 if (delegation->inode != NULL) {
176 nfs4_stateid_copy(&delegation->stateid, &res->delegation); 176 nfs4_stateid_copy(&delegation->stateid, &res->delegation);
177 delegation->type = res->delegation_type; 177 delegation->type = res->delegation_type;
178 delegation->maxsize = res->maxsize; 178 delegation->pagemod_limit = res->pagemod_limit;
179 oldcred = delegation->cred; 179 oldcred = delegation->cred;
180 delegation->cred = get_rpccred(cred); 180 delegation->cred = get_rpccred(cred);
181 clear_bit(NFS_DELEGATION_NEED_RECLAIM, 181 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
337 return -ENOMEM; 337 return -ENOMEM;
338 nfs4_stateid_copy(&delegation->stateid, &res->delegation); 338 nfs4_stateid_copy(&delegation->stateid, &res->delegation);
339 delegation->type = res->delegation_type; 339 delegation->type = res->delegation_type;
340 delegation->maxsize = res->maxsize; 340 delegation->pagemod_limit = res->pagemod_limit;
341 delegation->change_attr = inode->i_version; 341 delegation->change_attr = inode->i_version;
342 delegation->cred = get_rpccred(cred); 342 delegation->cred = get_rpccred(cred);
343 delegation->inode = inode; 343 delegation->inode = inode;
@@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
900 rcu_read_unlock(); 900 rcu_read_unlock();
901 return ret; 901 return ret;
902} 902}
903
904/**
905 * nfs4_delegation_flush_on_close - Check if we must flush file on close
906 * @inode: inode to check
907 *
908 * This function checks the number of outstanding writes to the file
909 * against the delegation 'space_limit' field to see if
910 * the spec requires us to flush the file on close.
911 */
912bool nfs4_delegation_flush_on_close(const struct inode *inode)
913{
914 struct nfs_inode *nfsi = NFS_I(inode);
915 struct nfs_delegation *delegation;
916 bool ret = true;
917
918 rcu_read_lock();
919 delegation = rcu_dereference(nfsi->delegation);
920 if (delegation == NULL || !(delegation->type & FMODE_WRITE))
921 goto out;
922 if (nfsi->nrequests < delegation->pagemod_limit)
923 ret = false;
924out:
925 rcu_read_unlock();
926 return ret;
927}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index e3c20a3ccc93..a44829173e57 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -18,7 +18,7 @@ struct nfs_delegation {
18 struct inode *inode; 18 struct inode *inode;
19 nfs4_stateid stateid; 19 nfs4_stateid stateid;
20 fmode_t type; 20 fmode_t type;
21 loff_t maxsize; 21 unsigned long pagemod_limit;
22 __u64 change_attr; 22 __u64 change_attr;
23 unsigned long flags; 23 unsigned long flags;
24 spinlock_t lock; 24 spinlock_t lock;
@@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
61void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); 61void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
62int nfs4_have_delegation(struct inode *inode, fmode_t flags); 62int nfs4_have_delegation(struct inode *inode, fmode_t flags);
63int nfs4_check_delegation(struct inode *inode, fmode_t flags); 63int nfs4_check_delegation(struct inode *inode, fmode_t flags);
64bool nfs4_delegation_flush_on_close(const struct inode *inode);
64 65
65#endif 66#endif
66 67
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 547308a5ec6f..3d8e4ffa0a33 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -583,26 +583,19 @@ out_nopages:
583} 583}
584 584
585static 585static
586void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) 586void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
587{ 587{
588 unsigned int i; 588 unsigned int i;
589 for (i = 0; i < npages; i++) 589 for (i = 0; i < npages; i++)
590 put_page(pages[i]); 590 put_page(pages[i]);
591} 591}
592 592
593static
594void nfs_readdir_free_large_page(void *ptr, struct page **pages,
595 unsigned int npages)
596{
597 nfs_readdir_free_pagearray(pages, npages);
598}
599
600/* 593/*
601 * nfs_readdir_large_page will allocate pages that must be freed with a call 594 * nfs_readdir_large_page will allocate pages that must be freed with a call
602 * to nfs_readdir_free_large_page 595 * to nfs_readdir_free_pagearray
603 */ 596 */
604static 597static
605int nfs_readdir_large_page(struct page **pages, unsigned int npages) 598int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
606{ 599{
607 unsigned int i; 600 unsigned int i;
608 601
@@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages)
615 return 0; 608 return 0;
616 609
617out_freepages: 610out_freepages:
618 nfs_readdir_free_pagearray(pages, i); 611 nfs_readdir_free_pages(pages, i);
619 return -ENOMEM; 612 return -ENOMEM;
620} 613}
621 614
@@ -623,7 +616,6 @@ static
623int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) 616int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
624{ 617{
625 struct page *pages[NFS_MAX_READDIR_PAGES]; 618 struct page *pages[NFS_MAX_READDIR_PAGES];
626 void *pages_ptr = NULL;
627 struct nfs_entry entry; 619 struct nfs_entry entry;
628 struct file *file = desc->file; 620 struct file *file = desc->file;
629 struct nfs_cache_array *array; 621 struct nfs_cache_array *array;
@@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
653 memset(array, 0, sizeof(struct nfs_cache_array)); 645 memset(array, 0, sizeof(struct nfs_cache_array));
654 array->eof_index = -1; 646 array->eof_index = -1;
655 647
656 status = nfs_readdir_large_page(pages, array_size); 648 status = nfs_readdir_alloc_pages(pages, array_size);
657 if (status < 0) 649 if (status < 0)
658 goto out_release_array; 650 goto out_release_array;
659 do { 651 do {
@@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
671 } 663 }
672 } while (array->eof_index < 0); 664 } while (array->eof_index < 0);
673 665
674 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 666 nfs_readdir_free_pages(pages, array_size);
675out_release_array: 667out_release_array:
676 nfs_readdir_release_array(page); 668 nfs_readdir_release_array(page);
677out_label_free: 669out_label_free:
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc4fa1ed61fc..c0f9b1ed12b9 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
82 dprintk("NFS: release(%pD2)\n", filp); 82 dprintk("NFS: release(%pD2)\n", filp);
83 83
84 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 84 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
85 return nfs_release(inode, filp); 85 nfs_file_clear_open_context(filp);
86 return 0;
86} 87}
87EXPORT_SYMBOL_GPL(nfs_file_release); 88EXPORT_SYMBOL_GPL(nfs_file_release);
88 89
@@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
141/* 142/*
142 * Flush all dirty pages, and check for write errors. 143 * Flush all dirty pages, and check for write errors.
143 */ 144 */
144int 145static int
145nfs_file_flush(struct file *file, fl_owner_t id) 146nfs_file_flush(struct file *file, fl_owner_t id)
146{ 147{
147 struct inode *inode = file_inode(file); 148 struct inode *inode = file_inode(file);
@@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
152 if ((file->f_mode & FMODE_WRITE) == 0) 153 if ((file->f_mode & FMODE_WRITE) == 0)
153 return 0; 154 return 0;
154 155
155 /*
156 * If we're holding a write delegation, then just start the i/o
157 * but don't wait for completion (or send a commit).
158 */
159 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
160 return filemap_fdatawrite(file->f_mapping);
161
162 /* Flush writes to the server and return any errors */ 156 /* Flush writes to the server and return any errors */
163 return vfs_fsync(file, 0); 157 return vfs_fsync(file, 0);
164} 158}
165EXPORT_SYMBOL_GPL(nfs_file_flush);
166 159
167ssize_t 160ssize_t
168nfs_file_read(struct kiocb *iocb, struct iov_iter *to) 161nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
@@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
644 .page_mkwrite = nfs_vm_page_mkwrite, 637 .page_mkwrite = nfs_vm_page_mkwrite,
645}; 638};
646 639
647static int nfs_need_sync_write(struct file *filp, struct inode *inode) 640static int nfs_need_check_write(struct file *filp, struct inode *inode)
648{ 641{
649 struct nfs_open_context *ctx; 642 struct nfs_open_context *ctx;
650 643
651 if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
652 return 1;
653 ctx = nfs_file_open_context(filp); 644 ctx = nfs_file_open_context(filp);
654 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || 645 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
655 nfs_ctx_key_to_expire(ctx)) 646 nfs_ctx_key_to_expire(ctx))
@@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
699 if (result > 0) 690 if (result > 0)
700 written = result; 691 written = result;
701 692
702 /* Return error values for O_DSYNC and IS_SYNC() */ 693 /* Return error values */
703 if (result >= 0 && nfs_need_sync_write(file, inode)) { 694 if (result >= 0 && nfs_need_check_write(file, inode)) {
704 int err = vfs_fsync(file, 0); 695 int err = vfs_fsync(file, 0);
705 if (err < 0) 696 if (err < 0)
706 result = err; 697 result = err;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index b3289d701eea..fbc5a56de875 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
34 ffl = kzalloc(sizeof(*ffl), gfp_flags); 34 ffl = kzalloc(sizeof(*ffl), gfp_flags);
35 if (ffl) { 35 if (ffl) {
36 INIT_LIST_HEAD(&ffl->error_list); 36 INIT_LIST_HEAD(&ffl->error_list);
37 INIT_LIST_HEAD(&ffl->mirrors);
37 return &ffl->generic_hdr; 38 return &ffl->generic_hdr;
38 } else 39 } else
39 return NULL; 40 return NULL;
@@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id)
135 return 0; 136 return 0;
136} 137}
137 138
139static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
140 const struct nfs4_ff_layout_mirror *m2)
141{
142 int i, j;
143
144 if (m1->fh_versions_cnt != m2->fh_versions_cnt)
145 return false;
146 for (i = 0; i < m1->fh_versions_cnt; i++) {
147 bool found_fh = false;
148 for (j = 0; j < m2->fh_versions_cnt; i++) {
149 if (nfs_compare_fh(&m1->fh_versions[i],
150 &m2->fh_versions[j]) == 0) {
151 found_fh = true;
152 break;
153 }
154 }
155 if (!found_fh)
156 return false;
157 }
158 return true;
159}
160
161static struct nfs4_ff_layout_mirror *
162ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
163 struct nfs4_ff_layout_mirror *mirror)
164{
165 struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
166 struct nfs4_ff_layout_mirror *pos;
167 struct inode *inode = lo->plh_inode;
168
169 spin_lock(&inode->i_lock);
170 list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
171 if (mirror->mirror_ds != pos->mirror_ds)
172 continue;
173 if (!ff_mirror_match_fh(mirror, pos))
174 continue;
175 if (atomic_inc_not_zero(&pos->ref)) {
176 spin_unlock(&inode->i_lock);
177 return pos;
178 }
179 }
180 list_add(&mirror->mirrors, &ff_layout->mirrors);
181 mirror->layout = lo;
182 spin_unlock(&inode->i_lock);
183 return mirror;
184}
185
186static void
187ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
188{
189 struct inode *inode;
190 if (mirror->layout == NULL)
191 return;
192 inode = mirror->layout->plh_inode;
193 spin_lock(&inode->i_lock);
194 list_del(&mirror->mirrors);
195 spin_unlock(&inode->i_lock);
196 mirror->layout = NULL;
197}
198
199static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
200{
201 struct nfs4_ff_layout_mirror *mirror;
202
203 mirror = kzalloc(sizeof(*mirror), gfp_flags);
204 if (mirror != NULL) {
205 spin_lock_init(&mirror->lock);
206 atomic_set(&mirror->ref, 1);
207 INIT_LIST_HEAD(&mirror->mirrors);
208 }
209 return mirror;
210}
211
212static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
213{
214 ff_layout_remove_mirror(mirror);
215 kfree(mirror->fh_versions);
216 if (mirror->cred)
217 put_rpccred(mirror->cred);
218 nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
219 kfree(mirror);
220}
221
222static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
223{
224 if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
225 ff_layout_free_mirror(mirror);
226}
227
138static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) 228static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
139{ 229{
140 int i; 230 int i;
@@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
144 /* normally mirror_ds is freed in 234 /* normally mirror_ds is freed in
145 * .free_deviceid_node but we still do it here 235 * .free_deviceid_node but we still do it here
146 * for .alloc_lseg error path */ 236 * for .alloc_lseg error path */
147 if (fls->mirror_array[i]) { 237 ff_layout_put_mirror(fls->mirror_array[i]);
148 kfree(fls->mirror_array[i]->fh_versions);
149 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
150 kfree(fls->mirror_array[i]);
151 }
152 } 238 }
153 kfree(fls->mirror_array); 239 kfree(fls->mirror_array);
154 fls->mirror_array = NULL; 240 fls->mirror_array = NULL;
@@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
181 } 267 }
182} 268}
183 269
270static bool
271ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
272 const struct pnfs_layout_range *l2)
273{
274 u64 end1, end2;
275
276 if (l1->iomode != l2->iomode)
277 return l1->iomode != IOMODE_READ;
278 end1 = pnfs_calc_offset_end(l1->offset, l1->length);
279 end2 = pnfs_calc_offset_end(l2->offset, l2->length);
280 if (end1 < l2->offset)
281 return false;
282 if (end2 < l1->offset)
283 return true;
284 return l2->offset <= l1->offset;
285}
286
/*
 * Attempt to merge layout segment @old into @new.
 *
 * Fails (returns false) when the iomodes differ or when the two byte
 * ranges neither overlap nor abut. On success, @new's range is
 * adjusted in place: its end is extended to the later of the two ends
 * and its offset advanced to the later of the two offsets, and the
 * NFS_LSEG_ROC / NFS_LSEG_LAYOUTRETURN flags set on @old are
 * propagated to @new. Returns true when merged.
 */
static bool
ff_lseg_merge(struct pnfs_layout_segment *new,
		struct pnfs_layout_segment *old)
{
	u64 new_end, old_end;

	/* Only segments with the same iomode are merge candidates. */
	if (new->pls_range.iomode != old->pls_range.iomode)
		return false;
	old_end = pnfs_calc_offset_end(old->pls_range.offset,
			old->pls_range.length);
	/* old ends before new begins: disjoint */
	if (old_end < new->pls_range.offset)
		return false;
	new_end = pnfs_calc_offset_end(new->pls_range.offset,
			new->pls_range.length);
	/* new ends before old begins: disjoint */
	if (new_end < old->pls_range.offset)
		return false;

	/* Mergeable: copy info from 'old' to 'new' */
	if (new_end < old_end)
		new_end = old_end;
	if (new->pls_range.offset < old->pls_range.offset)
		new->pls_range.offset = old->pls_range.offset;
	new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
			new_end);
	if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
		set_bit(NFS_LSEG_ROC, &new->pls_flags);
	if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
		set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
	return true;
}
317
/*
 * ->add_lseg hook: insert @lseg into @lo using the generic pNFS
 * insertion helper, parameterised with the flexfiles ordering
 * (ff_lseg_range_is_after) and merge policy (ff_lseg_merge).
 *
 * NOTE(review): @free_me is passed through to the generic helper —
 * presumably it collects segments superseded by a merge for the caller
 * to release; confirm against pnfs_generic_layout_insert_lseg.
 */
static void
ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_segment *lseg,
		struct list_head *free_me)
{
	pnfs_generic_layout_insert_lseg(lo, lseg,
			ff_lseg_range_is_after,
			ff_lseg_merge,
			free_me);
}
328
184static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) 329static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
185{ 330{
186 int i, j; 331 int i, j;
@@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
246 goto out_err_free; 391 goto out_err_free;
247 392
248 for (i = 0; i < fls->mirror_array_cnt; i++) { 393 for (i = 0; i < fls->mirror_array_cnt; i++) {
394 struct nfs4_ff_layout_mirror *mirror;
249 struct nfs4_deviceid devid; 395 struct nfs4_deviceid devid;
250 struct nfs4_deviceid_node *idnode; 396 struct nfs4_deviceid_node *idnode;
251 u32 ds_count; 397 u32 ds_count;
@@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
262 if (ds_count != 1) 408 if (ds_count != 1)
263 goto out_err_free; 409 goto out_err_free;
264 410
265 fls->mirror_array[i] = 411 fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
266 kzalloc(sizeof(struct nfs4_ff_layout_mirror),
267 gfp_flags);
268 if (fls->mirror_array[i] == NULL) { 412 if (fls->mirror_array[i] == NULL) {
269 rc = -ENOMEM; 413 rc = -ENOMEM;
270 goto out_err_free; 414 goto out_err_free;
271 } 415 }
272 416
273 spin_lock_init(&fls->mirror_array[i]->lock);
274 fls->mirror_array[i]->ds_count = ds_count; 417 fls->mirror_array[i]->ds_count = ds_count;
275 fls->mirror_array[i]->lseg = &fls->generic_hdr;
276 418
277 /* deviceid */ 419 /* deviceid */
278 rc = decode_deviceid(&stream, &devid); 420 rc = decode_deviceid(&stream, &devid);
@@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
338 if (rc) 480 if (rc)
339 goto out_err_free; 481 goto out_err_free;
340 482
483 mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
484 if (mirror != fls->mirror_array[i]) {
485 ff_layout_free_mirror(fls->mirror_array[i]);
486 fls->mirror_array[i] = mirror;
487 }
488
341 dprintk("%s: uid %d gid %d\n", __func__, 489 dprintk("%s: uid %d gid %d\n", __func__,
342 fls->mirror_array[i]->uid, 490 fls->mirror_array[i]->uid,
343 fls->mirror_array[i]->gid); 491 fls->mirror_array[i]->gid);
@@ -379,21 +527,9 @@ static void
379ff_layout_free_lseg(struct pnfs_layout_segment *lseg) 527ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
380{ 528{
381 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); 529 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
382 int i;
383 530
384 dprintk("--> %s\n", __func__); 531 dprintk("--> %s\n", __func__);
385 532
386 for (i = 0; i < fls->mirror_array_cnt; i++) {
387 if (fls->mirror_array[i]) {
388 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
389 fls->mirror_array[i]->mirror_ds = NULL;
390 if (fls->mirror_array[i]->cred) {
391 put_rpccred(fls->mirror_array[i]->cred);
392 fls->mirror_array[i]->cred = NULL;
393 }
394 }
395 }
396
397 if (lseg->pls_range.iomode == IOMODE_RW) { 533 if (lseg->pls_range.iomode == IOMODE_RW) {
398 struct nfs4_flexfile_layout *ffl; 534 struct nfs4_flexfile_layout *ffl;
399 struct inode *inode; 535 struct inode *inode;
@@ -419,48 +555,44 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
419} 555}
420 556
421static void 557static void
422nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer) 558nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
423{ 559{
424 /* first IO request? */ 560 /* first IO request? */
425 if (atomic_inc_return(&timer->n_ops) == 1) { 561 if (atomic_inc_return(&timer->n_ops) == 1) {
426 timer->start_time = ktime_get(); 562 timer->start_time = now;
427 } 563 }
428} 564}
429 565
430static ktime_t 566static ktime_t
431nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer) 567nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
432{ 568{
433 ktime_t start, now; 569 ktime_t start;
434 570
435 if (atomic_dec_return(&timer->n_ops) < 0) 571 if (atomic_dec_return(&timer->n_ops) < 0)
436 WARN_ON_ONCE(1); 572 WARN_ON_ONCE(1);
437 573
438 now = ktime_get();
439 start = timer->start_time; 574 start = timer->start_time;
440 timer->start_time = now; 575 timer->start_time = now;
441 return ktime_sub(now, start); 576 return ktime_sub(now, start);
442} 577}
443 578
444static ktime_t
445nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
446{
447 return ktime_sub(ktime_get(), task->tk_start);
448}
449
450static bool 579static bool
451nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, 580nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
452 struct nfs4_ff_layoutstat *layoutstat) 581 struct nfs4_ff_layoutstat *layoutstat,
582 ktime_t now)
453{ 583{
454 static const ktime_t notime = {0}; 584 static const ktime_t notime = {0};
455 ktime_t now = ktime_get(); 585 s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
456 586
457 nfs4_ff_start_busy_timer(&layoutstat->busy_timer); 587 nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
458 if (ktime_equal(mirror->start_time, notime)) 588 if (ktime_equal(mirror->start_time, notime))
459 mirror->start_time = now; 589 mirror->start_time = now;
460 if (ktime_equal(mirror->last_report_time, notime)) 590 if (ktime_equal(mirror->last_report_time, notime))
461 mirror->last_report_time = now; 591 mirror->last_report_time = now;
592 if (layoutstats_timer != 0)
593 report_interval = (s64)layoutstats_timer * 1000LL;
462 if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= 594 if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
463 FF_LAYOUTSTATS_REPORT_INTERVAL) { 595 report_interval) {
464 mirror->last_report_time = now; 596 mirror->last_report_time = now;
465 return true; 597 return true;
466 } 598 }
@@ -482,35 +614,39 @@ static void
482nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, 614nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
483 __u64 requested, 615 __u64 requested,
484 __u64 completed, 616 __u64 completed,
485 ktime_t time_completed) 617 ktime_t time_completed,
618 ktime_t time_started)
486{ 619{
487 struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; 620 struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
621 ktime_t completion_time = ktime_sub(time_completed, time_started);
488 ktime_t timer; 622 ktime_t timer;
489 623
490 iostat->ops_completed++; 624 iostat->ops_completed++;
491 iostat->bytes_completed += completed; 625 iostat->bytes_completed += completed;
492 iostat->bytes_not_delivered += requested - completed; 626 iostat->bytes_not_delivered += requested - completed;
493 627
494 timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer); 628 timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
495 iostat->total_busy_time = 629 iostat->total_busy_time =
496 ktime_add(iostat->total_busy_time, timer); 630 ktime_add(iostat->total_busy_time, timer);
497 iostat->aggregate_completion_time = 631 iostat->aggregate_completion_time =
498 ktime_add(iostat->aggregate_completion_time, time_completed); 632 ktime_add(iostat->aggregate_completion_time,
633 completion_time);
499} 634}
500 635
501static void 636static void
502nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror, 637nfs4_ff_layout_stat_io_start_read(struct inode *inode,
503 __u64 requested) 638 struct nfs4_ff_layout_mirror *mirror,
639 __u64 requested, ktime_t now)
504{ 640{
505 bool report; 641 bool report;
506 642
507 spin_lock(&mirror->lock); 643 spin_lock(&mirror->lock);
508 report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat); 644 report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
509 nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); 645 nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
510 spin_unlock(&mirror->lock); 646 spin_unlock(&mirror->lock);
511 647
512 if (report) 648 if (report)
513 pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); 649 pnfs_report_layoutstat(inode, GFP_KERNEL);
514} 650}
515 651
516static void 652static void
@@ -522,23 +658,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
522 spin_lock(&mirror->lock); 658 spin_lock(&mirror->lock);
523 nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, 659 nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
524 requested, completed, 660 requested, completed,
525 nfs4_ff_layout_calc_completion_time(task)); 661 ktime_get(), task->tk_start);
526 spin_unlock(&mirror->lock); 662 spin_unlock(&mirror->lock);
527} 663}
528 664
529static void 665static void
530nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror, 666nfs4_ff_layout_stat_io_start_write(struct inode *inode,
531 __u64 requested) 667 struct nfs4_ff_layout_mirror *mirror,
668 __u64 requested, ktime_t now)
532{ 669{
533 bool report; 670 bool report;
534 671
535 spin_lock(&mirror->lock); 672 spin_lock(&mirror->lock);
536 report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat); 673 report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
537 nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); 674 nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
538 spin_unlock(&mirror->lock); 675 spin_unlock(&mirror->lock);
539 676
540 if (report) 677 if (report)
541 pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); 678 pnfs_report_layoutstat(inode, GFP_NOIO);
542} 679}
543 680
544static void 681static void
@@ -553,8 +690,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
553 690
554 spin_lock(&mirror->lock); 691 spin_lock(&mirror->lock);
555 nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, 692 nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
556 requested, completed, 693 requested, completed, ktime_get(), task->tk_start);
557 nfs4_ff_layout_calc_completion_time(task));
558 spin_unlock(&mirror->lock); 694 spin_unlock(&mirror->lock);
559} 695}
560 696
@@ -728,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
728 return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); 864 return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
729 865
730 /* no lseg means that pnfs is not in use, so no mirroring here */ 866 /* no lseg means that pnfs is not in use, so no mirroring here */
731 pnfs_put_lseg(pgio->pg_lseg);
732 pgio->pg_lseg = NULL;
733 nfs_pageio_reset_write_mds(pgio); 867 nfs_pageio_reset_write_mds(pgio);
734 return 1; 868 return 1;
735} 869}
@@ -931,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
931 if (task->tk_status >= 0) 1065 if (task->tk_status >= 0)
932 return 0; 1066 return 0;
933 1067
934 if (task->tk_status != -EJUKEBOX) { 1068 switch (task->tk_status) {
1069 /* File access problems. Don't mark the device as unavailable */
1070 case -EACCES:
1071 case -ESTALE:
1072 case -EISDIR:
1073 case -EBADHANDLE:
1074 case -ELOOP:
1075 case -ENOSPC:
1076 break;
1077 case -EJUKEBOX:
1078 nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
1079 goto out_retry;
1080 default:
935 dprintk("%s DS connection error %d\n", __func__, 1081 dprintk("%s DS connection error %d\n", __func__,
936 task->tk_status); 1082 task->tk_status);
937 nfs4_mark_deviceid_unavailable(devid); 1083 nfs4_mark_deviceid_unavailable(devid);
938 if (ff_layout_has_available_ds(lseg))
939 return -NFS4ERR_RESET_TO_PNFS;
940 else
941 return -NFS4ERR_RESET_TO_MDS;
942 } 1084 }
943 1085 /* FIXME: Need to prevent infinite looping here. */
944 if (task->tk_status == -EJUKEBOX) 1086 return -NFS4ERR_RESET_TO_PNFS;
945 nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); 1087out_retry:
946 task->tk_status = 0; 1088 task->tk_status = 0;
947 rpc_restart_call(task); 1089 rpc_restart_call(task);
948 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); 1090 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
@@ -972,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
972 1114
973static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, 1115static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
974 int idx, u64 offset, u64 length, 1116 int idx, u64 offset, u64 length,
975 u32 status, int opnum) 1117 u32 status, int opnum, int error)
976{ 1118{
977 struct nfs4_ff_layout_mirror *mirror; 1119 struct nfs4_ff_layout_mirror *mirror;
978 int err; 1120 int err;
979 1121
1122 if (status == 0) {
1123 switch (error) {
1124 case -ETIMEDOUT:
1125 case -EPFNOSUPPORT:
1126 case -EPROTONOSUPPORT:
1127 case -EOPNOTSUPP:
1128 case -ECONNREFUSED:
1129 case -ECONNRESET:
1130 case -EHOSTDOWN:
1131 case -EHOSTUNREACH:
1132 case -ENETUNREACH:
1133 case -EADDRINUSE:
1134 case -ENOBUFS:
1135 case -EPIPE:
1136 case -EPERM:
1137 status = NFS4ERR_NXIO;
1138 break;
1139 case -EACCES:
1140 status = NFS4ERR_ACCESS;
1141 break;
1142 default:
1143 return;
1144 }
1145 }
1146
980 mirror = FF_LAYOUT_COMP(lseg, idx); 1147 mirror = FF_LAYOUT_COMP(lseg, idx);
981 err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), 1148 err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
982 mirror, offset, length, status, opnum, 1149 mirror, offset, length, status, opnum,
983 GFP_NOIO); 1150 GFP_NOIO);
1151 pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
984 dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); 1152 dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
985} 1153}
986 1154
@@ -989,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
989static int ff_layout_read_done_cb(struct rpc_task *task, 1157static int ff_layout_read_done_cb(struct rpc_task *task,
990 struct nfs_pgio_header *hdr) 1158 struct nfs_pgio_header *hdr)
991{ 1159{
992 struct inode *inode;
993 int err; 1160 int err;
994 1161
995 trace_nfs4_pnfs_read(hdr, task->tk_status); 1162 trace_nfs4_pnfs_read(hdr, task->tk_status);
996 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) 1163 if (task->tk_status < 0)
997 hdr->res.op_status = NFS4ERR_NXIO;
998 if (task->tk_status < 0 && hdr->res.op_status)
999 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, 1164 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1000 hdr->args.offset, hdr->args.count, 1165 hdr->args.offset, hdr->args.count,
1001 hdr->res.op_status, OP_READ); 1166 hdr->res.op_status, OP_READ,
1167 task->tk_status);
1002 err = ff_layout_async_handle_error(task, hdr->args.context->state, 1168 err = ff_layout_async_handle_error(task, hdr->args.context->state,
1003 hdr->ds_clp, hdr->lseg, 1169 hdr->ds_clp, hdr->lseg,
1004 hdr->pgio_mirror_idx); 1170 hdr->pgio_mirror_idx);
@@ -1010,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
1010 pnfs_read_resend_pnfs(hdr); 1176 pnfs_read_resend_pnfs(hdr);
1011 return task->tk_status; 1177 return task->tk_status;
1012 case -NFS4ERR_RESET_TO_MDS: 1178 case -NFS4ERR_RESET_TO_MDS:
1013 inode = hdr->lseg->pls_layout->plh_inode;
1014 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
1015 ff_layout_reset_read(hdr); 1179 ff_layout_reset_read(hdr);
1016 return task->tk_status; 1180 return task->tk_status;
1017 case -EAGAIN: 1181 case -EAGAIN:
@@ -1061,9 +1225,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
1061static int ff_layout_read_prepare_common(struct rpc_task *task, 1225static int ff_layout_read_prepare_common(struct rpc_task *task,
1062 struct nfs_pgio_header *hdr) 1226 struct nfs_pgio_header *hdr)
1063{ 1227{
1064 nfs4_ff_layout_stat_io_start_read( 1228 nfs4_ff_layout_stat_io_start_read(hdr->inode,
1065 FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), 1229 FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
1066 hdr->args.count); 1230 hdr->args.count,
1231 task->tk_start);
1067 1232
1068 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { 1233 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1069 rpc_exit(task, -EIO); 1234 rpc_exit(task, -EIO);
@@ -1163,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
1163static int ff_layout_write_done_cb(struct rpc_task *task, 1328static int ff_layout_write_done_cb(struct rpc_task *task,
1164 struct nfs_pgio_header *hdr) 1329 struct nfs_pgio_header *hdr)
1165{ 1330{
1166 struct inode *inode;
1167 int err; 1331 int err;
1168 1332
1169 trace_nfs4_pnfs_write(hdr, task->tk_status); 1333 trace_nfs4_pnfs_write(hdr, task->tk_status);
1170 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) 1334 if (task->tk_status < 0)
1171 hdr->res.op_status = NFS4ERR_NXIO;
1172 if (task->tk_status < 0 && hdr->res.op_status)
1173 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, 1335 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1174 hdr->args.offset, hdr->args.count, 1336 hdr->args.offset, hdr->args.count,
1175 hdr->res.op_status, OP_WRITE); 1337 hdr->res.op_status, OP_WRITE,
1338 task->tk_status);
1176 err = ff_layout_async_handle_error(task, hdr->args.context->state, 1339 err = ff_layout_async_handle_error(task, hdr->args.context->state,
1177 hdr->ds_clp, hdr->lseg, 1340 hdr->ds_clp, hdr->lseg,
1178 hdr->pgio_mirror_idx); 1341 hdr->pgio_mirror_idx);
1179 1342
1180 switch (err) { 1343 switch (err) {
1181 case -NFS4ERR_RESET_TO_PNFS: 1344 case -NFS4ERR_RESET_TO_PNFS:
1345 pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
1346 ff_layout_reset_write(hdr, true);
1347 return task->tk_status;
1182 case -NFS4ERR_RESET_TO_MDS: 1348 case -NFS4ERR_RESET_TO_MDS:
1183 inode = hdr->lseg->pls_layout->plh_inode; 1349 pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
1184 pnfs_error_mark_layout_for_return(inode, hdr->lseg); 1350 ff_layout_reset_write(hdr, false);
1185 if (err == -NFS4ERR_RESET_TO_PNFS) {
1186 pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
1187 ff_layout_reset_write(hdr, true);
1188 } else {
1189 pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
1190 ff_layout_reset_write(hdr, false);
1191 }
1192 return task->tk_status; 1351 return task->tk_status;
1193 case -EAGAIN: 1352 case -EAGAIN:
1194 rpc_restart_call_prepare(task); 1353 rpc_restart_call_prepare(task);
@@ -1199,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
1199 hdr->res.verf->committed == NFS_DATA_SYNC) 1358 hdr->res.verf->committed == NFS_DATA_SYNC)
1200 ff_layout_set_layoutcommit(hdr); 1359 ff_layout_set_layoutcommit(hdr);
1201 1360
1361 /* zero out fattr since we don't care DS attr at all */
1362 hdr->fattr.valid = 0;
1363 if (task->tk_status >= 0)
1364 nfs_writeback_update_inode(hdr);
1365
1202 return 0; 1366 return 0;
1203} 1367}
1204 1368
1205static int ff_layout_commit_done_cb(struct rpc_task *task, 1369static int ff_layout_commit_done_cb(struct rpc_task *task,
1206 struct nfs_commit_data *data) 1370 struct nfs_commit_data *data)
1207{ 1371{
1208 struct inode *inode;
1209 int err; 1372 int err;
1210 1373
1211 trace_nfs4_pnfs_commit_ds(data, task->tk_status); 1374 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
1212 if (task->tk_status == -ETIMEDOUT && !data->res.op_status) 1375 if (task->tk_status < 0)
1213 data->res.op_status = NFS4ERR_NXIO;
1214 if (task->tk_status < 0 && data->res.op_status)
1215 ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, 1376 ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
1216 data->args.offset, data->args.count, 1377 data->args.offset, data->args.count,
1217 data->res.op_status, OP_COMMIT); 1378 data->res.op_status, OP_COMMIT,
1379 task->tk_status);
1218 err = ff_layout_async_handle_error(task, NULL, data->ds_clp, 1380 err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
1219 data->lseg, data->ds_commit_index); 1381 data->lseg, data->ds_commit_index);
1220 1382
1221 switch (err) { 1383 switch (err) {
1222 case -NFS4ERR_RESET_TO_PNFS: 1384 case -NFS4ERR_RESET_TO_PNFS:
1385 pnfs_set_retry_layoutget(data->lseg->pls_layout);
1386 pnfs_generic_prepare_to_resend_writes(data);
1387 return -EAGAIN;
1223 case -NFS4ERR_RESET_TO_MDS: 1388 case -NFS4ERR_RESET_TO_MDS:
1224 inode = data->lseg->pls_layout->plh_inode; 1389 pnfs_clear_retry_layoutget(data->lseg->pls_layout);
1225 pnfs_error_mark_layout_for_return(inode, data->lseg);
1226 if (err == -NFS4ERR_RESET_TO_PNFS)
1227 pnfs_set_retry_layoutget(data->lseg->pls_layout);
1228 else
1229 pnfs_clear_retry_layoutget(data->lseg->pls_layout);
1230 pnfs_generic_prepare_to_resend_writes(data); 1390 pnfs_generic_prepare_to_resend_writes(data);
1231 return -EAGAIN; 1391 return -EAGAIN;
1232 case -EAGAIN: 1392 case -EAGAIN:
@@ -1244,9 +1404,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
1244static int ff_layout_write_prepare_common(struct rpc_task *task, 1404static int ff_layout_write_prepare_common(struct rpc_task *task,
1245 struct nfs_pgio_header *hdr) 1405 struct nfs_pgio_header *hdr)
1246{ 1406{
1247 nfs4_ff_layout_stat_io_start_write( 1407 nfs4_ff_layout_stat_io_start_write(hdr->inode,
1248 FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), 1408 FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
1249 hdr->args.count); 1409 hdr->args.count,
1410 task->tk_start);
1250 1411
1251 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { 1412 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1252 rpc_exit(task, -EIO); 1413 rpc_exit(task, -EIO);
@@ -1325,9 +1486,9 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
1325static void ff_layout_commit_prepare_common(struct rpc_task *task, 1486static void ff_layout_commit_prepare_common(struct rpc_task *task,
1326 struct nfs_commit_data *cdata) 1487 struct nfs_commit_data *cdata)
1327{ 1488{
1328 nfs4_ff_layout_stat_io_start_write( 1489 nfs4_ff_layout_stat_io_start_write(cdata->inode,
1329 FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), 1490 FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
1330 0); 1491 0, task->tk_start);
1331} 1492}
1332 1493
1333static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) 1494static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
@@ -1842,53 +2003,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
1842 *start = cpu_to_be32((xdr->p - start - 1) * 4); 2003 *start = cpu_to_be32((xdr->p - start - 1) * 4);
1843} 2004}
1844 2005
1845static bool 2006static int
1846ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, 2007ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
1847 struct pnfs_layout_segment *pls, 2008 struct pnfs_layout_hdr *lo,
1848 int *dev_count, int dev_limit) 2009 int dev_limit)
1849{ 2010{
2011 struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
1850 struct nfs4_ff_layout_mirror *mirror; 2012 struct nfs4_ff_layout_mirror *mirror;
1851 struct nfs4_deviceid_node *dev; 2013 struct nfs4_deviceid_node *dev;
1852 struct nfs42_layoutstat_devinfo *devinfo; 2014 struct nfs42_layoutstat_devinfo *devinfo;
1853 int i; 2015 int i = 0;
1854 2016
1855 for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) { 2017 list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
1856 if (*dev_count >= dev_limit) 2018 if (i >= dev_limit)
1857 break; 2019 break;
1858 mirror = FF_LAYOUT_COMP(pls, i); 2020 if (!mirror->mirror_ds)
1859 if (!mirror || !mirror->mirror_ds) 2021 continue;
2022 /* mirror refcount put in cleanup_layoutstats */
2023 if (!atomic_inc_not_zero(&mirror->ref))
1860 continue; 2024 continue;
1861 dev = FF_LAYOUT_DEVID_NODE(pls, i); 2025 dev = &mirror->mirror_ds->id_node;
1862 devinfo = &args->devinfo[*dev_count]; 2026 devinfo = &args->devinfo[i];
1863 memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); 2027 memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
1864 devinfo->offset = pls->pls_range.offset; 2028 devinfo->offset = 0;
1865 devinfo->length = pls->pls_range.length; 2029 devinfo->length = NFS4_MAX_UINT64;
1866 /* well, we don't really know if IO is continuous or not! */ 2030 devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
1867 devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
1868 devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; 2031 devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
1869 devinfo->write_count = mirror->write_stat.io_stat.bytes_completed; 2032 devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
1870 devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; 2033 devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
1871 devinfo->layout_type = LAYOUT_FLEX_FILES; 2034 devinfo->layout_type = LAYOUT_FLEX_FILES;
1872 devinfo->layoutstats_encode = ff_layout_encode_layoutstats; 2035 devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
1873 devinfo->layout_private = mirror; 2036 devinfo->layout_private = mirror;
1874 /* lseg refcount put in cleanup_layoutstats */
1875 pnfs_get_lseg(pls);
1876 2037
1877 ++(*dev_count); 2038 i++;
1878 } 2039 }
1879 2040 return i;
1880 return *dev_count < dev_limit;
1881} 2041}
1882 2042
1883static int 2043static int
1884ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) 2044ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
1885{ 2045{
1886 struct pnfs_layout_segment *pls; 2046 struct nfs4_flexfile_layout *ff_layout;
2047 struct nfs4_ff_layout_mirror *mirror;
1887 int dev_count = 0; 2048 int dev_count = 0;
1888 2049
1889 spin_lock(&args->inode->i_lock); 2050 spin_lock(&args->inode->i_lock);
1890 list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { 2051 ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
1891 dev_count += FF_LAYOUT_MIRROR_COUNT(pls); 2052 list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
2053 if (atomic_read(&mirror->ref) != 0)
2054 dev_count ++;
1892 } 2055 }
1893 spin_unlock(&args->inode->i_lock); 2056 spin_unlock(&args->inode->i_lock);
1894 /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ 2057 /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
@@ -1897,20 +2060,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
1897 __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); 2060 __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
1898 dev_count = PNFS_LAYOUTSTATS_MAXDEV; 2061 dev_count = PNFS_LAYOUTSTATS_MAXDEV;
1899 } 2062 }
1900 args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL); 2063 args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
1901 if (!args->devinfo) 2064 if (!args->devinfo)
1902 return -ENOMEM; 2065 return -ENOMEM;
1903 2066
1904 dev_count = 0;
1905 spin_lock(&args->inode->i_lock); 2067 spin_lock(&args->inode->i_lock);
1906 list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { 2068 args->num_dev = ff_layout_mirror_prepare_stats(args,
1907 if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count, 2069 &ff_layout->generic_hdr, dev_count);
1908 PNFS_LAYOUTSTATS_MAXDEV)) {
1909 break;
1910 }
1911 }
1912 spin_unlock(&args->inode->i_lock); 2070 spin_unlock(&args->inode->i_lock);
1913 args->num_dev = dev_count;
1914 2071
1915 return 0; 2072 return 0;
1916} 2073}
@@ -1924,7 +2081,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
1924 for (i = 0; i < data->args.num_dev; i++) { 2081 for (i = 0; i < data->args.num_dev; i++) {
1925 mirror = data->args.devinfo[i].layout_private; 2082 mirror = data->args.devinfo[i].layout_private;
1926 data->args.devinfo[i].layout_private = NULL; 2083 data->args.devinfo[i].layout_private = NULL;
1927 pnfs_put_lseg(mirror->lseg); 2084 ff_layout_put_mirror(mirror);
1928 } 2085 }
1929} 2086}
1930 2087
@@ -1936,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
1936 .free_layout_hdr = ff_layout_free_layout_hdr, 2093 .free_layout_hdr = ff_layout_free_layout_hdr,
1937 .alloc_lseg = ff_layout_alloc_lseg, 2094 .alloc_lseg = ff_layout_alloc_lseg,
1938 .free_lseg = ff_layout_free_lseg, 2095 .free_lseg = ff_layout_free_lseg,
2096 .add_lseg = ff_layout_add_lseg,
1939 .pg_read_ops = &ff_layout_pg_read_ops, 2097 .pg_read_ops = &ff_layout_pg_read_ops,
1940 .pg_write_ops = &ff_layout_pg_write_ops, 2098 .pg_write_ops = &ff_layout_pg_write_ops,
1941 .get_ds_info = ff_layout_get_ds_info, 2099 .get_ds_info = ff_layout_get_ds_info,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index f92f9a0a856b..68cc0d9828f9 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -67,7 +67,8 @@ struct nfs4_ff_layoutstat {
67}; 67};
68 68
69struct nfs4_ff_layout_mirror { 69struct nfs4_ff_layout_mirror {
70 struct pnfs_layout_segment *lseg; /* back pointer */ 70 struct pnfs_layout_hdr *layout;
71 struct list_head mirrors;
71 u32 ds_count; 72 u32 ds_count;
72 u32 efficiency; 73 u32 efficiency;
73 struct nfs4_ff_layout_ds *mirror_ds; 74 struct nfs4_ff_layout_ds *mirror_ds;
@@ -77,6 +78,7 @@ struct nfs4_ff_layout_mirror {
77 u32 uid; 78 u32 uid;
78 u32 gid; 79 u32 gid;
79 struct rpc_cred *cred; 80 struct rpc_cred *cred;
81 atomic_t ref;
80 spinlock_t lock; 82 spinlock_t lock;
81 struct nfs4_ff_layoutstat read_stat; 83 struct nfs4_ff_layoutstat read_stat;
82 struct nfs4_ff_layoutstat write_stat; 84 struct nfs4_ff_layoutstat write_stat;
@@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment {
95struct nfs4_flexfile_layout { 97struct nfs4_flexfile_layout {
96 struct pnfs_layout_hdr generic_hdr; 98 struct pnfs_layout_hdr generic_hdr;
97 struct pnfs_ds_commit_info commit_info; 99 struct pnfs_ds_commit_info commit_info;
100 struct list_head mirrors;
98 struct list_head error_list; /* nfs4_ff_layout_ds_err */ 101 struct list_head error_list; /* nfs4_ff_layout_ds_err */
99}; 102};
100 103
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f13e1969eedd..e125e55de86d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -172,6 +172,32 @@ out_err:
172 return NULL; 172 return NULL;
173} 173}
174 174
175static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
176 struct nfs4_deviceid_node *devid)
177{
178 nfs4_mark_deviceid_unavailable(devid);
179 if (!ff_layout_has_available_ds(lseg))
180 pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
181 lseg);
182}
183
184static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
185 struct nfs4_ff_layout_mirror *mirror)
186{
187 if (mirror == NULL || mirror->mirror_ds == NULL) {
188 pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
189 lseg);
190 return false;
191 }
192 if (mirror->mirror_ds->ds == NULL) {
193 struct nfs4_deviceid_node *devid;
194 devid = &mirror->mirror_ds->id_node;
195 ff_layout_mark_devid_invalid(lseg, devid);
196 return false;
197 }
198 return true;
199}
200
175static u64 201static u64
176end_offset(u64 start, u64 len) 202end_offset(u64 start, u64 len)
177{ 203{
@@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
336{ 362{
337 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); 363 struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
338 struct nfs_fh *fh = NULL; 364 struct nfs_fh *fh = NULL;
339 struct nfs4_deviceid_node *devid;
340 365
341 if (mirror == NULL || mirror->mirror_ds == NULL || 366 if (!ff_layout_mirror_valid(lseg, mirror)) {
342 mirror->mirror_ds->ds == NULL) { 367 pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
343 printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
344 __func__, mirror_idx); 368 __func__, mirror_idx);
345 if (mirror && mirror->mirror_ds) {
346 devid = &mirror->mirror_ds->id_node;
347 pnfs_generic_mark_devid_invalid(devid);
348 }
349 goto out; 369 goto out;
350 } 370 }
351 371
@@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
368 unsigned int max_payload; 388 unsigned int max_payload;
369 rpc_authflavor_t flavor; 389 rpc_authflavor_t flavor;
370 390
371 if (mirror == NULL || mirror->mirror_ds == NULL || 391 if (!ff_layout_mirror_valid(lseg, mirror)) {
372 mirror->mirror_ds->ds == NULL) { 392 pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
373 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
374 __func__, ds_idx); 393 __func__, ds_idx);
375 if (mirror && mirror->mirror_ds) {
376 devid = &mirror->mirror_ds->id_node;
377 pnfs_generic_mark_devid_invalid(devid);
378 }
379 goto out; 394 goto out;
380 } 395 }
381 396
@@ -500,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
500 range->offset, range->length)) 515 range->offset, range->length))
501 continue; 516 continue;
502 /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) 517 /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
503 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) 518 * + array length + deviceid(NFS4_DEVICEID4_SIZE)
519 * + status(4) + opnum(4)
504 */ 520 */
505 p = xdr_reserve_space(xdr, 521 p = xdr_reserve_space(xdr,
506 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); 522 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
507 if (unlikely(!p)) 523 if (unlikely(!p))
508 return -ENOBUFS; 524 return -ENOBUFS;
509 p = xdr_encode_hyper(p, err->offset); 525 p = xdr_encode_hyper(p, err->offset);
510 p = xdr_encode_hyper(p, err->length); 526 p = xdr_encode_hyper(p, err->length);
511 p = xdr_encode_opaque_fixed(p, &err->stateid, 527 p = xdr_encode_opaque_fixed(p, &err->stateid,
512 NFS4_STATEID_SIZE); 528 NFS4_STATEID_SIZE);
529 /* Encode 1 error */
530 *p++ = cpu_to_be32(1);
513 p = xdr_encode_opaque_fixed(p, &err->deviceid, 531 p = xdr_encode_opaque_fixed(p, &err->deviceid,
514 NFS4_DEVICEID4_SIZE); 532 NFS4_DEVICEID4_SIZE);
515 *p++ = cpu_to_be32(err->status); 533 *p++ = cpu_to_be32(err->status);
@@ -525,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
525 return 0; 543 return 0;
526} 544}
527 545
528bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) 546static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
529{ 547{
530 struct nfs4_ff_layout_mirror *mirror; 548 struct nfs4_ff_layout_mirror *mirror;
531 struct nfs4_deviceid_node *devid; 549 struct nfs4_deviceid_node *devid;
532 int idx; 550 u32 idx;
533 551
534 for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { 552 for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
535 mirror = FF_LAYOUT_COMP(lseg, idx); 553 mirror = FF_LAYOUT_COMP(lseg, idx);
@@ -543,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
543 return false; 561 return false;
544} 562}
545 563
564static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
565{
566 struct nfs4_ff_layout_mirror *mirror;
567 struct nfs4_deviceid_node *devid;
568 u32 idx;
569
570 for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
571 mirror = FF_LAYOUT_COMP(lseg, idx);
572 if (!mirror || !mirror->mirror_ds)
573 return false;
574 devid = &mirror->mirror_ds->id_node;
575 if (ff_layout_test_devid_unavailable(devid))
576 return false;
577 }
578
579 return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
580}
581
582bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
583{
584 if (lseg->pls_range.iomode == IOMODE_READ)
585 return ff_read_layout_has_available_ds(lseg);
586 /* Note: RW layout needs all mirrors available */
587 return ff_rw_layout_has_available_ds(lseg);
588}
589
546module_param(dataserver_retrans, uint, 0644); 590module_param(dataserver_retrans, uint, 0644);
547MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " 591MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
548 "retries a request before it attempts further " 592 "retries a request before it attempts further "
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0adc7d245b3d..326d9e10d833 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -504,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
504{ 504{
505 struct inode *inode = d_inode(dentry); 505 struct inode *inode = d_inode(dentry);
506 struct nfs_fattr *fattr; 506 struct nfs_fattr *fattr;
507 int error = -ENOMEM; 507 int error = 0;
508 508
509 nfs_inc_stats(inode, NFSIOS_VFSSETATTR); 509 nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
510 510
@@ -513,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
513 attr->ia_valid &= ~ATTR_MODE; 513 attr->ia_valid &= ~ATTR_MODE;
514 514
515 if (attr->ia_valid & ATTR_SIZE) { 515 if (attr->ia_valid & ATTR_SIZE) {
516 loff_t i_size;
517
518 BUG_ON(!S_ISREG(inode->i_mode)); 516 BUG_ON(!S_ISREG(inode->i_mode));
519 517
520 i_size = i_size_read(inode); 518 error = inode_newsize_ok(inode, attr->ia_size);
521 if (attr->ia_size == i_size) 519 if (error)
520 return error;
521
522 if (attr->ia_size == i_size_read(inode))
522 attr->ia_valid &= ~ATTR_SIZE; 523 attr->ia_valid &= ~ATTR_SIZE;
523 else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
524 return -ETXTBSY;
525 } 524 }
526 525
527 /* Optimization: if the end result is no change, don't RPC */ 526 /* Optimization: if the end result is no change, don't RPC */
@@ -536,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
536 nfs_sync_inode(inode); 535 nfs_sync_inode(inode);
537 536
538 fattr = nfs_alloc_fattr(); 537 fattr = nfs_alloc_fattr();
539 if (fattr == NULL) 538 if (fattr == NULL) {
539 error = -ENOMEM;
540 goto out; 540 goto out;
541 }
542
541 /* 543 /*
542 * Return any delegations if we're going to change ACLs 544 * Return any delegations if we're going to change ACLs
543 */ 545 */
@@ -759,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context);
759 * @ctx: pointer to context 761 * @ctx: pointer to context
760 * @is_sync: is this a synchronous close 762 * @is_sync: is this a synchronous close
761 * 763 *
762 * always ensure that the attributes are up to date if we're mounted 764 * Ensure that the attributes are up to date if we're mounted
763 * with close-to-open semantics 765 * with close-to-open semantics and we have cached data that will
766 * need to be revalidated on open.
764 */ 767 */
765void nfs_close_context(struct nfs_open_context *ctx, int is_sync) 768void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
766{ 769{
770 struct nfs_inode *nfsi;
767 struct inode *inode; 771 struct inode *inode;
768 struct nfs_server *server; 772 struct nfs_server *server;
769 773
@@ -772,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
772 if (!is_sync) 776 if (!is_sync)
773 return; 777 return;
774 inode = d_inode(ctx->dentry); 778 inode = d_inode(ctx->dentry);
775 if (!list_empty(&NFS_I(inode)->open_files)) 779 nfsi = NFS_I(inode);
780 if (inode->i_mapping->nrpages == 0)
781 return;
782 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
783 return;
784 if (!list_empty(&nfsi->open_files))
776 return; 785 return;
777 server = NFS_SERVER(inode); 786 server = NFS_SERVER(inode);
778 if (server->flags & NFS_MOUNT_NOCTO) 787 if (server->flags & NFS_MOUNT_NOCTO)
@@ -844,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
844} 853}
845EXPORT_SYMBOL_GPL(put_nfs_open_context); 854EXPORT_SYMBOL_GPL(put_nfs_open_context);
846 855
856static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
857{
858 __put_nfs_open_context(ctx, 1);
859}
860
847/* 861/*
848 * Ensure that mmap has a recent RPC credential for use when writing out 862 * Ensure that mmap has a recent RPC credential for use when writing out
849 * shared pages 863 * shared pages
@@ -888,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
888 return ctx; 902 return ctx;
889} 903}
890 904
891static void nfs_file_clear_open_context(struct file *filp) 905void nfs_file_clear_open_context(struct file *filp)
892{ 906{
893 struct nfs_open_context *ctx = nfs_file_open_context(filp); 907 struct nfs_open_context *ctx = nfs_file_open_context(filp);
894 908
@@ -899,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp)
899 spin_lock(&inode->i_lock); 913 spin_lock(&inode->i_lock);
900 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 914 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
901 spin_unlock(&inode->i_lock); 915 spin_unlock(&inode->i_lock);
902 __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); 916 put_nfs_open_context_sync(ctx);
903 } 917 }
904} 918}
905 919
@@ -919,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp)
919 return 0; 933 return 0;
920} 934}
921 935
922int nfs_release(struct inode *inode, struct file *filp)
923{
924 nfs_file_clear_open_context(filp);
925 return 0;
926}
927
928/* 936/*
929 * This function is called whenever some part of NFS notices that 937 * This function is called whenever some part of NFS notices that
930 * the cached attributes have to be refreshed. 938 * the cached attributes have to be refreshed.
@@ -1273,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1273 return 0; 1281 return 0;
1274} 1282}
1275 1283
1276static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
1277{
1278 if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
1279 return 0;
1280 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
1281}
1282
1283static atomic_long_t nfs_attr_generation_counter; 1284static atomic_long_t nfs_attr_generation_counter;
1284 1285
1285static unsigned long nfs_read_attr_generation_counter(void) 1286static unsigned long nfs_read_attr_generation_counter(void)
@@ -1428,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
1428 const struct nfs_inode *nfsi = NFS_I(inode); 1429 const struct nfs_inode *nfsi = NFS_I(inode);
1429 1430
1430 return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || 1431 return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
1431 nfs_ctime_need_update(inode, fattr) ||
1432 ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); 1432 ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
1433} 1433}
1434 1434
@@ -1491,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
1491{ 1491{
1492 unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1492 unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1493 1493
1494 /*
1495 * Don't revalidate the pagecache if we hold a delegation, but do
1496 * force an attribute update
1497 */
1498 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
1499 invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
1500
1494 if (S_ISDIR(inode->i_mode)) 1501 if (S_ISDIR(inode->i_mode))
1495 invalid |= NFS_INO_INVALID_DATA; 1502 invalid |= NFS_INO_INVALID_DATA;
1496 nfs_set_cache_invalid(inode, invalid); 1503 nfs_set_cache_invalid(inode, invalid);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9b372b845f6a..56cfde26fb9c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void)
219} 219}
220#endif 220#endif
221 221
222#ifdef CONFIG_NFS_V4_1
223int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
224#endif
225
226/* callback_xdr.c */ 222/* callback_xdr.c */
227extern struct svc_version nfs4_callback_version1; 223extern struct svc_version nfs4_callback_version1;
228extern struct svc_version nfs4_callback_version4; 224extern struct svc_version nfs4_callback_version4;
@@ -364,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
364/* file.c */ 360/* file.c */
365int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); 361int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
366loff_t nfs_file_llseek(struct file *, loff_t, int); 362loff_t nfs_file_llseek(struct file *, loff_t, int);
367int nfs_file_flush(struct file *, fl_owner_t);
368ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); 363ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
369ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, 364ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
370 size_t, unsigned int); 365 size_t, unsigned int);
@@ -490,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list,
490void nfs_commitdata_release(struct nfs_commit_data *data); 485void nfs_commitdata_release(struct nfs_commit_data *data);
491void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, 486void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
492 struct nfs_commit_info *cinfo); 487 struct nfs_commit_info *cinfo);
488void nfs_request_add_commit_list_locked(struct nfs_page *req,
489 struct list_head *dst,
490 struct nfs_commit_info *cinfo);
493void nfs_request_remove_commit_list(struct nfs_page *req, 491void nfs_request_remove_commit_list(struct nfs_page *req,
494 struct nfs_commit_info *cinfo); 492 struct nfs_commit_info *cinfo);
495void nfs_init_cinfo(struct nfs_commit_info *cinfo, 493void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -623,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
623 * Record the page as unstable and mark its inode as dirty. 621 * Record the page as unstable and mark its inode as dirty.
624 */ 622 */
625static inline 623static inline
626void nfs_mark_page_unstable(struct page *page) 624void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
627{ 625{
628 struct inode *inode = page_file_mapping(page)->host; 626 if (!cinfo->dreq) {
627 struct inode *inode = page_file_mapping(page)->host;
629 628
630 inc_zone_page_state(page, NR_UNSTABLE_NFS); 629 inc_zone_page_state(page, NR_UNSTABLE_NFS);
631 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); 630 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
632 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 631 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
632 }
633} 633}
634 634
635/* 635/*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e6fffc..267126d32ec0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
1103{ 1103{
1104 encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); 1104 encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
1105 encode_symlinkdata3(xdr, args); 1105 encode_symlinkdata3(xdr, args);
1106 xdr->buf->flags |= XDRBUF_WRITE;
1106} 1107}
1107 1108
1108/* 1109/*
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index ff66ae700b89..814c1255f1d2 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
17loff_t nfs42_proc_llseek(struct file *, loff_t, int); 17loff_t nfs42_proc_llseek(struct file *, loff_t, int);
18int nfs42_proc_layoutstats_generic(struct nfs_server *, 18int nfs42_proc_layoutstats_generic(struct nfs_server *,
19 struct nfs42_layoutstat_data *); 19 struct nfs42_layoutstat_data *);
20/* nfs4.2xdr.h */
21extern struct rpc_procinfo nfs4_2_procedures[];
22 20
23#endif /* __LINUX_FS_NFS_NFS4_2_H */ 21#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index a6bd27da6286..0eb29e14070d 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -238,8 +238,7 @@ out_overflow:
238 return -EIO; 238 return -EIO;
239} 239}
240 240
241static int decode_layoutstats(struct xdr_stream *xdr, 241static int decode_layoutstats(struct xdr_stream *xdr)
242 struct nfs42_layoutstat_res *res)
243{ 242{
244 return decode_op_hdr(xdr, OP_LAYOUTSTATS); 243 return decode_op_hdr(xdr, OP_LAYOUTSTATS);
245} 244}
@@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
343 goto out; 342 goto out;
344 WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV); 343 WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
345 for (i = 0; i < res->num_dev; i++) { 344 for (i = 0; i < res->num_dev; i++) {
346 status = decode_layoutstats(xdr, res); 345 status = decode_layoutstats(xdr);
347 if (status) 346 if (status)
348 goto out; 347 goto out;
349 } 348 }
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea3bee919a76..50cfc4ca7a02 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
405int nfs41_discover_server_trunking(struct nfs_client *clp, 405int nfs41_discover_server_trunking(struct nfs_client *clp,
406 struct nfs_client **, struct rpc_cred *); 406 struct nfs_client **, struct rpc_cred *);
407extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 407extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
408extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); 408extern void nfs41_notify_server(struct nfs_client *);
409extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
410
411#else 409#else
412static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 410static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
413{ 411{
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3aa6a9ba5113..223bedda64ae 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
729 return false; 729 return false;
730 730
731 /* Match only the IP address, not the port number */ 731 /* Match only the IP address, not the port number */
732 if (!nfs_sockaddr_match_ipaddr(addr, clap)) 732 return rpc_cmp_addr(addr, clap);
733 return false;
734
735 return true;
736} 733}
737 734
738/* 735/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index dcd39d4e2efe..b0dbe0abed53 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -6,7 +6,9 @@
6#include <linux/fs.h> 6#include <linux/fs.h>
7#include <linux/falloc.h> 7#include <linux/falloc.h>
8#include <linux/nfs_fs.h> 8#include <linux/nfs_fs.h>
9#include "delegation.h"
9#include "internal.h" 10#include "internal.h"
11#include "iostat.h"
10#include "fscache.h" 12#include "fscache.h"
11#include "pnfs.h" 13#include "pnfs.h"
12 14
@@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
27 struct inode *dir; 29 struct inode *dir;
28 unsigned openflags = filp->f_flags; 30 unsigned openflags = filp->f_flags;
29 struct iattr attr; 31 struct iattr attr;
30 int opened = 0;
31 int err; 32 int err;
32 33
33 /* 34 /*
@@ -66,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
66 nfs_sync_inode(inode); 67 nfs_sync_inode(inode);
67 } 68 }
68 69
69 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); 70 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
70 if (IS_ERR(inode)) { 71 if (IS_ERR(inode)) {
71 err = PTR_ERR(inode); 72 err = PTR_ERR(inode);
72 switch (err) { 73 switch (err) {
@@ -100,6 +101,31 @@ out_drop:
100 goto out_put_ctx; 101 goto out_put_ctx;
101} 102}
102 103
104/*
105 * Flush all dirty pages, and check for write errors.
106 */
107static int
108nfs4_file_flush(struct file *file, fl_owner_t id)
109{
110 struct inode *inode = file_inode(file);
111
112 dprintk("NFS: flush(%pD2)\n", file);
113
114 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
115 if ((file->f_mode & FMODE_WRITE) == 0)
116 return 0;
117
118 /*
119 * If we're holding a write delegation, then check if we're required
120 * to flush the i/o on close. If not, then just start the i/o now.
121 */
122 if (!nfs4_delegation_flush_on_close(inode))
123 return filemap_fdatawrite(file->f_mapping);
124
125 /* Flush writes to the server and return any errors */
126 return vfs_fsync(file, 0);
127}
128
103static int 129static int
104nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) 130nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
105{ 131{
@@ -178,7 +204,7 @@ const struct file_operations nfs4_file_operations = {
178 .write_iter = nfs_file_write, 204 .write_iter = nfs_file_write,
179 .mmap = nfs_file_mmap, 205 .mmap = nfs_file_mmap,
180 .open = nfs4_file_open, 206 .open = nfs4_file_open,
181 .flush = nfs_file_flush, 207 .flush = nfs4_file_flush,
182 .release = nfs_file_release, 208 .release = nfs_file_release,
183 .fsync = nfs4_file_fsync, 209 .fsync = nfs4_file_fsync,
184 .lock = nfs_lock, 210 .lock = nfs_lock,
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 535dfc69c628..2e4902203c35 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = {
184 .read = user_read, 184 .read = user_read,
185}; 185};
186 186
187static int nfs_idmap_init_keyring(void) 187int nfs_idmap_init(void)
188{ 188{
189 struct cred *cred; 189 struct cred *cred;
190 struct key *keyring; 190 struct key *keyring;
@@ -230,7 +230,7 @@ failed_put_cred:
230 return ret; 230 return ret;
231} 231}
232 232
233static void nfs_idmap_quit_keyring(void) 233void nfs_idmap_quit(void)
234{ 234{
235 key_revoke(id_resolver_cache->thread_keyring); 235 key_revoke(id_resolver_cache->thread_keyring);
236 unregister_key_type(&key_type_id_resolver); 236 unregister_key_type(&key_type_id_resolver);
@@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp)
492 kfree(idmap); 492 kfree(idmap);
493} 493}
494 494
495int nfs_idmap_init(void)
496{
497 return nfs_idmap_init_keyring();
498}
499
500void nfs_idmap_quit(void)
501{
502 nfs_idmap_quit_keyring();
503}
504
505static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, 495static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
506 struct idmap_msg *im, 496 struct idmap_msg *im,
507 struct rpc_pipe_msg *msg) 497 struct rpc_pipe_msg *msg)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3acb1eb72930..693b903b48bd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -586,7 +586,7 @@ out_unlock:
586 spin_unlock(&tbl->slot_tbl_lock); 586 spin_unlock(&tbl->slot_tbl_lock);
587 res->sr_slot = NULL; 587 res->sr_slot = NULL;
588 if (send_new_highest_used_slotid) 588 if (send_new_highest_used_slotid)
589 nfs41_server_notify_highest_slotid_update(session->clp); 589 nfs41_notify_server(session->clp);
590} 590}
591 591
592int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 592int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -1150,7 +1150,8 @@ out:
1150 return ret; 1150 return ret;
1151} 1151}
1152 1152
1153static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) 1153static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
1154 enum open_claim_type4 claim)
1154{ 1155{
1155 if (delegation == NULL) 1156 if (delegation == NULL)
1156 return 0; 1157 return 0;
@@ -1158,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
1158 return 0; 1159 return 0;
1159 if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) 1160 if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
1160 return 0; 1161 return 0;
1162 switch (claim) {
1163 case NFS4_OPEN_CLAIM_NULL:
1164 case NFS4_OPEN_CLAIM_FH:
1165 break;
1166 case NFS4_OPEN_CLAIM_PREVIOUS:
1167 if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
1168 break;
1169 default:
1170 return 0;
1171 }
1161 nfs_mark_delegation_referenced(delegation); 1172 nfs_mark_delegation_referenced(delegation);
1162 return 1; 1173 return 1;
1163} 1174}
@@ -1220,6 +1231,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
1220} 1231}
1221 1232
1222static void nfs_clear_open_stateid_locked(struct nfs4_state *state, 1233static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1234 nfs4_stateid *arg_stateid,
1223 nfs4_stateid *stateid, fmode_t fmode) 1235 nfs4_stateid *stateid, fmode_t fmode)
1224{ 1236{
1225 clear_bit(NFS_O_RDWR_STATE, &state->flags); 1237 clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1238,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1238 if (stateid == NULL) 1250 if (stateid == NULL)
1239 return; 1251 return;
1240 /* Handle races with OPEN */ 1252 /* Handle races with OPEN */
1241 if (!nfs4_stateid_match_other(stateid, &state->open_stateid) || 1253 if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) ||
1242 !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { 1254 (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
1255 !nfs4_stateid_is_newer(stateid, &state->open_stateid))) {
1243 nfs_resync_open_stateid_locked(state); 1256 nfs_resync_open_stateid_locked(state);
1244 return; 1257 return;
1245 } 1258 }
@@ -1248,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1248 nfs4_stateid_copy(&state->open_stateid, stateid); 1261 nfs4_stateid_copy(&state->open_stateid, stateid);
1249} 1262}
1250 1263
1251static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) 1264static void nfs_clear_open_stateid(struct nfs4_state *state,
1265 nfs4_stateid *arg_stateid,
1266 nfs4_stateid *stateid, fmode_t fmode)
1252{ 1267{
1253 write_seqlock(&state->seqlock); 1268 write_seqlock(&state->seqlock);
1254 nfs_clear_open_stateid_locked(state, stateid, fmode); 1269 nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode);
1255 write_sequnlock(&state->seqlock); 1270 write_sequnlock(&state->seqlock);
1256 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) 1271 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
1257 nfs4_schedule_state_manager(state->owner->so_server->nfs_client); 1272 nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
@@ -1376,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
1376 struct nfs_delegation *delegation; 1391 struct nfs_delegation *delegation;
1377 int open_mode = opendata->o_arg.open_flags; 1392 int open_mode = opendata->o_arg.open_flags;
1378 fmode_t fmode = opendata->o_arg.fmode; 1393 fmode_t fmode = opendata->o_arg.fmode;
1394 enum open_claim_type4 claim = opendata->o_arg.claim;
1379 nfs4_stateid stateid; 1395 nfs4_stateid stateid;
1380 int ret = -EAGAIN; 1396 int ret = -EAGAIN;
1381 1397
@@ -1389,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
1389 spin_unlock(&state->owner->so_lock); 1405 spin_unlock(&state->owner->so_lock);
1390 rcu_read_lock(); 1406 rcu_read_lock();
1391 delegation = rcu_dereference(nfsi->delegation); 1407 delegation = rcu_dereference(nfsi->delegation);
1392 if (!can_open_delegated(delegation, fmode)) { 1408 if (!can_open_delegated(delegation, fmode, claim)) {
1393 rcu_read_unlock(); 1409 rcu_read_unlock();
1394 break; 1410 break;
1395 } 1411 }
@@ -1852,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1852 struct nfs4_opendata *data = calldata; 1868 struct nfs4_opendata *data = calldata;
1853 struct nfs4_state_owner *sp = data->owner; 1869 struct nfs4_state_owner *sp = data->owner;
1854 struct nfs_client *clp = sp->so_server->nfs_client; 1870 struct nfs_client *clp = sp->so_server->nfs_client;
1871 enum open_claim_type4 claim = data->o_arg.claim;
1855 1872
1856 if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) 1873 if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
1857 goto out_wait; 1874 goto out_wait;
@@ -1866,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1866 goto out_no_action; 1883 goto out_no_action;
1867 rcu_read_lock(); 1884 rcu_read_lock();
1868 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); 1885 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
1869 if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && 1886 if (can_open_delegated(delegation, data->o_arg.fmode, claim))
1870 data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
1871 can_open_delegated(delegation, data->o_arg.fmode))
1872 goto unlock_no_action; 1887 goto unlock_no_action;
1873 rcu_read_unlock(); 1888 rcu_read_unlock();
1874 } 1889 }
1875 /* Update client id. */ 1890 /* Update client id. */
1876 data->o_arg.clientid = clp->cl_clientid; 1891 data->o_arg.clientid = clp->cl_clientid;
1877 switch (data->o_arg.claim) { 1892 switch (claim) {
1893 default:
1894 break;
1878 case NFS4_OPEN_CLAIM_PREVIOUS: 1895 case NFS4_OPEN_CLAIM_PREVIOUS:
1879 case NFS4_OPEN_CLAIM_DELEG_CUR_FH: 1896 case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
1880 case NFS4_OPEN_CLAIM_DELEG_PREV_FH: 1897 case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
@@ -2294,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
2294 * fields corresponding to attributes that were used to store the verifier. 2311 * fields corresponding to attributes that were used to store the verifier.
2295 * Make sure we clobber those fields in the later setattr call 2312 * Make sure we clobber those fields in the later setattr call
2296 */ 2313 */
2297static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) 2314static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
2315 struct iattr *sattr, struct nfs4_label **label)
2298{ 2316{
2299 if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && 2317 const u32 *attrset = opendata->o_res.attrset;
2318
2319 if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
2300 !(sattr->ia_valid & ATTR_ATIME_SET)) 2320 !(sattr->ia_valid & ATTR_ATIME_SET))
2301 sattr->ia_valid |= ATTR_ATIME; 2321 sattr->ia_valid |= ATTR_ATIME;
2302 2322
2303 if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && 2323 if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
2304 !(sattr->ia_valid & ATTR_MTIME_SET)) 2324 !(sattr->ia_valid & ATTR_MTIME_SET))
2305 sattr->ia_valid |= ATTR_MTIME; 2325 sattr->ia_valid |= ATTR_MTIME;
2326
2327 /* Except MODE, it seems harmless of setting twice. */
2328 if ((attrset[1] & FATTR4_WORD1_MODE))
2329 sattr->ia_valid &= ~ATTR_MODE;
2330
2331 if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
2332 *label = NULL;
2306} 2333}
2307 2334
2308static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, 2335static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2425,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir,
2425 goto err_free_label; 2452 goto err_free_label;
2426 state = ctx->state; 2453 state = ctx->state;
2427 2454
2428 if ((opendata->o_arg.open_flags & O_EXCL) && 2455 if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
2429 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { 2456 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
2430 nfs4_exclusive_attrset(opendata, sattr); 2457 nfs4_exclusive_attrset(opendata, sattr, &label);
2431 2458
2432 nfs_fattr_init(opendata->o_res.f_attr); 2459 nfs_fattr_init(opendata->o_res.f_attr);
2433 status = nfs4_do_setattr(state->inode, cred, 2460 status = nfs4_do_setattr(state->inode, cred,
@@ -2439,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir,
2439 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); 2466 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
2440 } 2467 }
2441 } 2468 }
2442 if (opendata->file_created) 2469 if (opened && opendata->file_created)
2443 *opened |= FILE_CREATED; 2470 *opened |= FILE_CREATED;
2444 2471
2445 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { 2472 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
@@ -2661,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2661 switch (task->tk_status) { 2688 switch (task->tk_status) {
2662 case 0: 2689 case 0:
2663 res_stateid = &calldata->res.stateid; 2690 res_stateid = &calldata->res.stateid;
2664 if (calldata->arg.fmode == 0 && calldata->roc) 2691 if (calldata->roc)
2665 pnfs_roc_set_barrier(state->inode, 2692 pnfs_roc_set_barrier(state->inode,
2666 calldata->roc_barrier); 2693 calldata->roc_barrier);
2667 renew_lease(server, calldata->timestamp); 2694 renew_lease(server, calldata->timestamp);
@@ -2684,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2684 goto out_release; 2711 goto out_release;
2685 } 2712 }
2686 } 2713 }
2687 nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); 2714 nfs_clear_open_stateid(state, &calldata->arg.stateid,
2715 res_stateid, calldata->arg.fmode);
2688out_release: 2716out_release:
2689 nfs_release_seqid(calldata->arg.seqid); 2717 nfs_release_seqid(calldata->arg.seqid);
2690 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2718 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2735,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2735 goto out_no_action; 2763 goto out_no_action;
2736 } 2764 }
2737 2765
2738 if (calldata->arg.fmode == 0) { 2766 if (calldata->arg.fmode == 0)
2739 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; 2767 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
2740 if (calldata->roc && 2768 if (calldata->roc)
2741 pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { 2769 pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
2742 nfs_release_seqid(calldata->arg.seqid); 2770
2743 goto out_wait;
2744 }
2745 }
2746 calldata->arg.share_access = 2771 calldata->arg.share_access =
2747 nfs4_map_atomic_open_share(NFS_SERVER(inode), 2772 nfs4_map_atomic_open_share(NFS_SERVER(inode),
2748 calldata->arg.fmode, 0); 2773 calldata->arg.fmode, 0);
@@ -2883,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
2883 2908
2884static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 2909static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
2885{ 2910{
2911 u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
2886 struct nfs4_server_caps_arg args = { 2912 struct nfs4_server_caps_arg args = {
2887 .fhandle = fhandle, 2913 .fhandle = fhandle,
2914 .bitmask = bitmask,
2888 }; 2915 };
2889 struct nfs4_server_caps_res res = {}; 2916 struct nfs4_server_caps_res res = {};
2890 struct rpc_message msg = { 2917 struct rpc_message msg = {
@@ -2894,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2894 }; 2921 };
2895 int status; 2922 int status;
2896 2923
2924 bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
2925 FATTR4_WORD0_FH_EXPIRE_TYPE |
2926 FATTR4_WORD0_LINK_SUPPORT |
2927 FATTR4_WORD0_SYMLINK_SUPPORT |
2928 FATTR4_WORD0_ACLSUPPORT;
2929 if (minorversion)
2930 bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
2931
2897 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); 2932 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2898 if (status == 0) { 2933 if (status == 0) {
2899 /* Sanity check the server answers */ 2934 /* Sanity check the server answers */
2900 switch (server->nfs_client->cl_minorversion) { 2935 switch (minorversion) {
2901 case 0: 2936 case 0:
2902 res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; 2937 res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
2903 res.attr_bitmask[2] = 0; 2938 res.attr_bitmask[2] = 0;
@@ -2950,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2950 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; 2985 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
2951 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2986 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
2952 server->cache_consistency_bitmask[2] = 0; 2987 server->cache_consistency_bitmask[2] = 0;
2988 memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
2989 sizeof(server->exclcreat_bitmask));
2953 server->acl_bitmask = res.acl_bitmask; 2990 server->acl_bitmask = res.acl_bitmask;
2954 server->fh_expire_type = res.fh_expire_type; 2991 server->fh_expire_type = res.fh_expire_type;
2955 } 2992 }
@@ -3552,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3552 struct nfs4_label l, *ilabel = NULL; 3589 struct nfs4_label l, *ilabel = NULL;
3553 struct nfs_open_context *ctx; 3590 struct nfs_open_context *ctx;
3554 struct nfs4_state *state; 3591 struct nfs4_state *state;
3555 int opened = 0;
3556 int status = 0; 3592 int status = 0;
3557 3593
3558 ctx = alloc_nfs_open_context(dentry, FMODE_READ); 3594 ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3562,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3562 ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); 3598 ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
3563 3599
3564 sattr->ia_mode &= ~current_umask(); 3600 sattr->ia_mode &= ~current_umask();
3565 state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); 3601 state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
3566 if (IS_ERR(state)) { 3602 if (IS_ERR(state)) {
3567 status = PTR_ERR(state); 3603 status = PTR_ERR(state);
3568 goto out; 3604 goto out;
@@ -4978,13 +5014,12 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
4978 int result; 5014 int result;
4979 size_t len; 5015 size_t len;
4980 char *str; 5016 char *str;
4981 bool retried = false;
4982 5017
4983 if (clp->cl_owner_id != NULL) 5018 if (clp->cl_owner_id != NULL)
4984 return 0; 5019 return 0;
4985retry: 5020
4986 rcu_read_lock(); 5021 rcu_read_lock();
4987 len = 10 + strlen(clp->cl_ipaddr) + 1 + 5022 len = 14 + strlen(clp->cl_ipaddr) + 1 +
4988 strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + 5023 strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
4989 1 + 5024 1 +
4990 strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + 5025 strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
@@ -5010,14 +5045,6 @@ retry:
5010 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); 5045 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
5011 rcu_read_unlock(); 5046 rcu_read_unlock();
5012 5047
5013 /* Did something change? */
5014 if (result >= len) {
5015 kfree(str);
5016 if (retried)
5017 return -EINVAL;
5018 retried = true;
5019 goto retry;
5020 }
5021 clp->cl_owner_id = str; 5048 clp->cl_owner_id = str;
5022 return 0; 5049 return 0;
5023} 5050}
@@ -5049,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
5049 clp->rpc_ops->version, clp->cl_minorversion, 5076 clp->rpc_ops->version, clp->cl_minorversion,
5050 nfs4_client_id_uniquifier, 5077 nfs4_client_id_uniquifier,
5051 clp->cl_rpcclient->cl_nodename); 5078 clp->cl_rpcclient->cl_nodename);
5052 if (result >= len) {
5053 kfree(str);
5054 return -EINVAL;
5055 }
5056 clp->cl_owner_id = str; 5079 clp->cl_owner_id = str;
5057 return 0; 5080 return 0;
5058} 5081}
@@ -5088,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
5088 result = scnprintf(str, len, "Linux NFSv%u.%u %s", 5111 result = scnprintf(str, len, "Linux NFSv%u.%u %s",
5089 clp->rpc_ops->version, clp->cl_minorversion, 5112 clp->rpc_ops->version, clp->cl_minorversion,
5090 clp->cl_rpcclient->cl_nodename); 5113 clp->cl_rpcclient->cl_nodename);
5091 if (result >= len) {
5092 kfree(str);
5093 return -EINVAL;
5094 }
5095 clp->cl_owner_id = str; 5114 clp->cl_owner_id = str;
5096 return 0; 5115 return 0;
5097} 5116}
@@ -5289,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
5289 5308
5290 d_data = (struct nfs4_delegreturndata *)data; 5309 d_data = (struct nfs4_delegreturndata *)data;
5291 5310
5292 if (d_data->roc && 5311 if (d_data->roc)
5293 pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task)) 5312 pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
5294 return;
5295 5313
5296 nfs4_setup_sequence(d_data->res.server, 5314 nfs4_setup_sequence(d_data->res.server,
5297 &d_data->args.seq_args, 5315 &d_data->args.seq_args,
@@ -7746,10 +7764,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7746 case 0: 7764 case 0:
7747 goto out; 7765 goto out;
7748 /* 7766 /*
7767 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
7768 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
7769 */
7770 case -NFS4ERR_BADLAYOUT:
7771 goto out_overflow;
7772 /*
7749 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client 7773 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
7750 * (or clients) writing to the same RAID stripe 7774 * (or clients) writing to the same RAID stripe except when
7775 * the minlength argument is 0 (see RFC5661 section 18.43.3).
7751 */ 7776 */
7752 case -NFS4ERR_LAYOUTTRYLATER: 7777 case -NFS4ERR_LAYOUTTRYLATER:
7778 if (lgp->args.minlength == 0)
7779 goto out_overflow;
7753 /* 7780 /*
7754 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall 7781 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
7755 * existing layout before getting a new one). 7782 * existing layout before getting a new one).
@@ -7805,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7805 rpc_restart_call_prepare(task); 7832 rpc_restart_call_prepare(task);
7806out: 7833out:
7807 dprintk("<-- %s\n", __func__); 7834 dprintk("<-- %s\n", __func__);
7835 return;
7836out_overflow:
7837 task->tk_status = -EOVERFLOW;
7838 goto out;
7808} 7839}
7809 7840
7810static size_t max_response_pages(struct nfs_server *server) 7841static size_t max_response_pages(struct nfs_server *server)
@@ -8661,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
8661 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 8692 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
8662 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 8693 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
8663 .state_renewal_ops = &nfs41_state_renewal_ops, 8694 .state_renewal_ops = &nfs41_state_renewal_ops,
8695 .mig_recovery_ops = &nfs41_mig_recovery_ops,
8664}; 8696};
8665#endif 8697#endif
8666 8698
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f2e2ad894461..da73bc443238 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
2152} 2152}
2153EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); 2153EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
2154 2154
2155static void nfs41_ping_server(struct nfs_client *clp) 2155void nfs41_notify_server(struct nfs_client *clp)
2156{ 2156{
2157 /* Use CHECK_LEASE to ping the server with a SEQUENCE */ 2157 /* Use CHECK_LEASE to ping the server with a SEQUENCE */
2158 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 2158 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
2159 nfs4_schedule_state_manager(clp); 2159 nfs4_schedule_state_manager(clp);
2160} 2160}
2161 2161
2162void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
2163{
2164 nfs41_ping_server(clp);
2165}
2166
2167void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
2168{
2169 nfs41_ping_server(clp);
2170}
2171
2172static void nfs4_reset_all_state(struct nfs_client *clp) 2162static void nfs4_reset_all_state(struct nfs_client *clp)
2173{ 2163{
2174 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 2164 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 470af1a78bec..28df12e525ba 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
884DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); 884DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
885DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); 885DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
886 886
887DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
888 TP_PROTO(
889 const struct nfs_client *clp,
890 const struct nfs_fh *fhandle,
891 const struct inode *inode,
892 int error
893 ),
894
895 TP_ARGS(clp, fhandle, inode, error),
896
897 TP_STRUCT__entry(
898 __field(int, error)
899 __field(dev_t, dev)
900 __field(u32, fhandle)
901 __field(u64, fileid)
902 __string(dstaddr, clp ?
903 rpc_peeraddr2str(clp->cl_rpcclient,
904 RPC_DISPLAY_ADDR) : "unknown")
905 ),
906
907 TP_fast_assign(
908 __entry->error = error;
909 __entry->fhandle = nfs_fhandle_hash(fhandle);
910 if (inode != NULL) {
911 __entry->fileid = NFS_FILEID(inode);
912 __entry->dev = inode->i_sb->s_dev;
913 } else {
914 __entry->fileid = 0;
915 __entry->dev = 0;
916 }
917 __assign_str(dstaddr, clp ?
918 rpc_peeraddr2str(clp->cl_rpcclient,
919 RPC_DISPLAY_ADDR) : "unknown")
920 ),
921
922 TP_printk(
923 "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
924 "dstaddr=%s",
925 __entry->error,
926 show_nfsv4_errors(__entry->error),
927 MAJOR(__entry->dev), MINOR(__entry->dev),
928 (unsigned long long)__entry->fileid,
929 __entry->fhandle,
930 __get_str(dstaddr)
931 )
932);
933
934#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
935 DEFINE_EVENT(nfs4_inode_callback_event, name, \
936 TP_PROTO( \
937 const struct nfs_client *clp, \
938 const struct nfs_fh *fhandle, \
939 const struct inode *inode, \
940 int error \
941 ), \
942 TP_ARGS(clp, fhandle, inode, error))
943DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
944DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
945
946
887DECLARE_EVENT_CLASS(nfs4_idmap_event, 947DECLARE_EVENT_CLASS(nfs4_idmap_event,
888 TP_PROTO( 948 TP_PROTO(
889 const char *name, 949 const char *name,
@@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget,
1136 1196
1137DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); 1197DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
1138DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); 1198DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
1199DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
1139 1200
1140#endif /* CONFIG_NFS_V4_1 */ 1201#endif /* CONFIG_NFS_V4_1 */
1141 1202
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65dbdb7..788adf3897c7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int);
400#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 400#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
401#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ 401#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
402 encode_stateid_maxsz + \ 402 encode_stateid_maxsz + \
403 1 /* FIXME: opaque lrf_body always empty at the moment */) 403 1 + \
404 XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
404#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 405#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
405 1 + decode_stateid_maxsz) 406 1 + decode_stateid_maxsz)
406#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) 407#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
@@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
1001 1002
1002static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, 1003static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
1003 const struct nfs4_label *label, 1004 const struct nfs4_label *label,
1004 const struct nfs_server *server) 1005 const struct nfs_server *server,
1006 bool excl_check)
1005{ 1007{
1006 char owner_name[IDMAP_NAMESZ]; 1008 char owner_name[IDMAP_NAMESZ];
1007 char owner_group[IDMAP_NAMESZ]; 1009 char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
1067 bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; 1069 bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
1068 len += 4; 1070 len += 4;
1069 } 1071 }
1072
1073 if (excl_check) {
1074 const u32 *excl_bmval = server->exclcreat_bitmask;
1075 bmval[0] &= excl_bmval[0];
1076 bmval[1] &= excl_bmval[1];
1077 bmval[2] &= excl_bmval[2];
1078
1079 if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
1080 label = NULL;
1081 }
1082
1070 if (label) { 1083 if (label) {
1071 len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); 1084 len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
1072 bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; 1085 bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
1154 case NF4LNK: 1167 case NF4LNK:
1155 p = reserve_space(xdr, 4); 1168 p = reserve_space(xdr, 4);
1156 *p = cpu_to_be32(create->u.symlink.len); 1169 *p = cpu_to_be32(create->u.symlink.len);
1157 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); 1170 xdr_write_pages(xdr, create->u.symlink.pages, 0,
1171 create->u.symlink.len);
1172 xdr->buf->flags |= XDRBUF_WRITE;
1158 break; 1173 break;
1159 1174
1160 case NF4BLK: case NF4CHR: 1175 case NF4BLK: case NF4CHR:
@@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
1168 } 1183 }
1169 1184
1170 encode_string(xdr, create->name->len, create->name->name); 1185 encode_string(xdr, create->name->len, create->name->name);
1171 encode_attrs(xdr, create->attrs, create->label, create->server); 1186 encode_attrs(xdr, create->attrs, create->label, create->server, false);
1172} 1187}
1173 1188
1174static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) 1189static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1382 1397
1383static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1398static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
1384{ 1399{
1385 struct iattr dummy;
1386 __be32 *p; 1400 __be32 *p;
1387 1401
1388 p = reserve_space(xdr, 4); 1402 p = reserve_space(xdr, 4);
1389 switch(arg->createmode) { 1403 switch(arg->createmode) {
1390 case NFS4_CREATE_UNCHECKED: 1404 case NFS4_CREATE_UNCHECKED:
1391 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); 1405 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
1392 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); 1406 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
1393 break; 1407 break;
1394 case NFS4_CREATE_GUARDED: 1408 case NFS4_CREATE_GUARDED:
1395 *p = cpu_to_be32(NFS4_CREATE_GUARDED); 1409 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1396 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); 1410 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
1397 break; 1411 break;
1398 case NFS4_CREATE_EXCLUSIVE: 1412 case NFS4_CREATE_EXCLUSIVE:
1399 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); 1413 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1402 case NFS4_CREATE_EXCLUSIVE4_1: 1416 case NFS4_CREATE_EXCLUSIVE4_1:
1403 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); 1417 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
1404 encode_nfs4_verifier(xdr, &arg->u.verifier); 1418 encode_nfs4_verifier(xdr, &arg->u.verifier);
1405 dummy.ia_valid = 0; 1419 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
1406 encode_attrs(xdr, &dummy, arg->label, arg->server);
1407 } 1420 }
1408} 1421}
1409 1422
@@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1659{ 1672{
1660 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); 1673 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
1661 encode_nfs4_stateid(xdr, &arg->stateid); 1674 encode_nfs4_stateid(xdr, &arg->stateid);
1662 encode_attrs(xdr, arg->iap, arg->label, server); 1675 encode_attrs(xdr, arg->iap, arg->label, server, false);
1663} 1676}
1664 1677
1665static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) 1678static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
2580 struct xdr_stream *xdr, 2593 struct xdr_stream *xdr,
2581 struct nfs4_server_caps_arg *args) 2594 struct nfs4_server_caps_arg *args)
2582{ 2595{
2596 const u32 *bitmask = args->bitmask;
2583 struct compound_hdr hdr = { 2597 struct compound_hdr hdr = {
2584 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2598 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2585 }; 2599 };
@@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
2587 encode_compound_hdr(xdr, req, &hdr); 2601 encode_compound_hdr(xdr, req, &hdr);
2588 encode_sequence(xdr, &args->seq_args, &hdr); 2602 encode_sequence(xdr, &args->seq_args, &hdr);
2589 encode_putfh(xdr, args->fhandle, &hdr); 2603 encode_putfh(xdr, args->fhandle, &hdr);
2590 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 2604 encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
2591 FATTR4_WORD0_FH_EXPIRE_TYPE|
2592 FATTR4_WORD0_LINK_SUPPORT|
2593 FATTR4_WORD0_SYMLINK_SUPPORT|
2594 FATTR4_WORD0_ACLSUPPORT, &hdr);
2595 encode_nops(&hdr); 2605 encode_nops(&hdr);
2596} 2606}
2597 2607
@@ -3368,6 +3378,22 @@ out_overflow:
3368 return -EIO; 3378 return -EIO;
3369} 3379}
3370 3380
3381static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
3382 uint32_t *bitmap, uint32_t *bitmask)
3383{
3384 if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
3385 int ret;
3386 ret = decode_attr_bitmap(xdr, bitmask);
3387 if (unlikely(ret < 0))
3388 return ret;
3389 bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
3390 } else
3391 bitmask[0] = bitmask[1] = bitmask[2] = 0;
3392 dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
3393 bitmask[0], bitmask[1], bitmask[2]);
3394 return 0;
3395}
3396
3371static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) 3397static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
3372{ 3398{
3373 __be32 *p; 3399 __be32 *p;
@@ -4321,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
4321 goto xdr_error; 4347 goto xdr_error;
4322 if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) 4348 if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
4323 goto xdr_error; 4349 goto xdr_error;
4350 if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
4351 res->exclcreat_bitmask)) != 0)
4352 goto xdr_error;
4324 status = verify_attr_len(xdr, savep, attrlen); 4353 status = verify_attr_len(xdr, savep, attrlen);
4325xdr_error: 4354xdr_error:
4326 dprintk("%s: xdr returned %d!\n", __func__, -status); 4355 dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -4903,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr)
4903} 4932}
4904 4933
4905/* This is too sick! */ 4934/* This is too sick! */
4906static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) 4935static int decode_space_limit(struct xdr_stream *xdr,
4936 unsigned long *pagemod_limit)
4907{ 4937{
4908 __be32 *p; 4938 __be32 *p;
4909 uint32_t limit_type, nblocks, blocksize; 4939 uint32_t limit_type, nblocks, blocksize;
4940 u64 maxsize = 0;
4910 4941
4911 p = xdr_inline_decode(xdr, 12); 4942 p = xdr_inline_decode(xdr, 12);
4912 if (unlikely(!p)) 4943 if (unlikely(!p))
4913 goto out_overflow; 4944 goto out_overflow;
4914 limit_type = be32_to_cpup(p++); 4945 limit_type = be32_to_cpup(p++);
4915 switch (limit_type) { 4946 switch (limit_type) {
4916 case 1: 4947 case NFS4_LIMIT_SIZE:
4917 xdr_decode_hyper(p, maxsize); 4948 xdr_decode_hyper(p, &maxsize);
4918 break; 4949 break;
4919 case 2: 4950 case NFS4_LIMIT_BLOCKS:
4920 nblocks = be32_to_cpup(p++); 4951 nblocks = be32_to_cpup(p++);
4921 blocksize = be32_to_cpup(p); 4952 blocksize = be32_to_cpup(p);
4922 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; 4953 maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
4923 } 4954 }
4955 maxsize >>= PAGE_CACHE_SHIFT;
4956 *pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
4924 return 0; 4957 return 0;
4925out_overflow: 4958out_overflow:
4926 print_overflow_msg(__func__, xdr); 4959 print_overflow_msg(__func__, xdr);
@@ -4948,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
4948 break; 4981 break;
4949 case NFS4_OPEN_DELEGATE_WRITE: 4982 case NFS4_OPEN_DELEGATE_WRITE:
4950 res->delegation_type = FMODE_WRITE|FMODE_READ; 4983 res->delegation_type = FMODE_WRITE|FMODE_READ;
4951 if (decode_space_limit(xdr, &res->maxsize) < 0) 4984 if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
4952 return -EIO; 4985 return -EIO;
4953 } 4986 }
4954 return decode_ace(xdr, NULL, res->server->nfs_client); 4987 return decode_ace(xdr, NULL, res->server->nfs_client);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4984bbe55ff1..7c5718ba625e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
77void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) 77void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
78{ 78{
79 spin_lock(&hdr->lock); 79 spin_lock(&hdr->lock);
80 if (pos < hdr->io_start + hdr->good_bytes) { 80 if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
81 set_bit(NFS_IOHDR_ERROR, &hdr->flags); 81 || pos < hdr->io_start + hdr->good_bytes) {
82 clear_bit(NFS_IOHDR_EOF, &hdr->flags); 82 clear_bit(NFS_IOHDR_EOF, &hdr->flags);
83 hdr->good_bytes = pos - hdr->io_start; 83 hdr->good_bytes = pos - hdr->io_start;
84 hdr->error = error; 84 hdr->error = error;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70bf706b1090..ba1246433794 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
368 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) 368 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
369 return false; 369 return false;
370 lo->plh_return_iomode = 0; 370 lo->plh_return_iomode = 0;
371 lo->plh_block_lgets++;
372 pnfs_get_layout_hdr(lo); 371 pnfs_get_layout_hdr(lo);
373 clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); 372 clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
374 return true; 373 return true;
@@ -817,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
817 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 816 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
818} 817}
819 818
820static bool
821pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
822 struct pnfs_layout_range *range)
823{
824 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
825 (lo->plh_return_iomode == IOMODE_ANY ||
826 lo->plh_return_iomode == range->iomode);
827}
828
829/* lget is set to 1 if called from inside send_layoutget call chain */ 819/* lget is set to 1 if called from inside send_layoutget call chain */
830static bool 820static bool
831pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, 821pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
832 struct pnfs_layout_range *range, int lget)
833{ 822{
834 return lo->plh_block_lgets || 823 return lo->plh_block_lgets ||
835 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 824 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
836 (list_empty(&lo->plh_segs) &&
837 (atomic_read(&lo->plh_outstanding) > lget)) ||
838 pnfs_layout_returning(lo, range);
839} 825}
840 826
841int 827int
@@ -847,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
847 833
848 dprintk("--> %s\n", __func__); 834 dprintk("--> %s\n", __func__);
849 spin_lock(&lo->plh_inode->i_lock); 835 spin_lock(&lo->plh_inode->i_lock);
850 if (pnfs_layoutgets_blocked(lo, range, 1)) { 836 if (pnfs_layoutgets_blocked(lo)) {
851 status = -EAGAIN; 837 status = -EAGAIN;
852 } else if (!nfs4_valid_open_stateid(open_state)) { 838 } else if (!nfs4_valid_open_stateid(open_state)) {
853 status = -EBADF; 839 status = -EBADF;
@@ -882,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
882 struct nfs_server *server = NFS_SERVER(ino); 868 struct nfs_server *server = NFS_SERVER(ino);
883 struct nfs4_layoutget *lgp; 869 struct nfs4_layoutget *lgp;
884 struct pnfs_layout_segment *lseg; 870 struct pnfs_layout_segment *lseg;
871 loff_t i_size;
885 872
886 dprintk("--> %s\n", __func__); 873 dprintk("--> %s\n", __func__);
887 874
@@ -889,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo,
889 if (lgp == NULL) 876 if (lgp == NULL)
890 return NULL; 877 return NULL;
891 878
879 i_size = i_size_read(ino);
880
892 lgp->args.minlength = PAGE_CACHE_SIZE; 881 lgp->args.minlength = PAGE_CACHE_SIZE;
893 if (lgp->args.minlength > range->length) 882 if (lgp->args.minlength > range->length)
894 lgp->args.minlength = range->length; 883 lgp->args.minlength = range->length;
884 if (range->iomode == IOMODE_READ) {
885 if (range->offset >= i_size)
886 lgp->args.minlength = 0;
887 else if (i_size - range->offset < lgp->args.minlength)
888 lgp->args.minlength = i_size - range->offset;
889 }
895 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 890 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
896 lgp->args.range = *range; 891 lgp->args.range = *range;
897 lgp->args.type = server->pnfs_curr_ld->id; 892 lgp->args.type = server->pnfs_curr_ld->id;
@@ -956,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
956 if (unlikely(lrp == NULL)) { 951 if (unlikely(lrp == NULL)) {
957 status = -ENOMEM; 952 status = -ENOMEM;
958 spin_lock(&ino->i_lock); 953 spin_lock(&ino->i_lock);
959 lo->plh_block_lgets--;
960 pnfs_clear_layoutreturn_waitbit(lo); 954 pnfs_clear_layoutreturn_waitbit(lo);
961 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
962 spin_unlock(&ino->i_lock); 955 spin_unlock(&ino->i_lock);
963 pnfs_put_layout_hdr(lo); 956 pnfs_put_layout_hdr(lo);
964 goto out; 957 goto out;
@@ -1080,15 +1073,14 @@ bool pnfs_roc(struct inode *ino)
1080 struct pnfs_layout_segment *lseg, *tmp; 1073 struct pnfs_layout_segment *lseg, *tmp;
1081 nfs4_stateid stateid; 1074 nfs4_stateid stateid;
1082 LIST_HEAD(tmp_list); 1075 LIST_HEAD(tmp_list);
1083 bool found = false, layoutreturn = false; 1076 bool found = false, layoutreturn = false, roc = false;
1084 1077
1085 spin_lock(&ino->i_lock); 1078 spin_lock(&ino->i_lock);
1086 lo = nfsi->layout; 1079 lo = nfsi->layout;
1087 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1080 if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
1088 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
1089 goto out_noroc; 1081 goto out_noroc;
1090 1082
1091 /* Don't return layout if we hold a delegation */ 1083 /* no roc if we hold a delegation */
1092 if (nfs4_check_delegation(ino, FMODE_READ)) 1084 if (nfs4_check_delegation(ino, FMODE_READ))
1093 goto out_noroc; 1085 goto out_noroc;
1094 1086
@@ -1099,34 +1091,41 @@ bool pnfs_roc(struct inode *ino)
1099 goto out_noroc; 1091 goto out_noroc;
1100 } 1092 }
1101 1093
1094 stateid = lo->plh_stateid;
1095 /* always send layoutreturn if being marked so */
1096 if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1097 &lo->plh_flags))
1098 layoutreturn = pnfs_prepare_layoutreturn(lo);
1099
1102 pnfs_clear_retry_layoutget(lo); 1100 pnfs_clear_retry_layoutget(lo);
1103 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1101 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
1104 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1102 /* If we are sending layoutreturn, invalidate all valid lsegs */
1103 if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
1105 mark_lseg_invalid(lseg, &tmp_list); 1104 mark_lseg_invalid(lseg, &tmp_list);
1106 found = true; 1105 found = true;
1107 } 1106 }
1108 if (!found) 1107 /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put
1109 goto out_noroc; 1108 * in pnfs_roc_release(). We don't really send a layoutreturn but
1110 lo->plh_block_lgets++; 1109 * still want others to view us like we are sending one!
1111 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1110 *
1112 spin_unlock(&ino->i_lock); 1111 * If pnfs_prepare_layoutreturn() fails, it means someone else is doing
1113 pnfs_free_lseg_list(&tmp_list); 1112 * LAYOUTRETURN, so we proceed like there are no layouts to return.
1114 pnfs_layoutcommit_inode(ino, true); 1113 *
1115 return true; 1114 * ROC in three conditions:
1115 * 1. there are ROC lsegs
1116 * 2. we don't send layoutreturn
1117 * 3. no others are sending layoutreturn
1118 */
1119 if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo))
1120 roc = true;
1116 1121
1117out_noroc: 1122out_noroc:
1118 if (lo) {
1119 stateid = lo->plh_stateid;
1120 if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1121 &lo->plh_flags))
1122 layoutreturn = pnfs_prepare_layoutreturn(lo);
1123 }
1124 spin_unlock(&ino->i_lock); 1123 spin_unlock(&ino->i_lock);
1125 if (layoutreturn) { 1124 pnfs_free_lseg_list(&tmp_list);
1126 pnfs_layoutcommit_inode(ino, true); 1125 pnfs_layoutcommit_inode(ino, true);
1126 if (layoutreturn)
1127 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); 1127 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
1128 } 1128 return roc;
1129 return false;
1130} 1129}
1131 1130
1132void pnfs_roc_release(struct inode *ino) 1131void pnfs_roc_release(struct inode *ino)
@@ -1135,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino)
1135 1134
1136 spin_lock(&ino->i_lock); 1135 spin_lock(&ino->i_lock);
1137 lo = NFS_I(ino)->layout; 1136 lo = NFS_I(ino)->layout;
1138 lo->plh_block_lgets--; 1137 pnfs_clear_layoutreturn_waitbit(lo);
1139 if (atomic_dec_and_test(&lo->plh_refcount)) { 1138 if (atomic_dec_and_test(&lo->plh_refcount)) {
1140 pnfs_detach_layout_hdr(lo); 1139 pnfs_detach_layout_hdr(lo);
1141 spin_unlock(&ino->i_lock); 1140 spin_unlock(&ino->i_lock);
@@ -1153,27 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
1153 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) 1152 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
1154 lo->plh_barrier = barrier; 1153 lo->plh_barrier = barrier;
1155 spin_unlock(&ino->i_lock); 1154 spin_unlock(&ino->i_lock);
1155 trace_nfs4_layoutreturn_on_close(ino, 0);
1156} 1156}
1157 1157
1158bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) 1158void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
1159{ 1159{
1160 struct nfs_inode *nfsi = NFS_I(ino); 1160 struct nfs_inode *nfsi = NFS_I(ino);
1161 struct pnfs_layout_hdr *lo; 1161 struct pnfs_layout_hdr *lo;
1162 struct pnfs_layout_segment *lseg;
1163 nfs4_stateid stateid;
1164 u32 current_seqid; 1162 u32 current_seqid;
1165 bool layoutreturn = false;
1166 1163
1167 spin_lock(&ino->i_lock); 1164 spin_lock(&ino->i_lock);
1168 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
1169 if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
1170 continue;
1171 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
1172 continue;
1173 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1174 spin_unlock(&ino->i_lock);
1175 return true;
1176 }
1177 lo = nfsi->layout; 1165 lo = nfsi->layout;
1178 current_seqid = be32_to_cpu(lo->plh_stateid.seqid); 1166 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
1179 1167
@@ -1181,19 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1181 * a barrier, we choose the worst-case barrier. 1169 * a barrier, we choose the worst-case barrier.
1182 */ 1170 */
1183 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1171 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1184 stateid = lo->plh_stateid;
1185 if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1186 &lo->plh_flags))
1187 layoutreturn = pnfs_prepare_layoutreturn(lo);
1188 if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
1189 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1190
1191 spin_unlock(&ino->i_lock); 1172 spin_unlock(&ino->i_lock);
1192 if (layoutreturn) {
1193 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
1194 return true;
1195 }
1196 return false;
1197} 1173}
1198 1174
1199/* 1175/*
@@ -1221,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
1221 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); 1197 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
1222} 1198}
1223 1199
1224static void 1200static bool
1225pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, 1201pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
1226 struct pnfs_layout_segment *lseg) 1202 const struct pnfs_layout_range *l2)
1227{ 1203{
1228 struct pnfs_layout_segment *lp; 1204 return pnfs_lseg_range_cmp(l1, l2) > 0;
1205}
1206
1207static bool
1208pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
1209 struct pnfs_layout_segment *old)
1210{
1211 return false;
1212}
1213
1214void
1215pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1216 struct pnfs_layout_segment *lseg,
1217 bool (*is_after)(const struct pnfs_layout_range *,
1218 const struct pnfs_layout_range *),
1219 bool (*do_merge)(struct pnfs_layout_segment *,
1220 struct pnfs_layout_segment *),
1221 struct list_head *free_me)
1222{
1223 struct pnfs_layout_segment *lp, *tmp;
1229 1224
1230 dprintk("%s:Begin\n", __func__); 1225 dprintk("%s:Begin\n", __func__);
1231 1226
1232 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 1227 list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
1233 if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) 1228 if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
1229 continue;
1230 if (do_merge(lseg, lp)) {
1231 mark_lseg_invalid(lp, free_me);
1232 continue;
1233 }
1234 if (is_after(&lseg->pls_range, &lp->pls_range))
1234 continue; 1235 continue;
1235 list_add_tail(&lseg->pls_list, &lp->pls_list); 1236 list_add_tail(&lseg->pls_list, &lp->pls_list);
1236 dprintk("%s: inserted lseg %p " 1237 dprintk("%s: inserted lseg %p "
@@ -1252,6 +1253,24 @@ out:
1252 1253
1253 dprintk("%s:Return\n", __func__); 1254 dprintk("%s:Return\n", __func__);
1254} 1255}
1256EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
1257
1258static void
1259pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1260 struct pnfs_layout_segment *lseg,
1261 struct list_head *free_me)
1262{
1263 struct inode *inode = lo->plh_inode;
1264 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1265
1266 if (ld->add_lseg != NULL)
1267 ld->add_lseg(lo, lseg, free_me);
1268 else
1269 pnfs_generic_layout_insert_lseg(lo, lseg,
1270 pnfs_lseg_range_is_after,
1271 pnfs_lseg_no_merge,
1272 free_me);
1273}
1255 1274
1256static struct pnfs_layout_hdr * 1275static struct pnfs_layout_hdr *
1257alloc_init_layout_hdr(struct inode *ino, 1276alloc_init_layout_hdr(struct inode *ino,
@@ -1344,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1344 ret = pnfs_get_lseg(lseg); 1363 ret = pnfs_get_lseg(lseg);
1345 break; 1364 break;
1346 } 1365 }
1347 if (lseg->pls_range.offset > range->offset)
1348 break;
1349 } 1366 }
1350 1367
1351 dprintk("%s:Return lseg %p ref %d\n", 1368 dprintk("%s:Return lseg %p ref %d\n",
@@ -1438,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1438 1455
1439static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) 1456static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1440{ 1457{
1458 if (!pnfs_should_retry_layoutget(lo))
1459 return false;
1441 /* 1460 /*
1442 * send layoutcommit as it can hold up layoutreturn due to lseg 1461 * send layoutcommit as it can hold up layoutreturn due to lseg
1443 * reference 1462 * reference
@@ -1484,6 +1503,9 @@ pnfs_update_layout(struct inode *ino,
1484 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 1503 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
1485 goto out; 1504 goto out;
1486 1505
1506 if (iomode == IOMODE_READ && i_size_read(ino) == 0)
1507 goto out;
1508
1487 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1509 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1488 goto out; 1510 goto out;
1489 1511
@@ -1533,8 +1555,7 @@ lookup_again:
1533 * Because we free lsegs before sending LAYOUTRETURN, we need to wait 1555 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1534 * for LAYOUTRETURN even if first is true. 1556 * for LAYOUTRETURN even if first is true.
1535 */ 1557 */
1536 if (!lseg && pnfs_should_retry_layoutget(lo) && 1558 if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1537 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1538 spin_unlock(&ino->i_lock); 1559 spin_unlock(&ino->i_lock);
1539 dprintk("%s wait for layoutreturn\n", __func__); 1560 dprintk("%s wait for layoutreturn\n", __func__);
1540 if (pnfs_prepare_to_retry_layoutget(lo)) { 1561 if (pnfs_prepare_to_retry_layoutget(lo)) {
@@ -1547,7 +1568,7 @@ lookup_again:
1547 goto out_put_layout_hdr; 1568 goto out_put_layout_hdr;
1548 } 1569 }
1549 1570
1550 if (pnfs_layoutgets_blocked(lo, &arg, 0)) 1571 if (pnfs_layoutgets_blocked(lo))
1551 goto out_unlock; 1572 goto out_unlock;
1552 atomic_inc(&lo->plh_outstanding); 1573 atomic_inc(&lo->plh_outstanding);
1553 spin_unlock(&ino->i_lock); 1574 spin_unlock(&ino->i_lock);
@@ -1593,6 +1614,26 @@ out_unlock:
1593} 1614}
1594EXPORT_SYMBOL_GPL(pnfs_update_layout); 1615EXPORT_SYMBOL_GPL(pnfs_update_layout);
1595 1616
1617static bool
1618pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
1619{
1620 switch (range->iomode) {
1621 case IOMODE_READ:
1622 case IOMODE_RW:
1623 break;
1624 default:
1625 return false;
1626 }
1627 if (range->offset == NFS4_MAX_UINT64)
1628 return false;
1629 if (range->length == 0)
1630 return false;
1631 if (range->length != NFS4_MAX_UINT64 &&
1632 range->length > NFS4_MAX_UINT64 - range->offset)
1633 return false;
1634 return true;
1635}
1636
1596struct pnfs_layout_segment * 1637struct pnfs_layout_segment *
1597pnfs_layout_process(struct nfs4_layoutget *lgp) 1638pnfs_layout_process(struct nfs4_layoutget *lgp)
1598{ 1639{
@@ -1601,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1601 struct pnfs_layout_segment *lseg; 1642 struct pnfs_layout_segment *lseg;
1602 struct inode *ino = lo->plh_inode; 1643 struct inode *ino = lo->plh_inode;
1603 LIST_HEAD(free_me); 1644 LIST_HEAD(free_me);
1604 int status = 0; 1645 int status = -EINVAL;
1646
1647 if (!pnfs_sanity_check_layout_range(&res->range))
1648 goto out;
1605 1649
1606 /* Inject layout blob into I/O device driver */ 1650 /* Inject layout blob into I/O device driver */
1607 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 1651 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
@@ -1619,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1619 lseg->pls_range = res->range; 1663 lseg->pls_range = res->range;
1620 1664
1621 spin_lock(&ino->i_lock); 1665 spin_lock(&ino->i_lock);
1622 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1666 if (pnfs_layoutgets_blocked(lo)) {
1623 dprintk("%s forget reply due to recall\n", __func__);
1624 goto out_forget_reply;
1625 }
1626
1627 if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
1628 dprintk("%s forget reply due to state\n", __func__); 1667 dprintk("%s forget reply due to state\n", __func__);
1629 goto out_forget_reply; 1668 goto out_forget_reply;
1630 } 1669 }
@@ -1651,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1651 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 1690 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1652 1691
1653 pnfs_get_lseg(lseg); 1692 pnfs_get_lseg(lseg);
1654 pnfs_layout_insert_lseg(lo, lseg); 1693 pnfs_layout_insert_lseg(lo, lseg, &free_me);
1655 1694
1656 if (res->return_on_close) { 1695 if (res->return_on_close)
1657 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); 1696 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
1658 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
1659 }
1660 1697
1661 spin_unlock(&ino->i_lock); 1698 spin_unlock(&ino->i_lock);
1662 pnfs_free_lseg_list(&free_me); 1699 pnfs_free_lseg_list(&free_me);
@@ -1692,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1692 lseg->pls_range.length); 1729 lseg->pls_range.length);
1693 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); 1730 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1694 mark_lseg_invalid(lseg, tmp_list); 1731 mark_lseg_invalid(lseg, tmp_list);
1732 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1733 &lo->plh_flags);
1695 } 1734 }
1696} 1735}
1697 1736
@@ -2267,7 +2306,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
2267 2306
2268#if IS_ENABLED(CONFIG_NFS_V4_2) 2307#if IS_ENABLED(CONFIG_NFS_V4_2)
2269int 2308int
2270pnfs_report_layoutstat(struct inode *inode) 2309pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2271{ 2310{
2272 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 2311 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2273 struct nfs_server *server = NFS_SERVER(inode); 2312 struct nfs_server *server = NFS_SERVER(inode);
@@ -2294,7 +2333,7 @@ pnfs_report_layoutstat(struct inode *inode)
2294 pnfs_get_layout_hdr(hdr); 2333 pnfs_get_layout_hdr(hdr);
2295 spin_unlock(&inode->i_lock); 2334 spin_unlock(&inode->i_lock);
2296 2335
2297 data = kzalloc(sizeof(*data), GFP_KERNEL); 2336 data = kzalloc(sizeof(*data), gfp_flags);
2298 if (!data) { 2337 if (!data) {
2299 status = -ENOMEM; 2338 status = -ENOMEM;
2300 goto out_put; 2339 goto out_put;
@@ -2324,3 +2363,7 @@ out_put:
2324} 2363}
2325EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); 2364EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
2326#endif 2365#endif
2366
2367unsigned int layoutstats_timer;
2368module_param(layoutstats_timer, uint, 0644);
2369EXPORT_SYMBOL_GPL(layoutstats_timer);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3e6ab7bfbabd..78c9351ff117 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,7 +94,6 @@ enum {
94 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 94 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
95 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 95 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
96 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 96 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
97 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
98 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 97 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
99 NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ 98 NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
100 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ 99 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
@@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type {
129 128
130 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 129 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
131 void (*free_lseg) (struct pnfs_layout_segment *lseg); 130 void (*free_lseg) (struct pnfs_layout_segment *lseg);
131 void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
132 struct pnfs_layout_segment *lseg,
133 struct list_head *free_me);
132 134
133 void (*return_range) (struct pnfs_layout_hdr *lo, 135 void (*return_range) (struct pnfs_layout_hdr *lo,
134 struct pnfs_layout_range *range); 136 struct pnfs_layout_range *range);
@@ -184,15 +186,15 @@ struct pnfs_layoutdriver_type {
184 186
185struct pnfs_layout_hdr { 187struct pnfs_layout_hdr {
186 atomic_t plh_refcount; 188 atomic_t plh_refcount;
189 atomic_t plh_outstanding; /* number of RPCs out */
187 struct list_head plh_layouts; /* other client layouts */ 190 struct list_head plh_layouts; /* other client layouts */
188 struct list_head plh_bulk_destroy; 191 struct list_head plh_bulk_destroy;
189 struct list_head plh_segs; /* layout segments list */ 192 struct list_head plh_segs; /* layout segments list */
190 nfs4_stateid plh_stateid;
191 atomic_t plh_outstanding; /* number of RPCs out */
192 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ 193 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
193 u32 plh_barrier; /* ignore lower seqids */
194 unsigned long plh_retry_timestamp; 194 unsigned long plh_retry_timestamp;
195 unsigned long plh_flags; 195 unsigned long plh_flags;
196 nfs4_stateid plh_stateid;
197 u32 plh_barrier; /* ignore lower seqids */
196 enum pnfs_iomode plh_return_iomode; 198 enum pnfs_iomode plh_return_iomode;
197 loff_t plh_lwb; /* last write byte for layoutcommit */ 199 loff_t plh_lwb; /* last write byte for layoutcommit */
198 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 200 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -267,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
267bool pnfs_roc(struct inode *ino); 269bool pnfs_roc(struct inode *ino);
268void pnfs_roc_release(struct inode *ino); 270void pnfs_roc_release(struct inode *ino);
269void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 271void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
270bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 272void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
271void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); 273void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
272void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 274void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
273int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 275int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -286,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
286 gfp_t gfp_flags); 288 gfp_t gfp_flags);
287void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); 289void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
288 290
291void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
292 struct pnfs_layout_segment *lseg,
293 bool (*is_after)(const struct pnfs_layout_range *lseg_range,
294 const struct pnfs_layout_range *old),
295 bool (*do_merge)(struct pnfs_layout_segment *lseg,
296 struct pnfs_layout_segment *old),
297 struct list_head *free_me);
298
289void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 299void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
290int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); 300int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
291int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); 301int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
@@ -529,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
529 nfss->pnfs_curr_ld->id == src->l_type); 539 nfss->pnfs_curr_ld->id == src->l_type);
530} 540}
531 541
542static inline u64
543pnfs_calc_offset_end(u64 offset, u64 len)
544{
545 if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
546 return NFS4_MAX_UINT64;
547 return offset + len - 1;
548}
549
550static inline u64
551pnfs_calc_offset_length(u64 offset, u64 end)
552{
553 if (end == NFS4_MAX_UINT64 || end <= offset)
554 return NFS4_MAX_UINT64;
555 return 1 + end - offset;
556}
557
558extern unsigned int layoutstats_timer;
559
532#ifdef NFS_DEBUG 560#ifdef NFS_DEBUG
533void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); 561void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
534#else 562#else
535static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) 563static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
536{ 564{
537} 565}
566
538#endif /* NFS_DEBUG */ 567#endif /* NFS_DEBUG */
539#else /* CONFIG_NFS_V4_1 */ 568#else /* CONFIG_NFS_V4_1 */
540 569
@@ -605,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
605{ 634{
606} 635}
607 636
608static inline bool 637static inline void
609pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) 638pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
610{ 639{
611 return false;
612} 640}
613 641
614static inline void set_pnfs_layoutdriver(struct nfs_server *s, 642static inline void set_pnfs_layoutdriver(struct nfs_server *s,
@@ -691,10 +719,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
691#endif /* CONFIG_NFS_V4_1 */ 719#endif /* CONFIG_NFS_V4_1 */
692 720
693#if IS_ENABLED(CONFIG_NFS_V4_2) 721#if IS_ENABLED(CONFIG_NFS_V4_2)
694int pnfs_report_layoutstat(struct inode *inode); 722int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
695#else 723#else
696static inline int 724static inline int
697pnfs_report_layoutstat(struct inode *inode) 725pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
698{ 726{
699 return 0; 727 return 0;
700} 728}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f37e25b6311c..24655b807d44 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
124 if (ret) { 124 if (ret) {
125 cinfo->ds->nwritten -= ret; 125 cinfo->ds->nwritten -= ret;
126 cinfo->ds->ncommitting += ret; 126 cinfo->ds->ncommitting += ret;
127 bucket->clseg = bucket->wlseg; 127 if (bucket->clseg == NULL)
128 if (list_empty(src)) 128 bucket->clseg = pnfs_get_lseg(bucket->wlseg);
129 if (list_empty(src)) {
130 pnfs_put_lseg_locked(bucket->wlseg);
129 bucket->wlseg = NULL; 131 bucket->wlseg = NULL;
130 else 132 }
131 pnfs_get_lseg(bucket->clseg);
132 } 133 }
133 return ret; 134 return ret;
134} 135}
@@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
182 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; 183 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
183 struct pnfs_commit_bucket *bucket; 184 struct pnfs_commit_bucket *bucket;
184 struct pnfs_layout_segment *freeme; 185 struct pnfs_layout_segment *freeme;
186 LIST_HEAD(pages);
185 int i; 187 int i;
186 188
189 spin_lock(cinfo->lock);
187 for (i = idx; i < fl_cinfo->nbuckets; i++) { 190 for (i = idx; i < fl_cinfo->nbuckets; i++) {
188 bucket = &fl_cinfo->buckets[i]; 191 bucket = &fl_cinfo->buckets[i];
189 if (list_empty(&bucket->committing)) 192 if (list_empty(&bucket->committing))
190 continue; 193 continue;
191 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
192 spin_lock(cinfo->lock);
193 freeme = bucket->clseg; 194 freeme = bucket->clseg;
194 bucket->clseg = NULL; 195 bucket->clseg = NULL;
196 list_splice_init(&bucket->committing, &pages);
195 spin_unlock(cinfo->lock); 197 spin_unlock(cinfo->lock);
198 nfs_retry_commit(&pages, freeme, cinfo, i);
196 pnfs_put_lseg(freeme); 199 pnfs_put_lseg(freeme);
200 spin_lock(cinfo->lock);
197 } 201 }
202 spin_unlock(cinfo->lock);
198} 203}
199 204
200static unsigned int 205static unsigned int
@@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
216 if (!data) 221 if (!data)
217 break; 222 break;
218 data->ds_commit_index = i; 223 data->ds_commit_index = i;
219 spin_lock(cinfo->lock);
220 data->lseg = bucket->clseg;
221 bucket->clseg = NULL;
222 spin_unlock(cinfo->lock);
223 list_add(&data->pages, list); 224 list_add(&data->pages, list);
224 nreq++; 225 nreq++;
225 } 226 }
@@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
229 return nreq; 230 return nreq;
230} 231}
231 232
233static inline
234void pnfs_fetch_commit_bucket_list(struct list_head *pages,
235 struct nfs_commit_data *data,
236 struct nfs_commit_info *cinfo)
237{
238 struct pnfs_commit_bucket *bucket;
239
240 bucket = &cinfo->ds->buckets[data->ds_commit_index];
241 spin_lock(cinfo->lock);
242 list_splice_init(&bucket->committing, pages);
243 data->lseg = bucket->clseg;
244 bucket->clseg = NULL;
245 spin_unlock(cinfo->lock);
246
247}
248
232/* This follows nfs_commit_list pretty closely */ 249/* This follows nfs_commit_list pretty closely */
233int 250int
234pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, 251pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
243 if (!list_empty(mds_pages)) { 260 if (!list_empty(mds_pages)) {
244 data = nfs_commitdata_alloc(); 261 data = nfs_commitdata_alloc();
245 if (data != NULL) { 262 if (data != NULL) {
246 data->lseg = NULL; 263 data->ds_commit_index = -1;
247 list_add(&data->pages, &list); 264 list_add(&data->pages, &list);
248 nreq++; 265 nreq++;
249 } else { 266 } else {
@@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
265 282
266 list_for_each_entry_safe(data, tmp, &list, pages) { 283 list_for_each_entry_safe(data, tmp, &list, pages) {
267 list_del_init(&data->pages); 284 list_del_init(&data->pages);
268 if (!data->lseg) { 285 if (data->ds_commit_index < 0) {
269 nfs_init_commit(data, mds_pages, NULL, cinfo); 286 nfs_init_commit(data, mds_pages, NULL, cinfo);
270 nfs_initiate_commit(NFS_CLIENT(inode), data, 287 nfs_initiate_commit(NFS_CLIENT(inode), data,
271 NFS_PROTO(data->inode), 288 NFS_PROTO(data->inode),
272 data->mds_ops, how, 0); 289 data->mds_ops, how, 0);
273 } else { 290 } else {
274 struct pnfs_commit_bucket *buckets; 291 LIST_HEAD(pages);
275 292
276 buckets = cinfo->ds->buckets; 293 pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
277 nfs_init_commit(data, 294 nfs_init_commit(data, &pages, data->lseg, cinfo);
278 &buckets[data->ds_commit_index].committing,
279 data->lseg,
280 cinfo);
281 initiate_commit(data, how); 295 initiate_commit(data, how);
282 } 296 }
283 } 297 }
@@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
359 return false; 373 return false;
360} 374}
361 375
376/*
377 * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does,
378 * declare a match.
379 */
362static bool 380static bool
363_same_data_server_addrs_locked(const struct list_head *dsaddrs1, 381_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
364 const struct list_head *dsaddrs2) 382 const struct list_head *dsaddrs2)
365{ 383{
366 struct nfs4_pnfs_ds_addr *da1, *da2; 384 struct nfs4_pnfs_ds_addr *da1, *da2;
367 385 struct sockaddr *sa1, *sa2;
368 /* step through both lists, comparing as we go */ 386 bool match = false;
369 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), 387
370 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); 388 list_for_each_entry(da1, dsaddrs1, da_node) {
371 da1 != NULL && da2 != NULL; 389 sa1 = (struct sockaddr *)&da1->da_addr;
372 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), 390 match = false;
373 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { 391 list_for_each_entry(da2, dsaddrs2, da_node) {
374 if (!same_sockaddr((struct sockaddr *)&da1->da_addr, 392 sa2 = (struct sockaddr *)&da2->da_addr;
375 (struct sockaddr *)&da2->da_addr)) 393 match = same_sockaddr(sa1, sa2);
376 return false; 394 if (match)
395 break;
396 }
397 if (!match)
398 break;
377 } 399 }
378 if (da1 == NULL && da2 == NULL) 400 return match;
379 return true;
380
381 return false;
382} 401}
383 402
384/* 403/*
@@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
863 } 882 }
864 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 883 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
865 cinfo->ds->nwritten++; 884 cinfo->ds->nwritten++;
866 spin_unlock(cinfo->lock);
867 885
868 nfs_request_add_commit_list(req, list, cinfo); 886 nfs_request_add_commit_list_locked(req, list, cinfo);
887 spin_unlock(cinfo->lock);
888 nfs_mark_page_unstable(req->wb_page, cinfo);
869} 889}
870EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); 890EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
871 891
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aa62004f1706..383a027de452 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -381,9 +381,12 @@ int __init register_nfs_fs(void)
381 ret = nfs_register_sysctl(); 381 ret = nfs_register_sysctl();
382 if (ret < 0) 382 if (ret < 0)
383 goto error_2; 383 goto error_2;
384 register_shrinker(&acl_shrinker); 384 ret = register_shrinker(&acl_shrinker);
385 if (ret < 0)
386 goto error_3;
385 return 0; 387 return 0;
386 388error_3:
389 nfs_unregister_sysctl();
387error_2: 390error_2:
388 unregister_nfs4_fs(); 391 unregister_nfs4_fs();
389error_1: 392error_1:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75a35a1afa79..388f48079c43 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -768,6 +768,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
768} 768}
769 769
770/** 770/**
771 * nfs_request_add_commit_list_locked - add request to a commit list
772 * @req: pointer to a struct nfs_page
773 * @dst: commit list head
774 * @cinfo: holds list lock and accounting info
775 *
776 * This sets the PG_CLEAN bit, updates the cinfo count of
777 * number of outstanding requests requiring a commit as well as
778 * the MM page stats.
779 *
780 * The caller must hold the cinfo->lock, and the nfs_page lock.
781 */
782void
783nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
784 struct nfs_commit_info *cinfo)
785{
786 set_bit(PG_CLEAN, &req->wb_flags);
787 nfs_list_add_request(req, dst);
788 cinfo->mds->ncommit++;
789}
790EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
791
792/**
771 * nfs_request_add_commit_list - add request to a commit list 793 * nfs_request_add_commit_list - add request to a commit list
772 * @req: pointer to a struct nfs_page 794 * @req: pointer to a struct nfs_page
773 * @dst: commit list head 795 * @dst: commit list head
@@ -784,13 +806,10 @@ void
784nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, 806nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
785 struct nfs_commit_info *cinfo) 807 struct nfs_commit_info *cinfo)
786{ 808{
787 set_bit(PG_CLEAN, &(req)->wb_flags);
788 spin_lock(cinfo->lock); 809 spin_lock(cinfo->lock);
789 nfs_list_add_request(req, dst); 810 nfs_request_add_commit_list_locked(req, dst, cinfo);
790 cinfo->mds->ncommit++;
791 spin_unlock(cinfo->lock); 811 spin_unlock(cinfo->lock);
792 if (!cinfo->dreq) 812 nfs_mark_page_unstable(req->wb_page, cinfo);
793 nfs_mark_page_unstable(req->wb_page);
794} 813}
795EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); 814EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
796 815
@@ -1793,7 +1812,7 @@ out_mark_dirty:
1793 return res; 1812 return res;
1794} 1813}
1795 1814
1796static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) 1815int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1797{ 1816{
1798 struct nfs_inode *nfsi = NFS_I(inode); 1817 struct nfs_inode *nfsi = NFS_I(inode);
1799 int flags = FLUSH_SYNC; 1818 int flags = FLUSH_SYNC;
@@ -1828,11 +1847,6 @@ out_mark_dirty:
1828 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1847 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1829 return ret; 1848 return ret;
1830} 1849}
1831
1832int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1833{
1834 return nfs_commit_unstable_pages(inode, wbc);
1835}
1836EXPORT_SYMBOL_GPL(nfs_write_inode); 1850EXPORT_SYMBOL_GPL(nfs_write_inode);
1837 1851
1838/* 1852/*
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9aa2796da90d..6d834dc9bbc8 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
101 } 101 }
102 102
103 nr_iomaps = be32_to_cpup(p++); 103 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; 104 expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
105 if (len != expected) { 105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n", 106 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected); 107 __func__, len, expected);
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index fdc79037c0e7..6de925fe8499 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -7,13 +7,6 @@
7struct iomap; 7struct iomap;
8struct xdr_stream; 8struct xdr_stream;
9 9
10enum pnfs_block_extent_state {
11 PNFS_BLOCK_READWRITE_DATA = 0,
12 PNFS_BLOCK_READ_DATA = 1,
13 PNFS_BLOCK_INVALID_DATA = 2,
14 PNFS_BLOCK_NONE_DATA = 3,
15};
16
17struct pnfs_block_extent { 10struct pnfs_block_extent {
18 struct nfsd4_deviceid vol_id; 11 struct nfsd4_deviceid vol_id;
19 u64 foff; 12 u64 foff;
@@ -21,14 +14,6 @@ struct pnfs_block_extent {
21 u64 soff; 14 u64 soff;
22 enum pnfs_block_extent_state es; 15 enum pnfs_block_extent_state es;
23}; 16};
24#define NFS4_BLOCK_EXTENT_SIZE 44
25
26enum pnfs_block_volume_type {
27 PNFS_BLOCK_VOLUME_SIMPLE = 0,
28 PNFS_BLOCK_VOLUME_SLICE = 1,
29 PNFS_BLOCK_VOLUME_CONCAT = 2,
30 PNFS_BLOCK_VOLUME_STRIPE = 3,
31};
32 17
33/* 18/*
34 * Random upper cap for the uuid length to avoid unbounded allocation. 19 * Random upper cap for the uuid length to avoid unbounded allocation.