author    Linus Torvalds <torvalds@linux-foundation.org>  2016-07-30 19:33:25 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-07-30 19:33:25 -0400
commit    7f155c702677d057d03b192ce652311de5434697 (patch)
tree      dcee0fbb463ec3e55cb50181180c7d175d5895c3
parent    d761f3ed6e71bcca724a6e9e39efcac65b7b4ac1 (diff)
parent    944171cbf499d3445c749f7c13c46de0a564a905 (diff)
Merge tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable bugfixes:
   - nfs: don't create zero-length requests
   - several LAYOUTGET bugfixes

  Features:
   - several performance related features
   - more aggressive caching when we can rely on close-to-open cache
     consistency
   - remove serialisation of O_DIRECT reads and writes
   - optimise several code paths to not flush to disk unnecessarily.
     However allow for the idiosyncracies of pNFS for those layout
     types that need to issue a LAYOUTCOMMIT before the metadata can be
     updated on the server.
   - SUNRPC updates to the client data receive path
   - pNFS/SCSI support RH/Fedora dm-mpath device nodes
   - pNFS files/flexfiles can now use unprivileged ports when the
     generic NFS mount options allow it.

  Bugfixes:
   - Don't use RDMA direct data placement together with data integrity
     or privacy security flavours
   - Remove the RDMA ALLPHYSICAL memory registration mode as it has
     potential security holes.
   - Several layout recall fixes to improve NFSv4.1 protocol compliance.
   - Fix an Oops in the pNFS files and flexfiles connection setup to
     the DS
   - Allow retry of operations that used a returned delegation stateid
   - Don't mark the inode as revalidated if a LAYOUTCOMMIT is
     outstanding
   - Fix writeback races in nfs4_copy_range() and
     nfs42_proc_deallocate()"

* tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits)
  pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding
  NFSv4: Clean up lookup of SECINFO_NO_NAME
  NFSv4.2: Fix warning "variable ‘stateids’ set but not used"
  NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’"
  SUNRPC: Fix a compiler warning in fs/nfs/clnt.c
  pNFS: Remove redundant smp_mb() from pnfs_init_lseg()
  pNFS: Cleanup - do layout segment initialisation in one place
  pNFS: Remove redundant stateid invalidation
  pNFS: Remove redundant pnfs_mark_layout_returned_if_empty()
  pNFS: Clear the layout metadata if the server changed the layout stateid
  pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid()
  NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id
  pNFS: Do not set plh_return_seq for non-callback related layoutreturns
  pNFS: Ensure layoutreturn acts as a completion for layout callbacks
  pNFS: Fix CB_LAYOUTRECALL stateid verification
  pNFS: Always update the layout barrier seqid on LAYOUTGET
  pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set
  pNFS: Clear the layout return tracking on layout reinitialisation
  pNFS: LAYOUTRETURN should only update the stateid if the layout is valid
  nfs: don't create zero-length requests
  ...
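The "remove serialisation of O_DIRECT reads and writes" item corresponds to the new fs/nfs/io.c (visible in the diffstat below) and to the nfs_start_io_read()/nfs_start_io_write()/nfs_start_io_direct() calls that replace inode_lock() and inode_dio_wait() in fs/nfs/file.c and fs/nfs/direct.c later in this diff. fs/nfs/io.c itself is not part of this excerpt, so the userspace sketch below is only an illustration of the locking model those helpers imply (buffered writers exclusive, buffered readers and O_DIRECT each shared, with a flag recording whether O_DIRECT is in flight); the struct, function names and pthread types are assumptions for the example, not the kernel implementation.

  #include <pthread.h>
  #include <stdbool.h>

  /*
   * Illustrative model only (not the kernel's fs/nfs/io.c).  Buffered
   * writers take the lock exclusively; buffered readers and O_DIRECT
   * requests take it shared, so O_DIRECT reads and writes no longer
   * serialise against each other.  Switching between buffered and
   * direct modes happens under the exclusive lock, which is where the
   * real code would flush or invalidate the page cache.
   */
  struct nfs_io_model {
          pthread_rwlock_t rwsem;   /* stand-in for the inode lock */
          bool odirect;             /* is O_DIRECT I/O in flight?  */
  };

  static void start_io_shared(struct nfs_io_model *m, bool want_odirect)
  {
          pthread_rwlock_rdlock(&m->rwsem);
          while (m->odirect != want_odirect) {
                  /* Mode switch: retake the lock exclusively, change the
                   * mode, then drop back to shared and recheck. */
                  pthread_rwlock_unlock(&m->rwsem);
                  pthread_rwlock_wrlock(&m->rwsem);
                  if (m->odirect != want_odirect)
                          m->odirect = want_odirect;
                  pthread_rwlock_unlock(&m->rwsem);
                  pthread_rwlock_rdlock(&m->rwsem);
          }
  }

  static void model_start_io_read(struct nfs_io_model *m)
  {
          start_io_shared(m, false);  /* buffered reads run concurrently */
  }

  static void model_start_io_direct(struct nfs_io_model *m)
  {
          start_io_shared(m, true);   /* O_DIRECT only excludes buffered I/O */
  }

  static void model_start_io_write(struct nfs_io_model *m)
  {
          pthread_rwlock_wrlock(&m->rwsem);  /* buffered writes stay exclusive */
          m->odirect = false;
  }

  static void model_end_io(struct nfs_io_model *m)
  {
          pthread_rwlock_unlock(&m->rwsem);
  }

  int main(void)
  {
          struct nfs_io_model m = {
                  .rwsem = PTHREAD_RWLOCK_INITIALIZER,
                  .odirect = false,
          };

          model_start_io_direct(&m);  /* O_DIRECT requests... */
          model_end_io(&m);
          model_start_io_read(&m);    /* ...and buffered reads each take it shared */
          model_end_io(&m);
          return 0;
  }

In the diff below, nfs_file_read() and nfs_file_splice_read() bracket their work with nfs_start_io_read()/nfs_end_io_read(), nfs_file_write() with nfs_start_io_write()/nfs_end_io_write(), and nfs_file_direct_read()/nfs_file_direct_write() with nfs_start_io_direct()/nfs_end_io_direct(), replacing the former inode_lock()/inode_dio_wait() calls.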
-rw-r--r--  fs/nfs/Makefile | 2
-rw-r--r--  fs/nfs/blocklayout/dev.c | 110
-rw-r--r--  fs/nfs/blocklayout/extent_tree.c | 27
-rw-r--r--  fs/nfs/callback_proc.c | 64
-rw-r--r--  fs/nfs/callback_xdr.c | 6
-rw-r--r--  fs/nfs/client.c | 22
-rw-r--r--  fs/nfs/dir.c | 52
-rw-r--r--  fs/nfs/direct.c | 93
-rw-r--r--  fs/nfs/file.c | 100
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 18
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 23
-rw-r--r--  fs/nfs/inode.c | 138
-rw-r--r--  fs/nfs/internal.h | 62
-rw-r--r--  fs/nfs/io.c | 147
-rw-r--r--  fs/nfs/nfs3client.c | 14
-rw-r--r--  fs/nfs/nfs42proc.c | 24
-rw-r--r--  fs/nfs/nfs42xdr.c | 12
-rw-r--r--  fs/nfs/nfs4_fs.h | 1
-rw-r--r--  fs/nfs/nfs4client.c | 26
-rw-r--r--  fs/nfs/nfs4file.c | 16
-rw-r--r--  fs/nfs/nfs4proc.c | 153
-rw-r--r--  fs/nfs/nfs4xdr.c | 11
-rw-r--r--  fs/nfs/nfstrace.h | 1
-rw-r--r--  fs/nfs/pnfs.c | 191
-rw-r--r--  fs/nfs/pnfs.h | 34
-rw-r--r--  fs/nfs/pnfs_nfs.c | 13
-rw-r--r--  fs/nfs/super.c | 14
-rw-r--r--  fs/nfs/write.c | 44
-rw-r--r--  include/linux/nfs_fs.h | 3
-rw-r--r--  include/linux/nfs_xdr.h | 5
-rw-r--r--  include/linux/sunrpc/auth.h | 9
-rw-r--r--  include/linux/sunrpc/gss_api.h | 2
-rw-r--r--  include/linux/sunrpc/sched.h | 5
-rw-r--r--  include/linux/sunrpc/xprtsock.h | 1
-rw-r--r--  net/sunrpc/auth.c | 8
-rw-r--r--  net/sunrpc/auth_generic.c | 9
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 3
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_mech.c | 2
-rw-r--r--  net/sunrpc/auth_gss/gss_mech_switch.c | 12
-rw-r--r--  net/sunrpc/auth_null.c | 1
-rw-r--r--  net/sunrpc/auth_unix.c | 1
-rw-r--r--  net/sunrpc/clnt.c | 2
-rw-r--r--  net/sunrpc/sched.c | 67
-rw-r--r--  net/sunrpc/svc.c | 8
-rw-r--r--  net/sunrpc/xprt.c | 14
-rw-r--r--  net/sunrpc/xprtmultipath.c | 8
-rw-r--r--  net/sunrpc/xprtrdma/Makefile | 2
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c | 378
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 369
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c | 122
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 274
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 40
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 242
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 118
-rw-r--r--  net/sunrpc/xprtsock.c | 125
55 files changed, 1748 insertions, 1500 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8664417955a2..6abdda209642 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7CFLAGS_nfstrace.o += -I$(src) 7CFLAGS_nfstrace.o += -I$(src)
8nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 8nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
9 direct.o pagelist.o read.o symlink.o unlink.o \ 9 io.o direct.o pagelist.o read.o symlink.o unlink.o \
10 write.o namespace.o mount_clnt.o nfstrace.o 10 write.o namespace.o mount_clnt.o nfstrace.o
11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
12nfs-$(CONFIG_SYSCTL) += sysctl.o 12nfs-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e5b89675263e..a69ef4e9c24c 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
65 if (!p) 65 if (!p)
66 return -EIO; 66 return -EIO;
67 b->simple.nr_sigs = be32_to_cpup(p++); 67 b->simple.nr_sigs = be32_to_cpup(p++);
68 if (!b->simple.nr_sigs) { 68 if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
69 dprintk("no signature\n"); 69 dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
70 return -EIO; 70 return -EIO;
71 } 71 }
72 72
@@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
89 memcpy(&b->simple.sigs[i].sig, p, 89 memcpy(&b->simple.sigs[i].sig, p,
90 b->simple.sigs[i].sig_len); 90 b->simple.sigs[i].sig_len);
91 91
92 b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; 92 b->simple.len += 8 + 4 + \
93 (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
93 } 94 }
94 break; 95 break;
95 case PNFS_BLOCK_VOLUME_SLICE: 96 case PNFS_BLOCK_VOLUME_SLICE:
@@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
104 p = xdr_inline_decode(xdr, 4); 105 p = xdr_inline_decode(xdr, 4);
105 if (!p) 106 if (!p)
106 return -EIO; 107 return -EIO;
108
107 b->concat.volumes_count = be32_to_cpup(p++); 109 b->concat.volumes_count = be32_to_cpup(p++);
110 if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
111 dprintk("Too many volumes: %d\n", b->concat.volumes_count);
112 return -EIO;
113 }
108 114
109 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); 115 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
110 if (!p) 116 if (!p)
@@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
116 p = xdr_inline_decode(xdr, 8 + 4); 122 p = xdr_inline_decode(xdr, 8 + 4);
117 if (!p) 123 if (!p)
118 return -EIO; 124 return -EIO;
125
119 p = xdr_decode_hyper(p, &b->stripe.chunk_size); 126 p = xdr_decode_hyper(p, &b->stripe.chunk_size);
120 b->stripe.volumes_count = be32_to_cpup(p++); 127 b->stripe.volumes_count = be32_to_cpup(p++);
128 if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
129 dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
130 return -EIO;
131 }
121 132
122 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); 133 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
123 if (!p) 134 if (!p)
@@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
224 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) 235 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
225{ 236{
226 struct pnfs_block_volume *v = &volumes[idx]; 237 struct pnfs_block_volume *v = &volumes[idx];
238 struct block_device *bdev;
227 dev_t dev; 239 dev_t dev;
228 240
229 dev = bl_resolve_deviceid(server, v, gfp_mask); 241 dev = bl_resolve_deviceid(server, v, gfp_mask);
230 if (!dev) 242 if (!dev)
231 return -EIO; 243 return -EIO;
232 244
233 d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); 245 bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
234 if (IS_ERR(d->bdev)) { 246 if (IS_ERR(bdev)) {
235 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", 247 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
236 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); 248 MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
237 return PTR_ERR(d->bdev); 249 return PTR_ERR(bdev);
238 } 250 }
251 d->bdev = bdev;
239 252
240 253
241 d->len = i_size_read(d->bdev->bd_inode); 254 d->len = i_size_read(d->bdev->bd_inode);
@@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
287 } 300 }
288} 301}
289 302
303/*
304 * Try to open the udev path for the WWN. At least on Debian the udev
305 * by-id path will always point to the dm-multipath device if one exists.
306 */
307static struct block_device *
308bl_open_udev_path(struct pnfs_block_volume *v)
309{
310 struct block_device *bdev;
311 const char *devname;
312
313 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
314 v->scsi.designator_len, v->scsi.designator);
315 if (!devname)
316 return ERR_PTR(-ENOMEM);
317
318 bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
319 if (IS_ERR(bdev)) {
320 pr_warn("pNFS: failed to open device %s (%ld)\n",
321 devname, PTR_ERR(bdev));
322 }
323
324 kfree(devname);
325 return bdev;
326}
327
328/*
329 * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
330 * wwn- links will only point to the first discovered SCSI device there.
331 */
332static struct block_device *
333bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
334{
335 struct block_device *bdev;
336 const char *devname;
337
338 devname = kasprintf(GFP_KERNEL,
339 "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
340 v->scsi.designator_type,
341 v->scsi.designator_len, v->scsi.designator);
342 if (!devname)
343 return ERR_PTR(-ENOMEM);
344
345 bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
346 kfree(devname);
347 return bdev;
348}
349
290static int 350static int
291bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, 351bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
292 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) 352 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
293{ 353{
294 struct pnfs_block_volume *v = &volumes[idx]; 354 struct pnfs_block_volume *v = &volumes[idx];
355 struct block_device *bdev;
295 const struct pr_ops *ops; 356 const struct pr_ops *ops;
296 const char *devname;
297 int error; 357 int error;
298 358
299 if (!bl_validate_designator(v)) 359 if (!bl_validate_designator(v))
300 return -EINVAL; 360 return -EINVAL;
301 361
302 switch (v->scsi.designator_len) { 362 bdev = bl_open_dm_mpath_udev_path(v);
303 case 8: 363 if (IS_ERR(bdev))
304 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", 364 bdev = bl_open_udev_path(v);
305 v->scsi.designator); 365 if (IS_ERR(bdev))
306 break; 366 return PTR_ERR(bdev);
307 case 12: 367 d->bdev = bdev;
308 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
309 v->scsi.designator);
310 break;
311 case 16:
312 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
313 v->scsi.designator);
314 break;
315 default:
316 return -EINVAL;
317 }
318
319 d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
320 if (IS_ERR(d->bdev)) {
321 pr_warn("pNFS: failed to open device %s (%ld)\n",
322 devname, PTR_ERR(d->bdev));
323 kfree(devname);
324 return PTR_ERR(d->bdev);
325 }
326
327 kfree(devname);
328 368
329 d->len = i_size_read(d->bdev->bd_inode); 369 d->len = i_size_read(d->bdev->bd_inode);
330 d->map = bl_map_simple; 370 d->map = bl_map_simple;
@@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
352 return 0; 392 return 0;
353 393
354out_blkdev_put: 394out_blkdev_put:
355 blkdev_put(d->bdev, FMODE_READ); 395 blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
356 return error; 396 return error;
357} 397}
358 398
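The two helpers added above build udev by-id paths from the SCSI designator: bl_parse_scsi() now tries bl_open_dm_mpath_udev_path() first, so the RH/Fedora "dm-uuid-mpath-" link is preferred when a dm-multipath device exists, and falls back to bl_open_udev_path() and the generic "wwn-0x" link otherwise. As a rough illustration of what those kasprintf() format strings produce, the userspace sketch below formats an example designator the same way; the designator bytes and type are made-up sample values, and the kernel's %*phN specifier is emulated with a hex loop.

  #include <stdio.h>

  /* Illustration only: print the two /dev/disk/by-id paths the pNFS SCSI
   * layout code tries for a given designator.  The bytes below are an
   * arbitrary example, not real data. */
  static void print_hex(const unsigned char *p, int len)
  {
          for (int i = 0; i < len; i++)
                  printf("%02x", p[i]);
  }

  int main(void)
  {
          const unsigned char designator[8] = {
                  0x60, 0x0a, 0x09, 0x80, 0x12, 0x34, 0x56, 0x78
          };
          int designator_type = 3;        /* e.g. an NAA designator */

          /* RH/Fedora dm-multipath link, tried first */
          printf("/dev/disk/by-id/dm-uuid-mpath-%d", designator_type);
          print_hex(designator, sizeof(designator));
          printf("\n");

          /* generic WWN link, used as the fallback */
          printf("/dev/disk/by-id/wwn-0x");
          print_hex(designator, sizeof(designator));
          printf("\n");
          return 0;
  }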
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 720b3ff55fa9..992bcb19c11e 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
121 return be; 121 return be;
122} 122}
123 123
124static void __ext_put_deviceids(struct list_head *head)
125{
126 struct pnfs_block_extent *be, *tmp;
127
128 list_for_each_entry_safe(be, tmp, head, be_list) {
129 nfs4_put_deviceid_node(be->be_device);
130 kfree(be);
131 }
132}
133
124static void 134static void
125__ext_tree_insert(struct rb_root *root, 135__ext_tree_insert(struct rb_root *root,
126 struct pnfs_block_extent *new, bool merge_ok) 136 struct pnfs_block_extent *new, bool merge_ok)
@@ -163,7 +173,8 @@ free_new:
163} 173}
164 174
165static int 175static int
166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) 176__ext_tree_remove(struct rb_root *root,
177 sector_t start, sector_t end, struct list_head *tmp)
167{ 178{
168 struct pnfs_block_extent *be; 179 struct pnfs_block_extent *be;
169 sector_t len1 = 0, len2 = 0; 180 sector_t len1 = 0, len2 = 0;
@@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
223 struct pnfs_block_extent *next = ext_tree_next(be); 234 struct pnfs_block_extent *next = ext_tree_next(be);
224 235
225 rb_erase(&be->be_node, root); 236 rb_erase(&be->be_node, root);
226 nfs4_put_deviceid_node(be->be_device); 237 list_add_tail(&be->be_list, tmp);
227 kfree(be);
228 be = next; 238 be = next;
229 } 239 }
230 240
@@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
350 sector_t start, sector_t end) 360 sector_t start, sector_t end)
351{ 361{
352 int err, err2; 362 int err, err2;
363 LIST_HEAD(tmp);
353 364
354 spin_lock(&bl->bl_ext_lock); 365 spin_lock(&bl->bl_ext_lock);
355 err = __ext_tree_remove(&bl->bl_ext_ro, start, end); 366 err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
356 if (rw) { 367 if (rw) {
357 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); 368 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
358 if (!err) 369 if (!err)
359 err = err2; 370 err = err2;
360 } 371 }
361 spin_unlock(&bl->bl_ext_lock); 372 spin_unlock(&bl->bl_ext_lock);
362 373
374 __ext_put_deviceids(&tmp);
363 return err; 375 return err;
364} 376}
365 377
@@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
396 sector_t end = start + len; 408 sector_t end = start + len;
397 struct pnfs_block_extent *be; 409 struct pnfs_block_extent *be;
398 int err = 0; 410 int err = 0;
411 LIST_HEAD(tmp);
399 412
400 spin_lock(&bl->bl_ext_lock); 413 spin_lock(&bl->bl_ext_lock);
401 /* 414 /*
402 * First remove all COW extents or holes from written to range. 415 * First remove all COW extents or holes from written to range.
403 */ 416 */
404 err = __ext_tree_remove(&bl->bl_ext_ro, start, end); 417 err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
405 if (err) 418 if (err)
406 goto out; 419 goto out;
407 420
@@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
459 } 472 }
460out: 473out:
461 spin_unlock(&bl->bl_ext_lock); 474 spin_unlock(&bl->bl_ext_lock);
475
476 __ext_put_deviceids(&tmp);
462 return err; 477 return err;
463} 478}
464 479
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index aaa2e8d3df6f..c92a75e066a6 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -119,27 +119,30 @@ out:
119 * hashed by filehandle. 119 * hashed by filehandle.
120 */ 120 */
121static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, 121static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
122 struct nfs_fh *fh, nfs4_stateid *stateid) 122 struct nfs_fh *fh)
123{ 123{
124 struct nfs_server *server; 124 struct nfs_server *server;
125 struct nfs_inode *nfsi;
125 struct inode *ino; 126 struct inode *ino;
126 struct pnfs_layout_hdr *lo; 127 struct pnfs_layout_hdr *lo;
127 128
129restart:
128 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 130 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
129 list_for_each_entry(lo, &server->layouts, plh_layouts) { 131 list_for_each_entry(lo, &server->layouts, plh_layouts) {
130 if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) 132 nfsi = NFS_I(lo->plh_inode);
133 if (nfs_compare_fh(fh, &nfsi->fh))
131 continue; 134 continue;
132 if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) 135 if (nfsi->layout != lo)
133 continue; 136 continue;
134 ino = igrab(lo->plh_inode); 137 ino = igrab(lo->plh_inode);
135 if (!ino) 138 if (!ino)
136 break; 139 break;
137 spin_lock(&ino->i_lock); 140 spin_lock(&ino->i_lock);
138 /* Is this layout in the process of being freed? */ 141 /* Is this layout in the process of being freed? */
139 if (NFS_I(ino)->layout != lo) { 142 if (nfsi->layout != lo) {
140 spin_unlock(&ino->i_lock); 143 spin_unlock(&ino->i_lock);
141 iput(ino); 144 iput(ino);
142 break; 145 goto restart;
143 } 146 }
144 pnfs_get_layout_hdr(lo); 147 pnfs_get_layout_hdr(lo);
145 spin_unlock(&ino->i_lock); 148 spin_unlock(&ino->i_lock);
@@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
151} 154}
152 155
153static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, 156static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
154 struct nfs_fh *fh, nfs4_stateid *stateid) 157 struct nfs_fh *fh)
155{ 158{
156 struct pnfs_layout_hdr *lo; 159 struct pnfs_layout_hdr *lo;
157 160
158 spin_lock(&clp->cl_lock); 161 spin_lock(&clp->cl_lock);
159 rcu_read_lock(); 162 rcu_read_lock();
160 lo = get_layout_by_fh_locked(clp, fh, stateid); 163 lo = get_layout_by_fh_locked(clp, fh);
161 rcu_read_unlock(); 164 rcu_read_unlock();
162 spin_unlock(&clp->cl_lock); 165 spin_unlock(&clp->cl_lock);
163 166
@@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
167/* 170/*
168 * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) 171 * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
169 */ 172 */
170static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, 173static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
171 const nfs4_stateid *new) 174 const nfs4_stateid *new)
172{ 175{
173 u32 oldseq, newseq; 176 u32 oldseq, newseq;
174 177
175 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 178 /* Is the stateid still not initialised? */
179 if (!pnfs_layout_is_valid(lo))
180 return NFS4ERR_DELAY;
181
182 /* Mismatched stateid? */
183 if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
184 return NFS4ERR_BAD_STATEID;
185
176 newseq = be32_to_cpu(new->seqid); 186 newseq = be32_to_cpu(new->seqid);
187 /* Are we already in a layout recall situation? */
188 if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
189 lo->plh_return_seq != 0) {
190 if (newseq < lo->plh_return_seq)
191 return NFS4ERR_OLD_STATEID;
192 if (newseq > lo->plh_return_seq)
193 return NFS4ERR_DELAY;
194 goto out;
195 }
177 196
197 /* Check that the stateid matches what we think it should be. */
198 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
178 if (newseq > oldseq + 1) 199 if (newseq > oldseq + 1)
179 return false; 200 return NFS4ERR_DELAY;
180 return true; 201 /* Crazy server! */
202 if (newseq <= oldseq)
203 return NFS4ERR_OLD_STATEID;
204out:
205 return NFS_OK;
181} 206}
182 207
183static u32 initiate_file_draining(struct nfs_client *clp, 208static u32 initiate_file_draining(struct nfs_client *clp,
@@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
188 u32 rv = NFS4ERR_NOMATCHING_LAYOUT; 213 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
189 LIST_HEAD(free_me_list); 214 LIST_HEAD(free_me_list);
190 215
191 lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); 216 lo = get_layout_by_fh(clp, &args->cbl_fh);
192 if (!lo) { 217 if (!lo) {
193 trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, 218 trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
194 &args->cbl_stateid, -rv); 219 &args->cbl_stateid, -rv);
@@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
196 } 221 }
197 222
198 ino = lo->plh_inode; 223 ino = lo->plh_inode;
224 pnfs_layoutcommit_inode(ino, false);
225
199 226
200 spin_lock(&ino->i_lock); 227 spin_lock(&ino->i_lock);
201 if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { 228 rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
202 rv = NFS4ERR_DELAY; 229 if (rv != NFS_OK)
203 goto unlock; 230 goto unlock;
204 }
205 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); 231 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
206 spin_unlock(&ino->i_lock);
207
208 pnfs_layoutcommit_inode(ino, false);
209 232
210 spin_lock(&ino->i_lock);
211 /* 233 /*
212 * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) 234 * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
213 */ 235 */
@@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
223 goto unlock; 245 goto unlock;
224 } 246 }
225 247
248 /* Embrace your forgetfulness! */
249 rv = NFS4ERR_NOMATCHING_LAYOUT;
250
226 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { 251 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
227 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, 252 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
228 &args->cbl_range); 253 &args->cbl_range);
229 } 254 }
230 pnfs_mark_layout_returned_if_empty(lo);
231unlock: 255unlock:
232 spin_unlock(&ino->i_lock); 256 spin_unlock(&ino->i_lock);
233 pnfs_free_lseg_list(&free_me_list); 257 pnfs_free_lseg_list(&free_me_list);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d81f96aacd51..656f68f7fe53 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
925 if (hdr_arg.minorversion == 0) { 925 if (hdr_arg.minorversion == 0) {
926 cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); 926 cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
927 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) 927 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
928 return rpc_drop_reply; 928 goto out_invalidcred;
929 } 929 }
930 930
931 cps.minorversion = hdr_arg.minorversion; 931 cps.minorversion = hdr_arg.minorversion;
@@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
953 nfs_put_client(cps.clp); 953 nfs_put_client(cps.clp);
954 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 954 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
955 return rpc_success; 955 return rpc_success;
956
957out_invalidcred:
958 pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
959 return rpc_autherr_badcred;
956} 960}
957 961
958/* 962/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 487c5607d52f..003ebce4bbc4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
367 */ 367 */
368struct nfs_client * 368struct nfs_client *
369nfs_get_client(const struct nfs_client_initdata *cl_init, 369nfs_get_client(const struct nfs_client_initdata *cl_init,
370 const struct rpc_timeout *timeparms,
371 const char *ip_addr,
372 rpc_authflavor_t authflavour) 370 rpc_authflavor_t authflavour)
373{ 371{
374 struct nfs_client *clp, *new = NULL; 372 struct nfs_client *clp, *new = NULL;
@@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
399 &nn->nfs_client_list); 397 &nn->nfs_client_list);
400 spin_unlock(&nn->nfs_client_lock); 398 spin_unlock(&nn->nfs_client_lock);
401 new->cl_flags = cl_init->init_flags; 399 new->cl_flags = cl_init->init_flags;
402 return rpc_ops->init_client(new, timeparms, ip_addr); 400 return rpc_ops->init_client(new, cl_init);
403 } 401 }
404 402
405 spin_unlock(&nn->nfs_client_lock); 403 spin_unlock(&nn->nfs_client_lock);
@@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
470 * Create an RPC client handle 468 * Create an RPC client handle
471 */ 469 */
472int nfs_create_rpc_client(struct nfs_client *clp, 470int nfs_create_rpc_client(struct nfs_client *clp,
473 const struct rpc_timeout *timeparms, 471 const struct nfs_client_initdata *cl_init,
474 rpc_authflavor_t flavor) 472 rpc_authflavor_t flavor)
475{ 473{
476 struct rpc_clnt *clnt = NULL; 474 struct rpc_clnt *clnt = NULL;
@@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp,
479 .protocol = clp->cl_proto, 477 .protocol = clp->cl_proto,
480 .address = (struct sockaddr *)&clp->cl_addr, 478 .address = (struct sockaddr *)&clp->cl_addr,
481 .addrsize = clp->cl_addrlen, 479 .addrsize = clp->cl_addrlen,
482 .timeout = timeparms, 480 .timeout = cl_init->timeparms,
483 .servername = clp->cl_hostname, 481 .servername = clp->cl_hostname,
482 .nodename = cl_init->nodename,
484 .program = &nfs_program, 483 .program = &nfs_program,
485 .version = clp->rpc_ops->version, 484 .version = clp->rpc_ops->version,
486 .authflavor = flavor, 485 .authflavor = flavor,
@@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
591 * nfs_init_client - Initialise an NFS2 or NFS3 client 590 * nfs_init_client - Initialise an NFS2 or NFS3 client
592 * 591 *
593 * @clp: nfs_client to initialise 592 * @clp: nfs_client to initialise
594 * @timeparms: timeout parameters for underlying RPC transport 593 * @cl_init: Initialisation parameters
595 * @ip_addr: IP presentation address (not used)
596 * 594 *
597 * Returns pointer to an NFS client, or an ERR_PTR value. 595 * Returns pointer to an NFS client, or an ERR_PTR value.
598 */ 596 */
599struct nfs_client *nfs_init_client(struct nfs_client *clp, 597struct nfs_client *nfs_init_client(struct nfs_client *clp,
600 const struct rpc_timeout *timeparms, 598 const struct nfs_client_initdata *cl_init)
601 const char *ip_addr)
602{ 599{
603 int error; 600 int error;
604 601
@@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
612 * Create a client RPC handle for doing FSSTAT with UNIX auth only 609 * Create a client RPC handle for doing FSSTAT with UNIX auth only
613 * - RFC 2623, sec 2.3.2 610 * - RFC 2623, sec 2.3.2
614 */ 611 */
615 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); 612 error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
616 if (error < 0) 613 if (error < 0)
617 goto error; 614 goto error;
618 nfs_mark_client_ready(clp, NFS_CS_READY); 615 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server,
633 const struct nfs_parsed_mount_data *data, 630 const struct nfs_parsed_mount_data *data,
634 struct nfs_subversion *nfs_mod) 631 struct nfs_subversion *nfs_mod)
635{ 632{
633 struct rpc_timeout timeparms;
636 struct nfs_client_initdata cl_init = { 634 struct nfs_client_initdata cl_init = {
637 .hostname = data->nfs_server.hostname, 635 .hostname = data->nfs_server.hostname,
638 .addr = (const struct sockaddr *)&data->nfs_server.address, 636 .addr = (const struct sockaddr *)&data->nfs_server.address,
@@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server,
640 .nfs_mod = nfs_mod, 638 .nfs_mod = nfs_mod,
641 .proto = data->nfs_server.protocol, 639 .proto = data->nfs_server.protocol,
642 .net = data->net, 640 .net = data->net,
641 .timeparms = &timeparms,
643 }; 642 };
644 struct rpc_timeout timeparms;
645 struct nfs_client *clp; 643 struct nfs_client *clp;
646 int error; 644 int error;
647 645
@@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server,
653 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 651 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
654 652
655 /* Allocate or find a client reference we can use */ 653 /* Allocate or find a client reference we can use */
656 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); 654 clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
657 if (IS_ERR(clp)) { 655 if (IS_ERR(clp)) {
658 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 656 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
659 return PTR_ERR(clp); 657 return PTR_ERR(clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index baaa38859899..177fefb26c18 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2252,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
2252 return NULL; 2252 return NULL;
2253} 2253}
2254 2254
2255static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 2255static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
2256{ 2256{
2257 struct nfs_inode *nfsi = NFS_I(inode); 2257 struct nfs_inode *nfsi = NFS_I(inode);
2258 struct nfs_access_entry *cache; 2258 struct nfs_access_entry *cache;
2259 int err = -ENOENT; 2259 bool retry = true;
2260 int err;
2260 2261
2261 spin_lock(&inode->i_lock); 2262 spin_lock(&inode->i_lock);
2262 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) 2263 for(;;) {
2263 goto out_zap; 2264 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
2264 cache = nfs_access_search_rbtree(inode, cred); 2265 goto out_zap;
2265 if (cache == NULL) 2266 cache = nfs_access_search_rbtree(inode, cred);
2266 goto out; 2267 err = -ENOENT;
2267 if (!nfs_have_delegated_attributes(inode) && 2268 if (cache == NULL)
2268 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 2269 goto out;
2269 goto out_stale; 2270 /* Found an entry, is our attribute cache valid? */
2271 if (!nfs_attribute_cache_expired(inode) &&
2272 !(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
2273 break;
2274 err = -ECHILD;
2275 if (!may_block)
2276 goto out;
2277 if (!retry)
2278 goto out_zap;
2279 spin_unlock(&inode->i_lock);
2280 err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
2281 if (err)
2282 return err;
2283 spin_lock(&inode->i_lock);
2284 retry = false;
2285 }
2270 res->jiffies = cache->jiffies; 2286 res->jiffies = cache->jiffies;
2271 res->cred = cache->cred; 2287 res->cred = cache->cred;
2272 res->mask = cache->mask; 2288 res->mask = cache->mask;
@@ -2275,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
2275out: 2291out:
2276 spin_unlock(&inode->i_lock); 2292 spin_unlock(&inode->i_lock);
2277 return err; 2293 return err;
2278out_stale:
2279 rb_erase(&cache->rb_node, &nfsi->access_cache);
2280 list_del(&cache->lru);
2281 spin_unlock(&inode->i_lock);
2282 nfs_access_free_entry(cache);
2283 return -ENOENT;
2284out_zap: 2294out_zap:
2285 spin_unlock(&inode->i_lock); 2295 spin_unlock(&inode->i_lock);
2286 nfs_access_zap_cache(inode); 2296 nfs_access_zap_cache(inode);
@@ -2307,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
2307 cache = NULL; 2317 cache = NULL;
2308 if (cache == NULL) 2318 if (cache == NULL)
2309 goto out; 2319 goto out;
2310 if (!nfs_have_delegated_attributes(inode) && 2320 err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
2311 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 2321 if (err)
2312 goto out; 2322 goto out;
2313 res->jiffies = cache->jiffies; 2323 res->jiffies = cache->jiffies;
2314 res->cred = cache->cred; 2324 res->cred = cache->cred;
2315 res->mask = cache->mask; 2325 res->mask = cache->mask;
2316 err = 0;
2317out: 2326out:
2318 rcu_read_unlock(); 2327 rcu_read_unlock();
2319 return err; 2328 return err;
@@ -2402,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask);
2402static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 2411static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2403{ 2412{
2404 struct nfs_access_entry cache; 2413 struct nfs_access_entry cache;
2414 bool may_block = (mask & MAY_NOT_BLOCK) == 0;
2405 int status; 2415 int status;
2406 2416
2407 trace_nfs_access_enter(inode); 2417 trace_nfs_access_enter(inode);
2408 2418
2409 status = nfs_access_get_cached_rcu(inode, cred, &cache); 2419 status = nfs_access_get_cached_rcu(inode, cred, &cache);
2410 if (status != 0) 2420 if (status != 0)
2411 status = nfs_access_get_cached(inode, cred, &cache); 2421 status = nfs_access_get_cached(inode, cred, &cache, may_block);
2412 if (status == 0) 2422 if (status == 0)
2413 goto out_cached; 2423 goto out_cached;
2414 2424
2415 status = -ECHILD; 2425 status = -ECHILD;
2416 if (mask & MAY_NOT_BLOCK) 2426 if (!may_block)
2417 goto out; 2427 goto out;
2418 2428
2419 /* Be clever: ask server to check for all possible rights */ 2429 /* Be clever: ask server to check for all possible rights */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6210ead71d0..72b7d13ee3c6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
196 WARN_ON_ONCE(verfp->committed < 0); 196 WARN_ON_ONCE(verfp->committed < 0);
197} 197}
198 198
199static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
200 const struct nfs_writeverf *v2)
201{
202 return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
203}
204
199/* 205/*
200 * nfs_direct_cmp_hdr_verf - compare verifier for pgio header 206 * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
201 * @dreq - direct request possibly spanning multiple servers 207 * @dreq - direct request possibly spanning multiple servers
@@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
215 nfs_direct_set_hdr_verf(dreq, hdr); 221 nfs_direct_set_hdr_verf(dreq, hdr);
216 return 0; 222 return 0;
217 } 223 }
218 return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 224 return nfs_direct_cmp_verf(verfp, &hdr->verf);
219} 225}
220 226
221/* 227/*
@@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
238 if (verfp->committed < 0) 244 if (verfp->committed < 0)
239 return 1; 245 return 1;
240 246
241 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 247 return nfs_direct_cmp_verf(verfp, &data->verf);
242} 248}
243 249
244/** 250/**
@@ -366,22 +372,10 @@ out:
366 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust 372 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
367 * the iocb is still valid here if this is a synchronous request. 373 * the iocb is still valid here if this is a synchronous request.
368 */ 374 */
369static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) 375static void nfs_direct_complete(struct nfs_direct_req *dreq)
370{ 376{
371 struct inode *inode = dreq->inode; 377 struct inode *inode = dreq->inode;
372 378
373 if (dreq->iocb && write) {
374 loff_t pos = dreq->iocb->ki_pos + dreq->count;
375
376 spin_lock(&inode->i_lock);
377 if (i_size_read(inode) < pos)
378 i_size_write(inode, pos);
379 spin_unlock(&inode->i_lock);
380 }
381
382 if (write)
383 nfs_zap_mapping(inode, inode->i_mapping);
384
385 inode_dio_end(inode); 379 inode_dio_end(inode);
386 380
387 if (dreq->iocb) { 381 if (dreq->iocb) {
@@ -436,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
436 } 430 }
437out_put: 431out_put:
438 if (put_dreq(dreq)) 432 if (put_dreq(dreq))
439 nfs_direct_complete(dreq, false); 433 nfs_direct_complete(dreq);
440 hdr->release(hdr); 434 hdr->release(hdr);
441} 435}
442 436
@@ -542,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
542 } 536 }
543 537
544 if (put_dreq(dreq)) 538 if (put_dreq(dreq))
545 nfs_direct_complete(dreq, false); 539 nfs_direct_complete(dreq);
546 return 0; 540 return 0;
547} 541}
548 542
@@ -583,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
583 if (!count) 577 if (!count)
584 goto out; 578 goto out;
585 579
586 inode_lock(inode);
587 result = nfs_sync_mapping(mapping);
588 if (result)
589 goto out_unlock;
590
591 task_io_account_read(count); 580 task_io_account_read(count);
592 581
593 result = -ENOMEM; 582 result = -ENOMEM;
594 dreq = nfs_direct_req_alloc(); 583 dreq = nfs_direct_req_alloc();
595 if (dreq == NULL) 584 if (dreq == NULL)
596 goto out_unlock; 585 goto out;
597 586
598 dreq->inode = inode; 587 dreq->inode = inode;
599 dreq->bytes_left = dreq->max_count = count; 588 dreq->bytes_left = dreq->max_count = count;
@@ -608,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
608 if (!is_sync_kiocb(iocb)) 597 if (!is_sync_kiocb(iocb))
609 dreq->iocb = iocb; 598 dreq->iocb = iocb;
610 599
600 nfs_start_io_direct(inode);
601
611 NFS_I(inode)->read_io += count; 602 NFS_I(inode)->read_io += count;
612 result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); 603 result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
613 604
614 inode_unlock(inode); 605 nfs_end_io_direct(inode);
615 606
616 if (!result) { 607 if (!result) {
617 result = nfs_direct_wait(dreq); 608 result = nfs_direct_wait(dreq);
@@ -619,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
619 iocb->ki_pos += result; 610 iocb->ki_pos += result;
620 } 611 }
621 612
622 nfs_direct_req_release(dreq);
623 return result;
624
625out_release: 613out_release:
626 nfs_direct_req_release(dreq); 614 nfs_direct_req_release(dreq);
627out_unlock:
628 inode_unlock(inode);
629out: 615out:
630 return result; 616 return result;
631} 617}
@@ -657,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
657 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); 643 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
658 644
659 dreq->count = 0; 645 dreq->count = 0;
646 dreq->verf.committed = NFS_INVALID_STABLE_HOW;
647 nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
660 for (i = 0; i < dreq->mirror_count; i++) 648 for (i = 0; i < dreq->mirror_count; i++)
661 dreq->mirrors[i].count = 0; 649 dreq->mirrors[i].count = 0;
662 get_dreq(dreq); 650 get_dreq(dreq);
@@ -775,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
775 nfs_direct_write_reschedule(dreq); 763 nfs_direct_write_reschedule(dreq);
776 break; 764 break;
777 default: 765 default:
778 nfs_direct_complete(dreq, true); 766 nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
767 nfs_direct_complete(dreq);
779 } 768 }
780} 769}
781 770
@@ -991,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
991ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) 980ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
992{ 981{
993 ssize_t result = -EINVAL; 982 ssize_t result = -EINVAL;
983 size_t count;
994 struct file *file = iocb->ki_filp; 984 struct file *file = iocb->ki_filp;
995 struct address_space *mapping = file->f_mapping; 985 struct address_space *mapping = file->f_mapping;
996 struct inode *inode = mapping->host; 986 struct inode *inode = mapping->host;
@@ -1001,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
1001 dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", 991 dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
1002 file, iov_iter_count(iter), (long long) iocb->ki_pos); 992 file, iov_iter_count(iter), (long long) iocb->ki_pos);
1003 993
1004 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, 994 result = generic_write_checks(iocb, iter);
1005 iov_iter_count(iter)); 995 if (result <= 0)
996 return result;
997 count = result;
998 nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
1006 999
1007 pos = iocb->ki_pos; 1000 pos = iocb->ki_pos;
1008 end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; 1001 end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
1009 1002
1010 inode_lock(inode); 1003 task_io_account_write(count);
1011
1012 result = nfs_sync_mapping(mapping);
1013 if (result)
1014 goto out_unlock;
1015
1016 if (mapping->nrpages) {
1017 result = invalidate_inode_pages2_range(mapping,
1018 pos >> PAGE_SHIFT, end);
1019 if (result)
1020 goto out_unlock;
1021 }
1022
1023 task_io_account_write(iov_iter_count(iter));
1024 1004
1025 result = -ENOMEM; 1005 result = -ENOMEM;
1026 dreq = nfs_direct_req_alloc(); 1006 dreq = nfs_direct_req_alloc();
1027 if (!dreq) 1007 if (!dreq)
1028 goto out_unlock; 1008 goto out;
1029 1009
1030 dreq->inode = inode; 1010 dreq->inode = inode;
1031 dreq->bytes_left = dreq->max_count = iov_iter_count(iter); 1011 dreq->bytes_left = dreq->max_count = count;
1032 dreq->io_start = pos; 1012 dreq->io_start = pos;
1033 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1013 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
1034 l_ctx = nfs_get_lock_context(dreq->ctx); 1014 l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -1040,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
1040 if (!is_sync_kiocb(iocb)) 1020 if (!is_sync_kiocb(iocb))
1041 dreq->iocb = iocb; 1021 dreq->iocb = iocb;
1042 1022
1023 nfs_start_io_direct(inode);
1024
1043 result = nfs_direct_write_schedule_iovec(dreq, iter, pos); 1025 result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
1044 1026
1045 if (mapping->nrpages) { 1027 if (mapping->nrpages) {
@@ -1047,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
1047 pos >> PAGE_SHIFT, end); 1029 pos >> PAGE_SHIFT, end);
1048 } 1030 }
1049 1031
1050 inode_unlock(inode); 1032 nfs_end_io_direct(inode);
1051 1033
1052 if (!result) { 1034 if (!result) {
1053 result = nfs_direct_wait(dreq); 1035 result = nfs_direct_wait(dreq);
1054 if (result > 0) { 1036 if (result > 0) {
1055 struct inode *inode = mapping->host;
1056
1057 iocb->ki_pos = pos + result; 1037 iocb->ki_pos = pos + result;
1058 spin_lock(&inode->i_lock);
1059 if (i_size_read(inode) < iocb->ki_pos)
1060 i_size_write(inode, iocb->ki_pos);
1061 spin_unlock(&inode->i_lock);
1062
1063 /* XXX: should check the generic_write_sync retval */ 1038 /* XXX: should check the generic_write_sync retval */
1064 generic_write_sync(iocb, result); 1039 generic_write_sync(iocb, result);
1065 } 1040 }
1066 } 1041 }
1067 nfs_direct_req_release(dreq);
1068 return result;
1069
1070out_release: 1042out_release:
1071 nfs_direct_req_release(dreq); 1043 nfs_direct_req_release(dreq);
1072out_unlock: 1044out:
1073 inode_unlock(inode);
1074 return result; 1045 return result;
1075} 1046}
1076 1047
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 717a8d6af52d..7d620970f2e1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
170 iocb->ki_filp, 170 iocb->ki_filp,
171 iov_iter_count(to), (unsigned long) iocb->ki_pos); 171 iov_iter_count(to), (unsigned long) iocb->ki_pos);
172 172
173 result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); 173 nfs_start_io_read(inode);
174 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
174 if (!result) { 175 if (!result) {
175 result = generic_file_read_iter(iocb, to); 176 result = generic_file_read_iter(iocb, to);
176 if (result > 0) 177 if (result > 0)
177 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); 178 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
178 } 179 }
180 nfs_end_io_read(inode);
179 return result; 181 return result;
180} 182}
181EXPORT_SYMBOL_GPL(nfs_file_read); 183EXPORT_SYMBOL_GPL(nfs_file_read);
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
191 dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", 193 dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
192 filp, (unsigned long) count, (unsigned long long) *ppos); 194 filp, (unsigned long) count, (unsigned long long) *ppos);
193 195
194 res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); 196 nfs_start_io_read(inode);
197 res = nfs_revalidate_mapping(inode, filp->f_mapping);
195 if (!res) { 198 if (!res) {
196 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 199 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
197 if (res > 0) 200 if (res > 0)
198 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); 201 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
199 } 202 }
203 nfs_end_io_read(inode);
200 return res; 204 return res;
201} 205}
202EXPORT_SYMBOL_GPL(nfs_file_splice_read); 206EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
272 276
273 trace_nfs_fsync_enter(inode); 277 trace_nfs_fsync_enter(inode);
274 278
275 inode_dio_wait(inode);
276 do { 279 do {
277 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 280 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
278 if (ret != 0) 281 if (ret != 0)
279 break; 282 break;
280 inode_lock(inode);
281 ret = nfs_file_fsync_commit(file, start, end, datasync); 283 ret = nfs_file_fsync_commit(file, start, end, datasync);
282 if (!ret) 284 if (!ret)
283 ret = pnfs_sync_inode(inode, !!datasync); 285 ret = pnfs_sync_inode(inode, !!datasync);
284 inode_unlock(inode);
285 /* 286 /*
286 * If nfs_file_fsync_commit detected a server reboot, then 287 * If nfs_file_fsync_commit detected a server reboot, then
287 * resend all dirty pages that might have been covered by 288 * resend all dirty pages that might have been covered by
@@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
359 file, mapping->host->i_ino, len, (long long) pos); 360 file, mapping->host->i_ino, len, (long long) pos);
360 361
361start: 362start:
362 /*
363 * Prevent starvation issues if someone is doing a consistency
364 * sync-to-disk
365 */
366 ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
367 nfs_wait_bit_killable, TASK_KILLABLE);
368 if (ret)
369 return ret;
370 /*
371 * Wait for O_DIRECT to complete
372 */
373 inode_dio_wait(mapping->host);
374
375 page = grab_cache_page_write_begin(mapping, index, flags); 363 page = grab_cache_page_write_begin(mapping, index, flags);
376 if (!page) 364 if (!page)
377 return -ENOMEM; 365 return -ENOMEM;
@@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
432 return status; 420 return status;
433 NFS_I(mapping->host)->write_io += copied; 421 NFS_I(mapping->host)->write_io += copied;
434 422
435 if (nfs_ctx_key_to_expire(ctx)) { 423 if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
436 status = nfs_wb_all(mapping->host); 424 status = nfs_wb_all(mapping->host);
437 if (status < 0) 425 if (status < 0)
438 return status; 426 return status;
@@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
470 */ 458 */
471static int nfs_release_page(struct page *page, gfp_t gfp) 459static int nfs_release_page(struct page *page, gfp_t gfp)
472{ 460{
473 struct address_space *mapping = page->mapping;
474
475 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 461 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
476 462
477 /* Always try to initiate a 'commit' if relevant, but only
478 * wait for it if the caller allows blocking. Even then,
479 * only wait 1 second and only if the 'bdi' is not congested.
480 * Waiting indefinitely can cause deadlocks when the NFS
481 * server is on this machine, when a new TCP connection is
482 * needed and in other rare cases. There is no particular
483 * need to wait extensively here. A short wait has the
484 * benefit that someone else can worry about the freezer.
485 */
486 if (mapping) {
487 struct nfs_server *nfss = NFS_SERVER(mapping->host);
488 nfs_commit_inode(mapping->host, 0);
489 if (gfpflags_allow_blocking(gfp) &&
490 !bdi_write_congested(&nfss->backing_dev_info)) {
491 wait_on_page_bit_killable_timeout(page, PG_private,
492 HZ);
493 if (PagePrivate(page))
494 set_bdi_congested(&nfss->backing_dev_info,
495 BLK_RW_ASYNC);
496 }
497 }
498 /* If PagePrivate() is set, then the page is not freeable */ 463 /* If PagePrivate() is set, then the page is not freeable */
499 if (PagePrivate(page)) 464 if (PagePrivate(page))
500 return 0; 465 return 0;
@@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
604 filp, filp->f_mapping->host->i_ino, 569 filp, filp->f_mapping->host->i_ino,
605 (long long)page_offset(page)); 570 (long long)page_offset(page));
606 571
572 sb_start_pagefault(inode->i_sb);
573
607 /* make sure the cache has finished storing the page */ 574 /* make sure the cache has finished storing the page */
608 nfs_fscache_wait_on_page_write(NFS_I(inode), page); 575 nfs_fscache_wait_on_page_write(NFS_I(inode), page);
609 576
@@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
630out_unlock: 597out_unlock:
631 unlock_page(page); 598 unlock_page(page);
632out: 599out:
600 sb_end_pagefault(inode->i_sb);
633 return ret; 601 return ret;
634} 602}
635 603
@@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode)
645 613
646 ctx = nfs_file_open_context(filp); 614 ctx = nfs_file_open_context(filp);
647 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || 615 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
648 nfs_ctx_key_to_expire(ctx)) 616 nfs_ctx_key_to_expire(ctx, inode))
649 return 1; 617 return 1;
650 return 0; 618 return 0;
651} 619}
@@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
656 struct inode *inode = file_inode(file); 624 struct inode *inode = file_inode(file);
657 unsigned long written = 0; 625 unsigned long written = 0;
658 ssize_t result; 626 ssize_t result;
659 size_t count = iov_iter_count(from);
660 627
661 result = nfs_key_timeout_notify(file, inode); 628 result = nfs_key_timeout_notify(file, inode);
662 if (result) 629 if (result)
663 return result; 630 return result;
664 631
665 if (iocb->ki_flags & IOCB_DIRECT) { 632 if (iocb->ki_flags & IOCB_DIRECT)
666 result = generic_write_checks(iocb, from);
667 if (result <= 0)
668 return result;
669 return nfs_file_direct_write(iocb, from); 633 return nfs_file_direct_write(iocb, from);
670 }
671 634
672 dprintk("NFS: write(%pD2, %zu@%Ld)\n", 635 dprintk("NFS: write(%pD2, %zu@%Ld)\n",
673 file, count, (long long) iocb->ki_pos); 636 file, iov_iter_count(from), (long long) iocb->ki_pos);
674 637
675 result = -EBUSY;
676 if (IS_SWAPFILE(inode)) 638 if (IS_SWAPFILE(inode))
677 goto out_swapfile; 639 goto out_swapfile;
678 /* 640 /*
@@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
684 goto out; 646 goto out;
685 } 647 }
686 648
687 result = count; 649 nfs_start_io_write(inode);
688 if (!count) 650 result = generic_write_checks(iocb, from);
651 if (result > 0) {
652 current->backing_dev_info = inode_to_bdi(inode);
653 result = generic_perform_write(file, from, iocb->ki_pos);
654 current->backing_dev_info = NULL;
655 }
656 nfs_end_io_write(inode);
657 if (result <= 0)
689 goto out; 658 goto out;
690 659
691 result = generic_file_write_iter(iocb, from); 660 written = generic_write_sync(iocb, result);
692 if (result > 0) 661 iocb->ki_pos += written;
693 written = result;
694 662
695 /* Return error values */ 663 /* Return error values */
696 if (result >= 0 && nfs_need_check_write(file, inode)) { 664 if (nfs_need_check_write(file, inode)) {
697 int err = vfs_fsync(file, 0); 665 int err = vfs_fsync(file, 0);
698 if (err < 0) 666 if (err < 0)
699 result = err; 667 result = err;
700 } 668 }
701 if (result > 0) 669 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
702 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
703out: 670out:
704 return result; 671 return result;
705 672
706out_swapfile: 673out_swapfile:
707 printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); 674 printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
708 goto out; 675 return -EBUSY;
709} 676}
710EXPORT_SYMBOL_GPL(nfs_file_write); 677EXPORT_SYMBOL_GPL(nfs_file_write);
711 678
@@ -780,11 +747,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
780} 747}
781 748
782static int 749static int
783is_time_granular(struct timespec *ts) {
784 return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
785}
786
787static int
788do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 750do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
789{ 751{
790 struct inode *inode = filp->f_mapping->host; 752 struct inode *inode = filp->f_mapping->host;
@@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
817 * This makes locking act as a cache coherency point. 779 * This makes locking act as a cache coherency point.
818 */ 780 */
819 nfs_sync_mapping(filp->f_mapping); 781 nfs_sync_mapping(filp->f_mapping);
820 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { 782 if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
821 if (is_time_granular(&NFS_SERVER(inode)->time_delta)) 783 nfs_zap_mapping(inode, filp->f_mapping);
822 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
823 else
824 nfs_zap_caches(inode);
825 }
826out: 784out:
827 return status; 785 return status;
828} 786}
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index aa59757389dc..a3fc48ba4931 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task,
255static void 255static void
256filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) 256filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
257{ 257{
258 loff_t end_offs = 0;
258 259
259 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 260 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
260 hdr->res.verf->committed != NFS_DATA_SYNC) 261 hdr->res.verf->committed == NFS_FILE_SYNC)
261 return; 262 return;
263 if (hdr->res.verf->committed == NFS_DATA_SYNC)
264 end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
262 265
263 pnfs_set_layoutcommit(hdr->inode, hdr->lseg, 266 /* Note: if the write is unstable, don't set end_offs until commit */
264 hdr->mds_offset + hdr->res.count); 267 pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
265 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, 268 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
266 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 269 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
267} 270}
@@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task,
354 } 357 }
355 358
356 filelayout_set_layoutcommit(hdr); 359 filelayout_set_layoutcommit(hdr);
360
361 /* zero out the fattr */
362 hdr->fattr.valid = 0;
363 if (task->tk_status >= 0)
364 nfs_writeback_update_inode(hdr);
365
357 return 0; 366 return 0;
358} 367}
359 368
@@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
375 return -EAGAIN; 384 return -EAGAIN;
376 } 385 }
377 386
378 if (data->verf.committed == NFS_UNSTABLE) 387 pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
379 pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
380 388
381 return 0; 389 return 0;
382} 390}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0e8018bc9880..e6206eaf2bdf 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
1325 * we always send layoutcommit after DS writes. 1325 * we always send layoutcommit after DS writes.
1326 */ 1326 */
1327static void 1327static void
1328ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) 1328ff_layout_set_layoutcommit(struct inode *inode,
1329 struct pnfs_layout_segment *lseg,
1330 loff_t end_offset)
1329{ 1331{
1330 if (!ff_layout_need_layoutcommit(hdr->lseg)) 1332 if (!ff_layout_need_layoutcommit(lseg))
1331 return; 1333 return;
1332 1334
1333 pnfs_set_layoutcommit(hdr->inode, hdr->lseg, 1335 pnfs_set_layoutcommit(inode, lseg, end_offset);
1334 hdr->mds_offset + hdr->res.count); 1336 dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
1335 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, 1337 (unsigned long long) NFS_I(inode)->layout->plh_lwb);
1336 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
1337} 1338}
1338 1339
1339static bool 1340static bool
@@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data)
1469static int ff_layout_write_done_cb(struct rpc_task *task, 1470static int ff_layout_write_done_cb(struct rpc_task *task,
1470 struct nfs_pgio_header *hdr) 1471 struct nfs_pgio_header *hdr)
1471{ 1472{
1473 loff_t end_offs = 0;
1472 int err; 1474 int err;
1473 1475
1474 trace_nfs4_pnfs_write(hdr, task->tk_status); 1476 trace_nfs4_pnfs_write(hdr, task->tk_status);
@@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
1494 1496
1495 if (hdr->res.verf->committed == NFS_FILE_SYNC || 1497 if (hdr->res.verf->committed == NFS_FILE_SYNC ||
1496 hdr->res.verf->committed == NFS_DATA_SYNC) 1498 hdr->res.verf->committed == NFS_DATA_SYNC)
1497 ff_layout_set_layoutcommit(hdr); 1499 end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
1500
1501 /* Note: if the write is unstable, don't set end_offs until commit */
1502 ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
1498 1503
1499 /* zero out fattr since we don't care DS attr at all */ 1504 /* zero out fattr since we don't care DS attr at all */
1500 hdr->fattr.valid = 0; 1505 hdr->fattr.valid = 0;
@@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
1530 return -EAGAIN; 1535 return -EAGAIN;
1531 } 1536 }
1532 1537
1533 if (data->verf.committed == NFS_UNSTABLE 1538 ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
1534 && ff_layout_need_layoutcommit(data->lseg))
1535 pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
1536 1539
1537 return 0; 1540 return 0;
1538} 1541}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index dda689d7a8a7..bf4ec5ecc97e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
662 trace_nfs_getattr_enter(inode); 662 trace_nfs_getattr_enter(inode);
663 /* Flush out writes to the server in order to update c/mtime. */ 663 /* Flush out writes to the server in order to update c/mtime. */
664 if (S_ISREG(inode->i_mode)) { 664 if (S_ISREG(inode->i_mode)) {
665 inode_lock(inode); 665 err = filemap_write_and_wait(inode->i_mapping);
666 err = nfs_sync_inode(inode);
667 inode_unlock(inode);
668 if (err) 666 if (err)
669 goto out; 667 goto out;
670 } 668 }
@@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
879 struct nfs_inode *nfsi = NFS_I(inode); 877 struct nfs_inode *nfsi = NFS_I(inode);
880 878
881 spin_lock(&inode->i_lock); 879 spin_lock(&inode->i_lock);
882 list_add(&ctx->list, &nfsi->open_files); 880 if (ctx->mode & FMODE_WRITE)
881 list_add(&ctx->list, &nfsi->open_files);
882 else
883 list_add_tail(&ctx->list, &nfsi->open_files);
883 spin_unlock(&inode->i_lock); 884 spin_unlock(&inode->i_lock);
884} 885}
885EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); 886EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
972 if (NFS_STALE(inode)) 973 if (NFS_STALE(inode))
973 goto out; 974 goto out;
974 975
976 /* pNFS: Attributes aren't updated until we layoutcommit */
977 if (S_ISREG(inode->i_mode)) {
978 status = pnfs_sync_inode(inode, false);
979 if (status)
980 goto out;
981 }
982
975 status = -ENOMEM; 983 status = -ENOMEM;
976 fattr = nfs_alloc_fattr(); 984 fattr = nfs_alloc_fattr();
977 if (fattr == NULL) 985 if (fattr == NULL)
@@ -1122,14 +1130,12 @@ out:
1122} 1130}
1123 1131
1124/** 1132/**
1125 * __nfs_revalidate_mapping - Revalidate the pagecache 1133 * nfs_revalidate_mapping - Revalidate the pagecache
1126 * @inode - pointer to host inode 1134 * @inode - pointer to host inode
1127 * @mapping - pointer to mapping 1135 * @mapping - pointer to mapping
1128 * @may_lock - take inode->i_mutex?
1129 */ 1136 */
1130static int __nfs_revalidate_mapping(struct inode *inode, 1137int nfs_revalidate_mapping(struct inode *inode,
1131 struct address_space *mapping, 1138 struct address_space *mapping)
1132 bool may_lock)
1133{ 1139{
1134 struct nfs_inode *nfsi = NFS_I(inode); 1140 struct nfs_inode *nfsi = NFS_I(inode);
1135 unsigned long *bitlock = &nfsi->flags; 1141 unsigned long *bitlock = &nfsi->flags;
@@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode,
1178 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 1184 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
1179 spin_unlock(&inode->i_lock); 1185 spin_unlock(&inode->i_lock);
1180 trace_nfs_invalidate_mapping_enter(inode); 1186 trace_nfs_invalidate_mapping_enter(inode);
1181 if (may_lock) { 1187 ret = nfs_invalidate_mapping(inode, mapping);
1182 inode_lock(inode);
1183 ret = nfs_invalidate_mapping(inode, mapping);
1184 inode_unlock(inode);
1185 } else
1186 ret = nfs_invalidate_mapping(inode, mapping);
1187 trace_nfs_invalidate_mapping_exit(inode, ret); 1188 trace_nfs_invalidate_mapping_exit(inode, ret);
1188 1189
1189 clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); 1190 clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1193,27 +1194,28 @@ out:
1193 return ret; 1194 return ret;
1194} 1195}
1195 1196
1196/** 1197static bool nfs_file_has_writers(struct nfs_inode *nfsi)
1197 * nfs_revalidate_mapping - Revalidate the pagecache
1198 * @inode - pointer to host inode
1199 * @mapping - pointer to mapping
1200 */
1201int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1202{ 1198{
1203 return __nfs_revalidate_mapping(inode, mapping, false); 1199 struct inode *inode = &nfsi->vfs_inode;
1200
1201 assert_spin_locked(&inode->i_lock);
1202
1203 if (!S_ISREG(inode->i_mode))
1204 return false;
1205 if (list_empty(&nfsi->open_files))
1206 return false;
1207 /* Note: This relies on nfsi->open_files being ordered with writers
1208 * being placed at the head of the list.
1209 * See nfs_inode_attach_open_context()
1210 */
1211 return (list_first_entry(&nfsi->open_files,
1212 struct nfs_open_context,
1213 list)->mode & FMODE_WRITE) == FMODE_WRITE;
1204} 1214}
1205 1215
1206/** 1216static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
1207 * nfs_revalidate_mapping_protected - Revalidate the pagecache
1208 * @inode - pointer to host inode
1209 * @mapping - pointer to mapping
1210 *
1211 * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
1212 * while invalidating the mapping.
1213 */
1214int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
1215{ 1217{
1216 return __nfs_revalidate_mapping(inode, mapping, true); 1218 return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
1217} 1219}
1218 1220
1219static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 1221static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1280 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 1282 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1281 return -EIO; 1283 return -EIO;
1282 1284
1283 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && 1285 if (!nfs_file_has_buffered_writers(nfsi)) {
1284 inode->i_version != fattr->change_attr) 1286 /* Verify a few of the more important attributes */
1285 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1287 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
1288 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
1286 1289
1287 /* Verify a few of the more important attributes */ 1290 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
1288 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) 1291 invalid |= NFS_INO_INVALID_ATTR;
1289 invalid |= NFS_INO_INVALID_ATTR;
1290 1292
1291 if (fattr->valid & NFS_ATTR_FATTR_SIZE) { 1293 if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
1292 cur_size = i_size_read(inode); 1294 invalid |= NFS_INO_INVALID_ATTR;
1293 new_isize = nfs_size_to_loff_t(fattr->size); 1295
1294 if (cur_size != new_isize) 1296 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
1295 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1297 cur_size = i_size_read(inode);
1298 new_isize = nfs_size_to_loff_t(fattr->size);
1299 if (cur_size != new_isize)
1300 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1301 }
1296 } 1302 }
1297 if (nfsi->nrequests != 0)
1298 invalid &= ~NFS_INO_REVAL_PAGECACHE;
1299 1303
1300 /* Have any file permissions changed? */ 1304 /* Have any file permissions changed? */
1301 if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) 1305 if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
1470 ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); 1474 ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
1471} 1475}
1472 1476
1473/*
1474 * Don't trust the change_attribute, mtime, ctime or size if
1475 * a pnfs LAYOUTCOMMIT is outstanding
1476 */
1477static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
1478 struct nfs_fattr *fattr)
1479{
1480 if (pnfs_layoutcommit_outstanding(inode))
1481 fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
1482 NFS_ATTR_FATTR_MTIME |
1483 NFS_ATTR_FATTR_CTIME |
1484 NFS_ATTR_FATTR_SIZE);
1485}
1486
1487static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) 1477static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
1488{ 1478{
1489 int ret; 1479 int ret;
1490 1480
1491 trace_nfs_refresh_inode_enter(inode); 1481 trace_nfs_refresh_inode_enter(inode);
1492 1482
1493 nfs_inode_attrs_handle_layoutcommit(inode, fattr);
1494
1495 if (nfs_inode_attrs_need_update(inode, fattr)) 1483 if (nfs_inode_attrs_need_update(inode, fattr))
1496 ret = nfs_update_inode(inode, fattr); 1484 ret = nfs_update_inode(inode, fattr);
1497 else 1485 else
@@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
1527 1515
1528static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) 1516static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
1529{ 1517{
1530 unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1518 unsigned long invalid = NFS_INO_INVALID_ATTR;
1531 1519
1532 /* 1520 /*
1533 * Don't revalidate the pagecache if we hold a delegation, but do 1521 * Don't revalidate the pagecache if we hold a delegation, but do
@@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1676 unsigned long invalid = 0; 1664 unsigned long invalid = 0;
1677 unsigned long now = jiffies; 1665 unsigned long now = jiffies;
1678 unsigned long save_cache_validity; 1666 unsigned long save_cache_validity;
1667 bool have_writers = nfs_file_has_buffered_writers(nfsi);
1679 bool cache_revalidated = true; 1668 bool cache_revalidated = true;
1680 1669
1681 dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", 1670 dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
@@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1725 /* Do atomic weak cache consistency updates */ 1714 /* Do atomic weak cache consistency updates */
1726 invalid |= nfs_wcc_update_inode(inode, fattr); 1715 invalid |= nfs_wcc_update_inode(inode, fattr);
1727 1716
1717 if (pnfs_layoutcommit_outstanding(inode)) {
1718 nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
1719 cache_revalidated = false;
1720 }
1721
1728 /* More cache consistency checks */ 1722 /* More cache consistency checks */
1729 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1723 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
1730 if (inode->i_version != fattr->change_attr) { 1724 if (inode->i_version != fattr->change_attr) {
1731 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1725 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1732 inode->i_sb->s_id, inode->i_ino); 1726 inode->i_sb->s_id, inode->i_ino);
1733 invalid |= NFS_INO_INVALID_ATTR 1727 /* Could it be a race with writeback? */
1734 | NFS_INO_INVALID_DATA 1728 if (!have_writers) {
1735 | NFS_INO_INVALID_ACCESS 1729 invalid |= NFS_INO_INVALID_ATTR
1736 | NFS_INO_INVALID_ACL; 1730 | NFS_INO_INVALID_DATA
1737 if (S_ISDIR(inode->i_mode)) 1731 | NFS_INO_INVALID_ACCESS
1738 nfs_force_lookup_revalidate(inode); 1732 | NFS_INO_INVALID_ACL;
1733 if (S_ISDIR(inode->i_mode))
1734 nfs_force_lookup_revalidate(inode);
1735 }
1739 inode->i_version = fattr->change_attr; 1736 inode->i_version = fattr->change_attr;
1740 } 1737 }
1741 } else { 1738 } else {
@@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1768 if (new_isize != cur_isize) { 1765 if (new_isize != cur_isize) {
1769 /* Do we perhaps have any outstanding writes, or has 1766 /* Do we perhaps have any outstanding writes, or has
1770 * the file grown beyond our last write? */ 1767 * the file grown beyond our last write? */
1771 if ((nfsi->nrequests == 0) || new_isize > cur_isize) { 1768 if (nfsi->nrequests == 0 || new_isize > cur_isize) {
1772 i_size_write(inode, new_isize); 1769 i_size_write(inode, new_isize);
1773 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1770 if (!have_writers)
1771 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1774 } 1772 }
1775 dprintk("NFS: isize change on server for file %s/%ld " 1773 dprintk("NFS: isize change on server for file %s/%ld "
1776 "(%Ld to %Ld)\n", 1774 "(%Ld to %Ld)\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5ea04d87fc65..7ce5e023c3c3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -66,13 +66,16 @@ struct nfs_clone_mount {
66 66
67struct nfs_client_initdata { 67struct nfs_client_initdata {
68 unsigned long init_flags; 68 unsigned long init_flags;
69 const char *hostname; 69 const char *hostname; /* Hostname of the server */
70 const struct sockaddr *addr; 70 const struct sockaddr *addr; /* Address of the server */
71 const char *nodename; /* Hostname of the client */
72 const char *ip_addr; /* IP address of the client */
71 size_t addrlen; 73 size_t addrlen;
72 struct nfs_subversion *nfs_mod; 74 struct nfs_subversion *nfs_mod;
73 int proto; 75 int proto;
74 u32 minorversion; 76 u32 minorversion;
75 struct net *net; 77 struct net *net;
78 const struct rpc_timeout *timeparms;
76}; 79};
77 80
78/* 81/*
@@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info);
147extern const struct rpc_program nfs_program; 150extern const struct rpc_program nfs_program;
148extern void nfs_clients_init(struct net *net); 151extern void nfs_clients_init(struct net *net);
149extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); 152extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
150int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); 153int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
151struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, 154struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
152 const struct rpc_timeout *, const char *,
153 rpc_authflavor_t); 155 rpc_authflavor_t);
154int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); 156int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
155void nfs_server_insert_lists(struct nfs_server *); 157void nfs_server_insert_lists(struct nfs_server *);
@@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
184 rpc_authflavor_t); 186 rpc_authflavor_t);
185extern int nfs_wait_client_init_complete(const struct nfs_client *clp); 187extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
186extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 188extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
187extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, 189extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
188 const struct sockaddr *ds_addr, 190 const struct sockaddr *ds_addr,
189 int ds_addrlen, int ds_proto, 191 int ds_addrlen, int ds_proto,
190 unsigned int ds_timeo, 192 unsigned int ds_timeo,
@@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
193 rpc_authflavor_t au_flavor); 195 rpc_authflavor_t au_flavor);
194extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, 196extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
195 struct inode *); 197 struct inode *);
196extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, 198extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
197 const struct sockaddr *ds_addr, int ds_addrlen, 199 const struct sockaddr *ds_addr, int ds_addrlen,
198 int ds_proto, unsigned int ds_timeo, 200 int ds_proto, unsigned int ds_timeo,
199 unsigned int ds_retrans, rpc_authflavor_t au_flavor); 201 unsigned int ds_retrans, rpc_authflavor_t au_flavor);
@@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
338/* proc.c */ 340/* proc.c */
339void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 341void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
340extern struct nfs_client *nfs_init_client(struct nfs_client *clp, 342extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
341 const struct rpc_timeout *timeparms, 343 const struct nfs_client_initdata *);
342 const char *ip_addr);
343 344
344/* dir.c */ 345/* dir.c */
345extern void nfs_force_use_readdirplus(struct inode *dir); 346extern void nfs_force_use_readdirplus(struct inode *dir);
@@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void);
411extern bool nfs_sb_active(struct super_block *sb); 412extern bool nfs_sb_active(struct super_block *sb);
412extern void nfs_sb_deactive(struct super_block *sb); 413extern void nfs_sb_deactive(struct super_block *sb);
413 414
415/* io.c */
416extern void nfs_start_io_read(struct inode *inode);
417extern void nfs_end_io_read(struct inode *inode);
418extern void nfs_start_io_write(struct inode *inode);
419extern void nfs_end_io_write(struct inode *inode);
420extern void nfs_start_io_direct(struct inode *inode);
421extern void nfs_end_io_direct(struct inode *inode);
422
423static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
424{
425 return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
426}
427
414/* namespace.c */ 428/* namespace.c */
415#define NFS_PATH_CANONICAL 1 429#define NFS_PATH_CANONICAL 1
416extern char *nfs_path(char **p, struct dentry *dentry, 430extern char *nfs_path(char **p, struct dentry *dentry,
@@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
496 struct inode *inode, 510 struct inode *inode,
497 struct nfs_direct_req *dreq); 511 struct nfs_direct_req *dreq);
498int nfs_key_timeout_notify(struct file *filp, struct inode *inode); 512int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
499bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); 513bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
500void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); 514void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
501 515
516int nfs_filemap_write_and_wait_range(struct address_space *mapping,
517 loff_t lstart, loff_t lend);
518
519#ifdef CONFIG_NFS_V4_1
520static inline
521void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
522{
523 int i;
524
525 for (i = 0; i < cinfo->nbuckets; i++)
526 cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
527}
528#else
529static inline
530void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
531{
532}
533#endif
534
535
502#ifdef CONFIG_MIGRATION 536#ifdef CONFIG_MIGRATION
503extern int nfs_migrate_page(struct address_space *, 537extern int nfs_migrate_page(struct address_space *,
504 struct page *, struct page *, enum migrate_mode); 538 struct page *, struct page *, enum migrate_mode);
@@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *,
506#define nfs_migrate_page NULL 540#define nfs_migrate_page NULL
507#endif 541#endif
508 542
543static inline int
544nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
545 const struct nfs_write_verifier *v2)
546{
547 return memcmp(v1->data, v2->data, sizeof(v1->data));
548}
549
509/* unlink.c */ 550/* unlink.c */
510extern struct rpc_task * 551extern struct rpc_task *
511nfs_async_rename(struct inode *old_dir, struct inode *new_dir, 552nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
@@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
521/* nfs4proc.c */ 562/* nfs4proc.c */
522extern void __nfs4_read_done_cb(struct nfs_pgio_header *); 563extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
523extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, 564extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
524 const struct rpc_timeout *timeparms, 565 const struct nfs_client_initdata *);
525 const char *ip_addr);
526extern int nfs40_walk_client_list(struct nfs_client *clp, 566extern int nfs40_walk_client_list(struct nfs_client *clp,
527 struct nfs_client **result, 567 struct nfs_client **result,
528 struct rpc_cred *cred); 568 struct rpc_cred *cred);
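
internal.h now also carries nfs_write_verifier_cmp(), a plain memcmp() over the opaque verifier bytes. The sketch below shows the typical use of such a comparison (detecting that the server rebooted between an unstable WRITE and the COMMIT, which forces a resend); the struct layout shown and the main() driver are illustrative only.

#include <stdio.h>
#include <string.h>

/* NFS write verifiers are 8 opaque bytes (illustrative definition). */
struct nfs_write_verifier {
	char data[8];
};

static int nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
				  const struct nfs_write_verifier *v2)
{
	return memcmp(v1->data, v2->data, sizeof(v1->data));
}

int main(void)
{
	struct nfs_write_verifier sent   = { "reboot1" };
	struct nfs_write_verifier commit = { "reboot2" };

	/* A mismatch means the server may have lost unstable data:
	 * the client would have to resend the affected writes. */
	if (nfs_write_verifier_cmp(&sent, &commit) != 0)
		printf("verifier changed: resend unstable writes\n");
	return 0;
}
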
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 000000000000..1fc5d1ce327e
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (c) 2016 Trond Myklebust
3 *
4 * I/O and data path helper functionality.
5 */
6
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/bitops.h>
10#include <linux/rwsem.h>
11#include <linux/fs.h>
12#include <linux/nfs_fs.h>
13
14#include "internal.h"
15
16/* Call with exclusively locked inode->i_rwsem */
17static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
18{
19 if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
20 clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
21 inode_dio_wait(inode);
22 }
23}
24
25/**
26 * nfs_start_io_read - declare the file is being used for buffered reads
27 * @inode - file inode
28 *
29 * Declare that a buffered read operation is about to start, and ensure
30 * that we block all direct I/O.
31 * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
32 * and holds a shared lock on inode->i_rwsem to ensure that the flag
33 * cannot be changed.
34 * In practice, this means that buffered read operations are allowed to
35 * execute in parallel, thanks to the shared lock, whereas direct I/O
36 * operations need to wait to grab an exclusive lock in order to set
37 * NFS_INO_ODIRECT.
38 * Note that buffered writes and truncates both take a write lock on
39 * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
40 */
41void
42nfs_start_io_read(struct inode *inode)
43{
44 struct nfs_inode *nfsi = NFS_I(inode);
45 /* Be an optimist! */
46 down_read(&inode->i_rwsem);
47 if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
48 return;
49 up_read(&inode->i_rwsem);
50 /* Slow path.... */
51 down_write(&inode->i_rwsem);
52 nfs_block_o_direct(nfsi, inode);
53 downgrade_write(&inode->i_rwsem);
54}
55
56/**
57 * nfs_end_io_read - declare that the buffered read operation is done
58 * @inode - file inode
59 *
60 * Declare that a buffered read operation is done, and release the shared
61 * lock on inode->i_rwsem.
62 */
63void
64nfs_end_io_read(struct inode *inode)
65{
66 up_read(&inode->i_rwsem);
67}
68
69/**
70 * nfs_start_io_write - declare the file is being used for buffered writes
71 * @inode - file inode
72 *
 73 * Declare that a buffered write operation is about to start, and ensure
74 * that we block all direct I/O.
75 */
76void
77nfs_start_io_write(struct inode *inode)
78{
79 down_write(&inode->i_rwsem);
80 nfs_block_o_direct(NFS_I(inode), inode);
81}
82
83/**
84 * nfs_end_io_write - declare that the buffered write operation is done
85 * @inode - file inode
86 *
87 * Declare that a buffered write operation is done, and release the
88 * lock on inode->i_rwsem.
89 */
90void
91nfs_end_io_write(struct inode *inode)
92{
93 up_write(&inode->i_rwsem);
94}
95
96/* Call with exclusively locked inode->i_rwsem */
97static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
98{
99 if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
100 set_bit(NFS_INO_ODIRECT, &nfsi->flags);
101 nfs_wb_all(inode);
102 }
103}
104
105/**
106 * nfs_start_io_direct - declare the file is being used for direct i/o
107 * @inode - file inode
108 *
109 * Declare that a direct I/O operation is about to start, and ensure
110 * that we block all buffered I/O.
111 * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
112 * and holds a shared lock on inode->i_rwsem to ensure that the flag
113 * cannot be changed.
114 * In practice, this means that direct I/O operations are allowed to
115 * execute in parallel, thanks to the shared lock, whereas buffered I/O
116 * operations need to wait to grab an exclusive lock in order to clear
117 * NFS_INO_ODIRECT.
118 * Note that buffered writes and truncates both take a write lock on
119 * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
120 */
121void
122nfs_start_io_direct(struct inode *inode)
123{
124 struct nfs_inode *nfsi = NFS_I(inode);
125 /* Be an optimist! */
126 down_read(&inode->i_rwsem);
127 if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
128 return;
129 up_read(&inode->i_rwsem);
130 /* Slow path.... */
131 down_write(&inode->i_rwsem);
132 nfs_block_buffered(nfsi, inode);
133 downgrade_write(&inode->i_rwsem);
134}
135
136/**
137 * nfs_end_io_direct - declare that the direct i/o operation is done
138 * @inode - file inode
139 *
140 * Declare that a direct I/O operation is done, and release the shared
141 * lock on inode->i_rwsem.
142 */
143void
144nfs_end_io_direct(struct inode *inode)
145{
146 up_read(&inode->i_rwsem);
147}
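
The new fs/nfs/io.c uses inode->i_rwsem plus the NFS_INO_ODIRECT flag so that buffered and O_DIRECT I/O exclude each other while operations of the same kind run in parallel. Below is a hedged user-space analogue of the read-side fast path/slow path; it uses a POSIX rwlock, which has no downgrade_write(), so it only approximates the kernel's locking, and all names are illustrative rather than the actual fs/nfs call sites.

#include <pthread.h>
#include <stdio.h>

/* User-space analogue of the NFS_INO_ODIRECT scheme (illustration only):
 * inode->i_rwsem becomes a pthread rwlock and the inode flag a plain int
 * guarded by that lock. */
static pthread_rwlock_t io_lock = PTHREAD_RWLOCK_INITIALIZER;
static int odirect = 1;			/* pretend the last I/O was O_DIRECT */

/* Analogue of nfs_start_io_read(). */
static void start_io_read(void)
{
	pthread_rwlock_rdlock(&io_lock);	/* be an optimist */
	if (!odirect)
		return;				/* fast path: stay shared */
	pthread_rwlock_unlock(&io_lock);
	pthread_rwlock_wrlock(&io_lock);	/* slow path: go exclusive */
	odirect = 0;				/* nfs_block_o_direct() would
						   also wait for DIO here */
	/* The kernel downgrade_write()s at this point; POSIX rwlocks cannot
	 * downgrade, so this sketch releases and re-acquires the read side,
	 * which is only an approximation of the original semantics. */
	pthread_rwlock_unlock(&io_lock);
	pthread_rwlock_rdlock(&io_lock);
}

/* Analogue of nfs_end_io_read(). */
static void end_io_read(void)
{
	pthread_rwlock_unlock(&io_lock);
}

int main(void)
{
	start_io_read();
	printf("odirect flag now %d: buffered readers may run in parallel\n",
	       odirect);
	end_io_read();
	return 0;
}
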
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 9e9fa347a948..ee753547fb0a 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
76 * low timeout interval so that if a connection is lost, we retry through 76 * low timeout interval so that if a connection is lost, we retry through
77 * the MDS. 77 * the MDS.
78 */ 78 */
79struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, 79struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
80 const struct sockaddr *ds_addr, int ds_addrlen, 80 const struct sockaddr *ds_addr, int ds_addrlen,
81 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, 81 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
82 rpc_authflavor_t au_flavor) 82 rpc_authflavor_t au_flavor)
83{ 83{
84 struct rpc_timeout ds_timeout;
85 struct nfs_client *mds_clp = mds_srv->nfs_client;
84 struct nfs_client_initdata cl_init = { 86 struct nfs_client_initdata cl_init = {
85 .addr = ds_addr, 87 .addr = ds_addr,
86 .addrlen = ds_addrlen, 88 .addrlen = ds_addrlen,
89 .nodename = mds_clp->cl_rpcclient->cl_nodename,
90 .ip_addr = mds_clp->cl_ipaddr,
87 .nfs_mod = &nfs_v3, 91 .nfs_mod = &nfs_v3,
88 .proto = ds_proto, 92 .proto = ds_proto,
89 .net = mds_clp->cl_net, 93 .net = mds_clp->cl_net,
94 .timeparms = &ds_timeout,
90 }; 95 };
91 struct rpc_timeout ds_timeout;
92 struct nfs_client *clp; 96 struct nfs_client *clp;
93 char buf[INET6_ADDRSTRLEN + 1]; 97 char buf[INET6_ADDRSTRLEN + 1];
94 98
@@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
97 return ERR_PTR(-EINVAL); 101 return ERR_PTR(-EINVAL);
98 cl_init.hostname = buf; 102 cl_init.hostname = buf;
99 103
104 if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
105 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
106
100 /* Use the MDS nfs_client cl_ipaddr. */ 107 /* Use the MDS nfs_client cl_ipaddr. */
101 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); 108 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
102 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, 109 clp = nfs_get_client(&cl_init, au_flavor);
103 au_flavor);
104 110
105 return clp; 111 return clp;
106} 112}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index aa03ed09ba06..33da841a21bb 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
113 if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) 113 if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
114 return -EOPNOTSUPP; 114 return -EOPNOTSUPP;
115 115
116 nfs_wb_all(inode);
117 inode_lock(inode); 116 inode_lock(inode);
117 err = nfs_sync_inode(inode);
118 if (err)
119 goto out_unlock;
118 120
119 err = nfs42_proc_fallocate(&msg, filep, offset, len); 121 err = nfs42_proc_fallocate(&msg, filep, offset, len);
120 if (err == 0) 122 if (err == 0)
121 truncate_pagecache_range(inode, offset, (offset + len) -1); 123 truncate_pagecache_range(inode, offset, (offset + len) -1);
122 if (err == -EOPNOTSUPP) 124 if (err == -EOPNOTSUPP)
123 NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; 125 NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
124 126out_unlock:
125 inode_unlock(inode); 127 inode_unlock(inode);
126 return err; 128 return err;
127} 129}
@@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
154 if (status) 156 if (status)
155 return status; 157 return status;
156 158
159 status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
160 pos_src, pos_src + (loff_t)count - 1);
161 if (status)
162 return status;
163
157 status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, 164 status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
158 dst_lock, FMODE_WRITE); 165 dst_lock, FMODE_WRITE);
159 if (status) 166 if (status)
160 return status; 167 return status;
161 168
169 status = nfs_sync_inode(dst_inode);
170 if (status)
171 return status;
172
162 status = nfs4_call_sync(server->client, server, &msg, 173 status = nfs4_call_sync(server->client, server, &msg,
163 &args.seq_args, &res.seq_res, 0); 174 &args.seq_args, &res.seq_res, 0);
164 if (status == -ENOTSUPP) 175 if (status == -ENOTSUPP)
@@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
258 if (status) 269 if (status)
259 return status; 270 return status;
260 271
261 nfs_wb_all(inode); 272 status = nfs_filemap_write_and_wait_range(inode->i_mapping,
273 offset, LLONG_MAX);
274 if (status)
275 return status;
276
262 status = nfs4_call_sync(server->client, server, &msg, 277 status = nfs4_call_sync(server->client, server, &msg,
263 &args.seq_args, &res.seq_res, 0); 278 &args.seq_args, &res.seq_res, 0);
264 if (status == -ENOTSUPP) 279 if (status == -ENOTSUPP)
@@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
336 * Mark the bad layout state as invalid, then retry 351 * Mark the bad layout state as invalid, then retry
337 * with the current stateid. 352 * with the current stateid.
338 */ 353 */
339 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 354 pnfs_mark_layout_stateid_invalid(lo, &head);
340 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
341 spin_unlock(&inode->i_lock); 355 spin_unlock(&inode->i_lock);
342 pnfs_free_lseg_list(&head); 356 pnfs_free_lseg_list(&head);
343 } else 357 } else
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6dc6f2aea0d6..8b2605882a20 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr,
330 struct nfs42_write_res *res) 330 struct nfs42_write_res *res)
331{ 331{
332 __be32 *p; 332 __be32 *p;
333 int stateids;
334 333
335 p = xdr_inline_decode(xdr, 4 + 8 + 4); 334 p = xdr_inline_decode(xdr, 4 + 8 + 4);
336 if (unlikely(!p)) 335 if (unlikely(!p))
337 goto out_overflow; 336 goto out_overflow;
338 337
339 stateids = be32_to_cpup(p++); 338 /*
339 * We never use asynchronous mode, so warn if a server returns
340 * a stateid.
341 */
342 if (unlikely(*p != 0)) {
343 pr_err_once("%s: server has set unrequested "
344 "asynchronous mode\n", __func__);
345 return -EREMOTEIO;
346 }
347 p++;
340 p = xdr_decode_hyper(p, &res->count); 348 p = xdr_decode_hyper(p, &res->count);
341 res->verifier.committed = be32_to_cpup(p); 349 res->verifier.committed = be32_to_cpup(p);
342 return decode_verifier(xdr, &res->verifier.verifier); 350 return decode_verifier(xdr, &res->verifier.verifier);
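
decode_write_response() above now rejects a non-zero stateid count, since the client never requests an asynchronous COPY. A small stand-alone sketch of the same check follows; the helper name and the use of ntohl() in place of be32_to_cpup() are assumptions made for illustration.

#include <arpa/inet.h>	/* ntohl() stands in for be32_to_cpup() */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative check mirroring decode_write_response(): a synchronous
 * COPY reply must not carry a write stateid, so a non-zero count is
 * treated as a protocol error. */
static int check_stateid_count(uint32_t wire_count_be)
{
	if (ntohl(wire_count_be) != 0) {
		fprintf(stderr, "server set unrequested asynchronous mode\n");
		return -EREMOTEIO;
	}
	return 0;
}

int main(void)
{
	printf("count 0 -> %d\n", check_stateid_count(htonl(0)));
	printf("count 1 -> %d\n", check_stateid_count(htonl(1)));
	return 0;
}
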
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 768456fa1b17..4be567a54958 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -185,6 +185,7 @@ struct nfs4_state {
185struct nfs4_exception { 185struct nfs4_exception {
186 struct nfs4_state *state; 186 struct nfs4_state *state;
187 struct inode *inode; 187 struct inode *inode;
188 nfs4_stateid *stateid;
188 long timeout; 189 long timeout;
189 unsigned char delay : 1, 190 unsigned char delay : 1,
190 recovering : 1, 191 recovering : 1,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 10410e8b5853..8d7d08d4f95f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
349 * Returns pointer to an NFS client, or an ERR_PTR value. 349 * Returns pointer to an NFS client, or an ERR_PTR value.
350 */ 350 */
351struct nfs_client *nfs4_init_client(struct nfs_client *clp, 351struct nfs_client *nfs4_init_client(struct nfs_client *clp,
352 const struct rpc_timeout *timeparms, 352 const struct nfs_client_initdata *cl_init)
353 const char *ip_addr)
354{ 353{
355 char buf[INET6_ADDRSTRLEN + 1]; 354 char buf[INET6_ADDRSTRLEN + 1];
355 const char *ip_addr = cl_init->ip_addr;
356 struct nfs_client *old; 356 struct nfs_client *old;
357 int error; 357 int error;
358 358
@@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
370 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); 370 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
371 __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); 371 __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
372 372
373 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); 373 error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
374 if (error == -EINVAL) 374 if (error == -EINVAL)
375 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); 375 error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
376 if (error < 0) 376 if (error < 0)
377 goto error; 377 goto error;
378 378
@@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server,
793 .hostname = hostname, 793 .hostname = hostname,
794 .addr = addr, 794 .addr = addr,
795 .addrlen = addrlen, 795 .addrlen = addrlen,
796 .ip_addr = ip_addr,
796 .nfs_mod = &nfs_v4, 797 .nfs_mod = &nfs_v4,
797 .proto = proto, 798 .proto = proto,
798 .minorversion = minorversion, 799 .minorversion = minorversion,
799 .net = net, 800 .net = net,
801 .timeparms = timeparms,
800 }; 802 };
801 struct nfs_client *clp; 803 struct nfs_client *clp;
802 int error; 804 int error;
@@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server,
809 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); 811 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
810 812
811 /* Allocate or find a client reference we can use */ 813 /* Allocate or find a client reference we can use */
812 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); 814 clp = nfs_get_client(&cl_init, authflavour);
813 if (IS_ERR(clp)) { 815 if (IS_ERR(clp)) {
814 error = PTR_ERR(clp); 816 error = PTR_ERR(clp);
815 goto error; 817 goto error;
@@ -842,20 +844,24 @@ error:
842 * low timeout interval so that if a connection is lost, we retry through 844 * low timeout interval so that if a connection is lost, we retry through
843 * the MDS. 845 * the MDS.
844 */ 846 */
845struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, 847struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
846 const struct sockaddr *ds_addr, int ds_addrlen, 848 const struct sockaddr *ds_addr, int ds_addrlen,
847 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, 849 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
848 u32 minor_version, rpc_authflavor_t au_flavor) 850 u32 minor_version, rpc_authflavor_t au_flavor)
849{ 851{
852 struct rpc_timeout ds_timeout;
853 struct nfs_client *mds_clp = mds_srv->nfs_client;
850 struct nfs_client_initdata cl_init = { 854 struct nfs_client_initdata cl_init = {
851 .addr = ds_addr, 855 .addr = ds_addr,
852 .addrlen = ds_addrlen, 856 .addrlen = ds_addrlen,
857 .nodename = mds_clp->cl_rpcclient->cl_nodename,
858 .ip_addr = mds_clp->cl_ipaddr,
853 .nfs_mod = &nfs_v4, 859 .nfs_mod = &nfs_v4,
854 .proto = ds_proto, 860 .proto = ds_proto,
855 .minorversion = minor_version, 861 .minorversion = minor_version,
856 .net = mds_clp->cl_net, 862 .net = mds_clp->cl_net,
863 .timeparms = &ds_timeout,
857 }; 864 };
858 struct rpc_timeout ds_timeout;
859 struct nfs_client *clp; 865 struct nfs_client *clp;
860 char buf[INET6_ADDRSTRLEN + 1]; 866 char buf[INET6_ADDRSTRLEN + 1];
861 867
@@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
863 return ERR_PTR(-EINVAL); 869 return ERR_PTR(-EINVAL);
864 cl_init.hostname = buf; 870 cl_init.hostname = buf;
865 871
872 if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
873 __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
874
866 /* 875 /*
867 * Set an authflavor equal to the MDS value. Use the MDS nfs_client 876
868 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS 877 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
869 * (section 13.1 RFC 5661). 878 * (section 13.1 RFC 5661).
870 */ 879 */
871 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); 880 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
872 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, 881 clp = nfs_get_client(&cl_init, au_flavor);
873 au_flavor);
874 882
875 dprintk("<-- %s %p\n", __func__, clp); 883 dprintk("<-- %s %p\n", __func__, clp);
876 return clp; 884 return clp;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 014b0e41ace5..d085ad794884 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
66 if (openflags & O_TRUNC) { 66 if (openflags & O_TRUNC) {
67 attr.ia_valid |= ATTR_SIZE; 67 attr.ia_valid |= ATTR_SIZE;
68 attr.ia_size = 0; 68 attr.ia_size = 0;
69 nfs_sync_inode(inode); 69 filemap_write_and_wait(inode->i_mapping);
70 } 70 }
71 71
72 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); 72 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
@@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
133 struct file *file_out, loff_t pos_out, 133 struct file *file_out, loff_t pos_out,
134 size_t count, unsigned int flags) 134 size_t count, unsigned int flags)
135{ 135{
136 struct inode *in_inode = file_inode(file_in); 136 if (file_inode(file_in) == file_inode(file_out))
137 struct inode *out_inode = file_inode(file_out);
138 int ret;
139
140 if (in_inode == out_inode)
141 return -EINVAL; 137 return -EINVAL;
142 138
143 /* flush any pending writes */
144 ret = nfs_sync_inode(in_inode);
145 if (ret)
146 return ret;
147 ret = nfs_sync_inode(out_inode);
148 if (ret)
149 return ret;
150
151 return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); 139 return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
152} 140}
153 141
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ff416d0e24bc..da5c9e58e907 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
363{ 363{
364 struct nfs_client *clp = server->nfs_client; 364 struct nfs_client *clp = server->nfs_client;
365 struct nfs4_state *state = exception->state; 365 struct nfs4_state *state = exception->state;
366 const nfs4_stateid *stateid = exception->stateid;
366 struct inode *inode = exception->inode; 367 struct inode *inode = exception->inode;
367 int ret = errorcode; 368 int ret = errorcode;
368 369
@@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
376 case -NFS4ERR_DELEG_REVOKED: 377 case -NFS4ERR_DELEG_REVOKED:
377 case -NFS4ERR_ADMIN_REVOKED: 378 case -NFS4ERR_ADMIN_REVOKED:
378 case -NFS4ERR_BAD_STATEID: 379 case -NFS4ERR_BAD_STATEID:
379 if (inode && nfs_async_inode_return_delegation(inode, 380 if (inode) {
380 NULL) == 0) 381 int err;
381 goto wait_on_recovery; 382
383 err = nfs_async_inode_return_delegation(inode,
384 stateid);
385 if (err == 0)
386 goto wait_on_recovery;
387 if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
388 exception->retry = 1;
389 break;
390 }
391 }
382 if (state == NULL) 392 if (state == NULL)
383 break; 393 break;
384 ret = nfs4_schedule_stateid_recovery(server, state); 394 ret = nfs4_schedule_stateid_recovery(server, state);
@@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
427 case -NFS4ERR_DELAY: 437 case -NFS4ERR_DELAY:
428 nfs_inc_server_stats(server, NFSIOS_DELAY); 438 nfs_inc_server_stats(server, NFSIOS_DELAY);
429 case -NFS4ERR_GRACE: 439 case -NFS4ERR_GRACE:
440 case -NFS4ERR_LAYOUTTRYLATER:
430 case -NFS4ERR_RECALLCONFLICT: 441 case -NFS4ERR_RECALLCONFLICT:
431 exception->delay = 1; 442 exception->delay = 1;
432 return 0; 443 return 0;
@@ -2669,28 +2680,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
2669 return res; 2680 return res;
2670} 2681}
2671 2682
2672static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2683static int _nfs4_do_setattr(struct inode *inode,
2673 struct nfs_fattr *fattr, struct iattr *sattr, 2684 struct nfs_setattrargs *arg,
2674 struct nfs4_state *state, struct nfs4_label *ilabel, 2685 struct nfs_setattrres *res,
2675 struct nfs4_label *olabel) 2686 struct rpc_cred *cred,
2687 struct nfs4_state *state)
2676{ 2688{
2677 struct nfs_server *server = NFS_SERVER(inode); 2689 struct nfs_server *server = NFS_SERVER(inode);
2678 struct nfs_setattrargs arg = {
2679 .fh = NFS_FH(inode),
2680 .iap = sattr,
2681 .server = server,
2682 .bitmask = server->attr_bitmask,
2683 .label = ilabel,
2684 };
2685 struct nfs_setattrres res = {
2686 .fattr = fattr,
2687 .label = olabel,
2688 .server = server,
2689 };
2690 struct rpc_message msg = { 2690 struct rpc_message msg = {
2691 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], 2691 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
2692 .rpc_argp = &arg, 2692 .rpc_argp = arg,
2693 .rpc_resp = &res, 2693 .rpc_resp = res,
2694 .rpc_cred = cred, 2694 .rpc_cred = cred,
2695 }; 2695 };
2696 struct rpc_cred *delegation_cred = NULL; 2696 struct rpc_cred *delegation_cred = NULL;
@@ -2699,17 +2699,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2699 bool truncate; 2699 bool truncate;
2700 int status; 2700 int status;
2701 2701
2702 arg.bitmask = nfs4_bitmask(server, ilabel); 2702 nfs_fattr_init(res->fattr);
2703 if (ilabel)
2704 arg.bitmask = nfs4_bitmask(server, olabel);
2705
2706 nfs_fattr_init(fattr);
2707 2703
2708 /* Servers should only apply open mode checks for file size changes */ 2704 /* Servers should only apply open mode checks for file size changes */
2709 truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; 2705 truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
2710 fmode = truncate ? FMODE_WRITE : FMODE_READ; 2706 fmode = truncate ? FMODE_WRITE : FMODE_READ;
2711 2707
2712 if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { 2708 if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
2713 /* Use that stateid */ 2709 /* Use that stateid */
2714 } else if (truncate && state != NULL) { 2710 } else if (truncate && state != NULL) {
2715 struct nfs_lockowner lockowner = { 2711 struct nfs_lockowner lockowner = {
@@ -2719,19 +2715,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2719 if (!nfs4_valid_open_stateid(state)) 2715 if (!nfs4_valid_open_stateid(state))
2720 return -EBADF; 2716 return -EBADF;
2721 if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, 2717 if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
2722 &arg.stateid, &delegation_cred) == -EIO) 2718 &arg->stateid, &delegation_cred) == -EIO)
2723 return -EBADF; 2719 return -EBADF;
2724 } else 2720 } else
2725 nfs4_stateid_copy(&arg.stateid, &zero_stateid); 2721 nfs4_stateid_copy(&arg->stateid, &zero_stateid);
2726 if (delegation_cred) 2722 if (delegation_cred)
2727 msg.rpc_cred = delegation_cred; 2723 msg.rpc_cred = delegation_cred;
2728 2724
2729 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); 2725 status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
2730 2726
2731 put_rpccred(delegation_cred); 2727 put_rpccred(delegation_cred);
2732 if (status == 0 && state != NULL) 2728 if (status == 0 && state != NULL)
2733 renew_lease(server, timestamp); 2729 renew_lease(server, timestamp);
2734 trace_nfs4_setattr(inode, &arg.stateid, status); 2730 trace_nfs4_setattr(inode, &arg->stateid, status);
2735 return status; 2731 return status;
2736} 2732}
2737 2733
@@ -2741,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2741 struct nfs4_label *olabel) 2737 struct nfs4_label *olabel)
2742{ 2738{
2743 struct nfs_server *server = NFS_SERVER(inode); 2739 struct nfs_server *server = NFS_SERVER(inode);
2740 struct nfs_setattrargs arg = {
2741 .fh = NFS_FH(inode),
2742 .iap = sattr,
2743 .server = server,
2744 .bitmask = server->attr_bitmask,
2745 .label = ilabel,
2746 };
2747 struct nfs_setattrres res = {
2748 .fattr = fattr,
2749 .label = olabel,
2750 .server = server,
2751 };
2744 struct nfs4_exception exception = { 2752 struct nfs4_exception exception = {
2745 .state = state, 2753 .state = state,
2746 .inode = inode, 2754 .inode = inode,
2755 .stateid = &arg.stateid,
2747 }; 2756 };
2748 int err; 2757 int err;
2758
2759 arg.bitmask = nfs4_bitmask(server, ilabel);
2760 if (ilabel)
2761 arg.bitmask = nfs4_bitmask(server, olabel);
2762
2749 do { 2763 do {
2750 err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); 2764 err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
2751 switch (err) { 2765 switch (err) {
2752 case -NFS4ERR_OPENMODE: 2766 case -NFS4ERR_OPENMODE:
2753 if (!(sattr->ia_valid & ATTR_SIZE)) { 2767 if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
3267 return status; 3281 return status;
3268} 3282}
3269 3283
3270static int nfs4_do_find_root_sec(struct nfs_server *server,
3271 struct nfs_fh *fhandle, struct nfs_fsinfo *info)
3272{
3273 int mv = server->nfs_client->cl_minorversion;
3274 return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
3275}
3276
3277/** 3284/**
3278 * nfs4_proc_get_rootfh - get file handle for server's pseudoroot 3285 * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
3279 * @server: initialized nfs_server handle 3286 * @server: initialized nfs_server handle
@@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
3293 status = nfs4_lookup_root(server, fhandle, info); 3300 status = nfs4_lookup_root(server, fhandle, info);
3294 3301
3295 if (auth_probe || status == NFS4ERR_WRONGSEC) 3302 if (auth_probe || status == NFS4ERR_WRONGSEC)
3296 status = nfs4_do_find_root_sec(server, fhandle, info); 3303 status = server->nfs_client->cl_mvops->find_root_sec(server,
3304 fhandle, info);
3297 3305
3298 if (status == 0) 3306 if (status == 0)
3299 status = nfs4_server_capabilities(server, fhandle); 3307 status = nfs4_server_capabilities(server, fhandle);
@@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
4392 struct rpc_message *msg) 4400 struct rpc_message *msg)
4393{ 4401{
4394 hdr->timestamp = jiffies; 4402 hdr->timestamp = jiffies;
4395 hdr->pgio_done_cb = nfs4_read_done_cb; 4403 if (!hdr->pgio_done_cb)
4404 hdr->pgio_done_cb = nfs4_read_done_cb;
4396 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 4405 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
4397 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); 4406 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
4398} 4407}
@@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
7869 struct inode *inode = lgp->args.inode; 7878 struct inode *inode = lgp->args.inode;
7870 struct nfs_server *server = NFS_SERVER(inode); 7879 struct nfs_server *server = NFS_SERVER(inode);
7871 struct pnfs_layout_hdr *lo; 7880 struct pnfs_layout_hdr *lo;
7872 int status = task->tk_status; 7881 int nfs4err = task->tk_status;
7882 int err, status = 0;
7883 LIST_HEAD(head);
7873 7884
7874 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); 7885 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
7875 7886
7876 switch (status) { 7887 switch (nfs4err) {
7877 case 0: 7888 case 0:
7878 goto out; 7889 goto out;
7879 7890
@@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
7905 status = -EOVERFLOW; 7916 status = -EOVERFLOW;
7906 goto out; 7917 goto out;
7907 } 7918 }
7908 /* Fallthrough */ 7919 status = -EBUSY;
7920 break;
7909 case -NFS4ERR_RECALLCONFLICT: 7921 case -NFS4ERR_RECALLCONFLICT:
7910 nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
7911 exception);
7912 status = -ERECALLCONFLICT; 7922 status = -ERECALLCONFLICT;
7913 goto out; 7923 break;
7914 case -NFS4ERR_EXPIRED: 7924 case -NFS4ERR_EXPIRED:
7915 case -NFS4ERR_BAD_STATEID: 7925 case -NFS4ERR_BAD_STATEID:
7916 exception->timeout = 0; 7926 exception->timeout = 0;
7917 spin_lock(&inode->i_lock); 7927 spin_lock(&inode->i_lock);
7918 if (nfs4_stateid_match(&lgp->args.stateid, 7928 lo = NFS_I(inode)->layout;
7929 /* If the open stateid was bad, then recover it. */
7930 if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
7931 nfs4_stateid_match_other(&lgp->args.stateid,
7919 &lgp->args.ctx->state->stateid)) { 7932 &lgp->args.ctx->state->stateid)) {
7920 spin_unlock(&inode->i_lock); 7933 spin_unlock(&inode->i_lock);
7921 /* If the open stateid was bad, then recover it. */
7922 exception->state = lgp->args.ctx->state; 7934 exception->state = lgp->args.ctx->state;
7923 break; 7935 break;
7924 } 7936 }
7925 lo = NFS_I(inode)->layout;
7926 if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) &&
7927 nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
7928 LIST_HEAD(head);
7929
7930 /*
7931 * Mark the bad layout state as invalid, then retry
7932 * with the current stateid.
7933 */
7934 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
7935 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
7936 spin_unlock(&inode->i_lock);
7937 pnfs_free_lseg_list(&head);
7938 status = -EAGAIN;
7939 goto out;
7940 } else
7941 spin_unlock(&inode->i_lock);
7942 }
7943 7937
7944 status = nfs4_handle_exception(server, status, exception); 7938 /*
7945 if (exception->retry) 7939 * Mark the bad layout state as invalid, then retry
7940 */
7941 pnfs_mark_layout_stateid_invalid(lo, &head);
7942 spin_unlock(&inode->i_lock);
7943 pnfs_free_lseg_list(&head);
7946 status = -EAGAIN; 7944 status = -EAGAIN;
7945 goto out;
7946 }
7947
7948 err = nfs4_handle_exception(server, nfs4err, exception);
7949 if (!status) {
7950 if (exception->retry)
7951 status = -EAGAIN;
7952 else
7953 status = err;
7954 }
7947out: 7955out:
7948 dprintk("<-- %s\n", __func__); 7956 dprintk("<-- %s\n", __func__);
7949 return status; 7957 return status;
@@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata)
8129 spin_lock(&lo->plh_inode->i_lock); 8137 spin_lock(&lo->plh_inode->i_lock);
8130 pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, 8138 pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
8131 be32_to_cpu(lrp->args.stateid.seqid)); 8139 be32_to_cpu(lrp->args.stateid.seqid));
8132 pnfs_mark_layout_returned_if_empty(lo); 8140 if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
8133 if (lrp->res.lrs_present)
8134 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); 8141 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
8135 pnfs_clear_layoutreturn_waitbit(lo); 8142 pnfs_clear_layoutreturn_waitbit(lo);
8136 spin_unlock(&lo->plh_inode->i_lock); 8143 spin_unlock(&lo->plh_inode->i_lock);
@@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
8835#endif 8842#endif
8836}; 8843};
8837 8844
8838ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) 8845static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
8839{ 8846{
8840 ssize_t error, error2; 8847 ssize_t error, error2;
8841 8848
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 661e753fe1c9..7bd3a5c09d31 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
1985 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ 1985 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
1986 *p = cpu_to_be32(0); /* reclaim */ 1986 *p = cpu_to_be32(0); /* reclaim */
1987 encode_nfs4_stateid(xdr, &args->stateid); 1987 encode_nfs4_stateid(xdr, &args->stateid);
1988 p = reserve_space(xdr, 20); 1988 if (args->lastbytewritten != U64_MAX) {
1989 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1989 p = reserve_space(xdr, 20);
1990 p = xdr_encode_hyper(p, args->lastbytewritten); 1990 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
1991 p = xdr_encode_hyper(p, args->lastbytewritten);
1992 } else {
1993 p = reserve_space(xdr, 12);
1994 *p++ = cpu_to_be32(0); /* newoffset = FALSE */
1995 }
1991 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1996 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
1992 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1997 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
1993 1998
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 31c7763b94d5..2ca9167bc97d 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -37,7 +37,6 @@
37 { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ 37 { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
38 { 1 << NFS_INO_STALE, "STALE" }, \ 38 { 1 << NFS_INO_STALE, "STALE" }, \
39 { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ 39 { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
40 { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
41 { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ 40 { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
42 { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ 41 { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
43 { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) 42 { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0fbe734cc38c..70806cae0d36 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
259 * is required. 259 * is required.
260 * Note that caller must hold inode->i_lock. 260 * Note that caller must hold inode->i_lock.
261 */ 261 */
262static int 262int
263pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, 263pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
264 struct list_head *lseg_list) 264 struct list_head *lseg_list)
265{ 265{
@@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
334} 334}
335 335
336static void 336static void
337init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 337pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
338 const struct pnfs_layout_range *range,
339 const nfs4_stateid *stateid)
338{ 340{
339 INIT_LIST_HEAD(&lseg->pls_list); 341 INIT_LIST_HEAD(&lseg->pls_list);
340 INIT_LIST_HEAD(&lseg->pls_lc_list); 342 INIT_LIST_HEAD(&lseg->pls_lc_list);
341 atomic_set(&lseg->pls_refcount, 1); 343 atomic_set(&lseg->pls_refcount, 1);
342 smp_mb();
343 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 344 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
344 lseg->pls_layout = lo; 345 lseg->pls_layout = lo;
346 lseg->pls_range = *range;
347 lseg->pls_seq = be32_to_cpu(stateid->seqid);
345} 348}
346 349
347static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) 350static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
@@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
486 (end2 == NFS4_MAX_UINT64 || end2 > start1); 489 (end2 == NFS4_MAX_UINT64 || end2 > start1);
487} 490}
488 491
489static bool
490should_free_lseg(const struct pnfs_layout_range *lseg_range,
491 const struct pnfs_layout_range *recall_range)
492{
493 return (recall_range->iomode == IOMODE_ANY ||
494 lseg_range->iomode == recall_range->iomode) &&
495 pnfs_lseg_range_intersecting(lseg_range, recall_range);
496}
497
498static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, 492static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
499 struct list_head *tmp_list) 493 struct list_head *tmp_list)
500{ 494{
@@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
533 return (s32)(s1 - s2) > 0; 527 return (s32)(s1 - s2) > 0;
534} 528}
535 529
530static bool
531pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
532 const struct pnfs_layout_range *recall_range)
533{
534 return (recall_range->iomode == IOMODE_ANY ||
535 lseg_range->iomode == recall_range->iomode) &&
536 pnfs_lseg_range_intersecting(lseg_range, recall_range);
537}
538
539static bool
540pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
541 const struct pnfs_layout_range *recall_range,
542 u32 seq)
543{
544 if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
545 return false;
546 if (recall_range == NULL)
547 return true;
548 return pnfs_should_free_range(&lseg->pls_range, recall_range);
549}
550
536/** 551/**
537 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later 552 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
538 * @lo: layout header containing the lsegs 553 * @lo: layout header containing the lsegs
@@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
562 if (list_empty(&lo->plh_segs)) 577 if (list_empty(&lo->plh_segs))
563 return 0; 578 return 0;
564 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 579 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
565 if (!recall_range || 580 if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
566 should_free_lseg(&lseg->pls_range, recall_range)) {
567 if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
568 continue;
569 dprintk("%s: freeing lseg %p iomode %d seq %u" 581 dprintk("%s: freeing lseg %p iomode %d seq %u"
570 "offset %llu length %llu\n", __func__, 582 "offset %llu length %llu\n", __func__,
571 lseg, lseg->pls_range.iomode, lseg->pls_seq, 583 lseg, lseg->pls_range.iomode, lseg->pls_seq,
@@ -761,24 +773,25 @@ void
761pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 773pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
762 bool update_barrier) 774 bool update_barrier)
763{ 775{
764 u32 oldseq, newseq, new_barrier; 776 u32 oldseq, newseq, new_barrier = 0;
765 int empty = list_empty(&lo->plh_segs); 777 bool invalid = !pnfs_layout_is_valid(lo);
766 778
767 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 779 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
768 newseq = be32_to_cpu(new->seqid); 780 newseq = be32_to_cpu(new->seqid);
769 if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { 781 if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
770 nfs4_stateid_copy(&lo->plh_stateid, new); 782 nfs4_stateid_copy(&lo->plh_stateid, new);
771 if (update_barrier) { 783 /*
772 new_barrier = be32_to_cpu(new->seqid); 784 * Because of wraparound, we want to keep the barrier
773 } else { 785 * "close" to the current seqids.
774 /* Because of wraparound, we want to keep the barrier 786 */
775 * "close" to the current seqids. 787 new_barrier = newseq - atomic_read(&lo->plh_outstanding);
776 */
777 new_barrier = newseq - atomic_read(&lo->plh_outstanding);
778 }
779 if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
780 lo->plh_barrier = new_barrier;
781 } 788 }
789 if (update_barrier)
790 new_barrier = be32_to_cpu(new->seqid);
791 else if (new_barrier == 0)
792 return;
793 if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
794 lo->plh_barrier = new_barrier;
782} 795}
783 796
784static bool 797static bool
@@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
873 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); 886 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
874} 887}
875 888
889static void
890pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
891{
892 lo->plh_return_iomode = 0;
893 lo->plh_return_seq = 0;
894 clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
895}
896
876static bool 897static bool
877pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) 898pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
899 nfs4_stateid *stateid,
900 enum pnfs_iomode *iomode)
878{ 901{
879 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) 902 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
880 return false; 903 return false;
881 lo->plh_return_iomode = 0;
882 lo->plh_return_seq = 0;
883 pnfs_get_layout_hdr(lo); 904 pnfs_get_layout_hdr(lo);
884 clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 905 if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
906 if (stateid != NULL) {
907 nfs4_stateid_copy(stateid, &lo->plh_stateid);
908 if (lo->plh_return_seq != 0)
909 stateid->seqid = cpu_to_be32(lo->plh_return_seq);
910 }
911 if (iomode != NULL)
912 *iomode = lo->plh_return_iomode;
913 pnfs_clear_layoutreturn_info(lo);
914 return true;
915 }
916 if (stateid != NULL)
917 nfs4_stateid_copy(stateid, &lo->plh_stateid);
918 if (iomode != NULL)
919 *iomode = IOMODE_ANY;
885 return true; 920 return true;
886} 921}
887 922
@@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
949 enum pnfs_iomode iomode; 984 enum pnfs_iomode iomode;
950 bool send; 985 bool send;
951 986
952 nfs4_stateid_copy(&stateid, &lo->plh_stateid); 987 send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
953 stateid.seqid = cpu_to_be32(lo->plh_return_seq);
954 iomode = lo->plh_return_iomode;
955 send = pnfs_prepare_layoutreturn(lo);
956 spin_unlock(&inode->i_lock); 988 spin_unlock(&inode->i_lock);
957 if (send) { 989 if (send) {
958 /* Send an async layoutreturn so we dont deadlock */ 990 /* Send an async layoutreturn so we dont deadlock */
@@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino)
989 dprintk("NFS: %s no layout to return\n", __func__); 1021 dprintk("NFS: %s no layout to return\n", __func__);
990 goto out; 1022 goto out;
991 } 1023 }
992 nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
993 /* Reference matched in nfs4_layoutreturn_release */ 1024 /* Reference matched in nfs4_layoutreturn_release */
994 pnfs_get_layout_hdr(lo); 1025 pnfs_get_layout_hdr(lo);
995 empty = list_empty(&lo->plh_segs); 1026 empty = list_empty(&lo->plh_segs);
@@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino)
1012 goto out_put_layout_hdr; 1043 goto out_put_layout_hdr;
1013 } 1044 }
1014 1045
1015 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 1046 send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
1016 send = pnfs_prepare_layoutreturn(lo);
1017 spin_unlock(&ino->i_lock); 1047 spin_unlock(&ino->i_lock);
1018 pnfs_free_lseg_list(&tmp_list); 1048 pnfs_free_lseg_list(&tmp_list);
1019 if (send) 1049 if (send)
@@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino)
1080 goto out_noroc; 1110 goto out_noroc;
1081 } 1111 }
1082 1112
1083 nfs4_stateid_copy(&stateid, &lo->plh_stateid);
1084 /* always send layoutreturn if being marked so */ 1113 /* always send layoutreturn if being marked so */
1085 if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, 1114 if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
1086 &lo->plh_flags)) 1115 layoutreturn = pnfs_prepare_layoutreturn(lo,
1087 layoutreturn = pnfs_prepare_layoutreturn(lo); 1116 &stateid, NULL);
1088 1117
1089 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1118 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
1090 /* If we are sending layoutreturn, invalidate all valid lsegs */ 1119 /* If we are sending layoutreturn, invalidate all valid lsegs */
@@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
1132 1161
1133 spin_lock(&ino->i_lock); 1162 spin_lock(&ino->i_lock);
1134 lo = NFS_I(ino)->layout; 1163 lo = NFS_I(ino)->layout;
1135 pnfs_mark_layout_returned_if_empty(lo);
1136 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) 1164 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
1137 lo->plh_barrier = barrier; 1165 lo->plh_barrier = barrier;
1138 spin_unlock(&ino->i_lock); 1166 spin_unlock(&ino->i_lock);
@@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino,
1505 struct pnfs_layout_segment *lseg = NULL; 1533 struct pnfs_layout_segment *lseg = NULL;
1506 nfs4_stateid stateid; 1534 nfs4_stateid stateid;
1507 long timeout = 0; 1535 long timeout = 0;
1508 unsigned long giveup = jiffies + rpc_get_timeout(server->client); 1536 unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
1509 bool first; 1537 bool first;
1510 1538
1511 if (!pnfs_enabled_sb(NFS_SERVER(ino))) { 1539 if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
@@ -1645,33 +1673,44 @@ lookup_again:
1645 lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); 1673 lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
1646 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 1674 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1647 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); 1675 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1676 atomic_dec(&lo->plh_outstanding);
1648 if (IS_ERR(lseg)) { 1677 if (IS_ERR(lseg)) {
1649 switch(PTR_ERR(lseg)) { 1678 switch(PTR_ERR(lseg)) {
1650 case -ERECALLCONFLICT: 1679 case -EBUSY:
1651 if (time_after(jiffies, giveup)) 1680 if (time_after(jiffies, giveup))
1652 lseg = NULL; 1681 lseg = NULL;
1653 /* Fallthrough */ 1682 break;
1654 case -EAGAIN: 1683 case -ERECALLCONFLICT:
1655 pnfs_put_layout_hdr(lo); 1684 /* Huh? We hold no layouts, how is there a recall? */
1656 if (first) 1685 if (first) {
1657 pnfs_clear_first_layoutget(lo); 1686 lseg = NULL;
1658 if (lseg) { 1687 break;
1659 trace_pnfs_update_layout(ino, pos, count,
1660 iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
1661 goto lookup_again;
1662 } 1688 }
1689 /* Destroy the existing layout and start over */
1690 if (time_after(jiffies, giveup))
1691 pnfs_destroy_layout(NFS_I(ino));
1663 /* Fallthrough */ 1692 /* Fallthrough */
1693 case -EAGAIN:
1694 break;
1664 default: 1695 default:
1665 if (!nfs_error_is_fatal(PTR_ERR(lseg))) { 1696 if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
1666 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 1697 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1667 lseg = NULL; 1698 lseg = NULL;
1668 } 1699 }
1700 goto out_put_layout_hdr;
1701 }
1702 if (lseg) {
1703 if (first)
1704 pnfs_clear_first_layoutget(lo);
1705 trace_pnfs_update_layout(ino, pos, count,
1706 iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
1707 pnfs_put_layout_hdr(lo);
1708 goto lookup_again;
1669 } 1709 }
1670 } else { 1710 } else {
1671 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 1711 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1672 } 1712 }
1673 1713
1674 atomic_dec(&lo->plh_outstanding);
1675out_put_layout_hdr: 1714out_put_layout_hdr:
1676 if (first) 1715 if (first)
1677 pnfs_clear_first_layoutget(lo); 1716 pnfs_clear_first_layoutget(lo);
@@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1735 return lseg; 1774 return lseg;
1736 } 1775 }
1737 1776
1738 init_lseg(lo, lseg); 1777 pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
1739 lseg->pls_range = res->range;
1740 lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
1741 1778
1742 spin_lock(&ino->i_lock); 1779 spin_lock(&ino->i_lock);
1743 if (pnfs_layoutgets_blocked(lo)) { 1780 if (pnfs_layoutgets_blocked(lo)) {
@@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1758 * inode invalid, and don't bother validating the stateid 1795 * inode invalid, and don't bother validating the stateid
1759 * sequence number. 1796 * sequence number.
1760 */ 1797 */
1761 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); 1798 pnfs_mark_layout_stateid_invalid(lo, &free_me);
1762 1799
1763 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); 1800 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1764 lo->plh_barrier = be32_to_cpu(res->stateid.seqid); 1801 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1765 } 1802 }
1766 1803
1767 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1768
1769 pnfs_get_lseg(lseg); 1804 pnfs_get_lseg(lseg);
1770 pnfs_layout_insert_lseg(lo, lseg, &free_me); 1805 pnfs_layout_insert_lseg(lo, lseg, &free_me);
1806 if (!pnfs_layout_is_valid(lo)) {
1807 pnfs_clear_layoutreturn_info(lo);
1808 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1809 }
1810
1771 1811
1772 if (res->return_on_close) 1812 if (res->return_on_close)
1773 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); 1813 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
@@ -1787,14 +1827,14 @@ static void
1787pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, 1827pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
1788 u32 seq) 1828 u32 seq)
1789{ 1829{
1790 if (lo->plh_return_iomode == iomode) 1830 if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
1791 return;
1792 if (lo->plh_return_iomode != 0)
1793 iomode = IOMODE_ANY; 1831 iomode = IOMODE_ANY;
1794 lo->plh_return_iomode = iomode; 1832 lo->plh_return_iomode = iomode;
1795 set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 1833 set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
1796 if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) 1834 if (seq != 0) {
1835 WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
1797 lo->plh_return_seq = seq; 1836 lo->plh_return_seq = seq;
1837 }
1798} 1838}
1799 1839
1800/** 1840/**
@@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1824 assert_spin_locked(&lo->plh_inode->i_lock); 1864 assert_spin_locked(&lo->plh_inode->i_lock);
1825 1865
1826 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 1866 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1827 if (should_free_lseg(&lseg->pls_range, return_range)) { 1867 if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
1828 dprintk("%s: marking lseg %p iomode %d " 1868 dprintk("%s: marking lseg %p iomode %d "
1829 "offset %llu length %llu\n", __func__, 1869 "offset %llu length %llu\n", __func__,
1830 lseg, lseg->pls_range.iomode, 1870 lseg, lseg->pls_range.iomode,
@@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
1855 bool return_now = false; 1895 bool return_now = false;
1856 1896
1857 spin_lock(&inode->i_lock); 1897 spin_lock(&inode->i_lock);
1858 pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); 1898 pnfs_set_plh_return_info(lo, range.iomode, 0);
1859 /* 1899 /*
1860 * mark all matching lsegs so that we are sure to have no live 1900 * mark all matching lsegs so that we are sure to have no live
1861 * segments at hand when sending layoutreturn. See pnfs_put_lseg() 1901 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1862 * for how it works. 1902 * for how it works.
1863 */ 1903 */
1864 if (!pnfs_mark_matching_lsegs_return(lo, &free_me, 1904 if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
1865 &range, lseg->pls_seq)) {
1866 nfs4_stateid stateid; 1905 nfs4_stateid stateid;
1867 enum pnfs_iomode iomode = lo->plh_return_iomode; 1906 enum pnfs_iomode iomode;
1868 1907
1869 nfs4_stateid_copy(&stateid, &lo->plh_stateid); 1908 return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
1870 return_now = pnfs_prepare_layoutreturn(lo);
1871 spin_unlock(&inode->i_lock); 1909 spin_unlock(&inode->i_lock);
1872 if (return_now) 1910 if (return_now)
1873 pnfs_send_layoutreturn(lo, &stateid, iomode, false); 1911 pnfs_send_layoutreturn(lo, &stateid, iomode, false);
@@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
2382 nfs_fattr_init(&data->fattr); 2420 nfs_fattr_init(&data->fattr);
2383 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 2421 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
2384 data->res.fattr = &data->fattr; 2422 data->res.fattr = &data->fattr;
2385 data->args.lastbytewritten = end_pos - 1; 2423 if (end_pos != 0)
2424 data->args.lastbytewritten = end_pos - 1;
2425 else
2426 data->args.lastbytewritten = U64_MAX;
2386 data->res.server = NFS_SERVER(inode); 2427 data->res.server = NFS_SERVER(inode);
2387 2428
2388 if (ld->prepare_layoutcommit) { 2429 if (ld->prepare_layoutcommit) {
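
Several of the pnfs.c hunks above (the reworked barrier update in pnfs_set_layout_stateid() and the new pnfs_prepare_layoutreturn() contract) lean on pnfs_seqid_is_newer(), which compares 32-bit layout sequence ids with signed subtraction so that wraparound is handled correctly. A minimal user-space model of that comparison, for illustration only:

    /* Hedged sketch: sequence ids are 32-bit and wrap, so "newer" is
     * decided with signed subtraction rather than a plain '>'. */
    #include <stdint.h>
    #include <stdio.h>

    static int seqid_is_newer(uint32_t s1, uint32_t s2)
    {
            return (int32_t)(s1 - s2) > 0;
    }

    int main(void)
    {
            printf("%d\n", seqid_is_newer(5, 3));           /* 1: plainly newer */
            printf("%d\n", seqid_is_newer(3, 5));           /* 0: older */
            printf("%d\n", seqid_is_newer(2, 0xfffffffe));  /* 1: newer across wraparound */
            return 0;
    }
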
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index b21bd0bee784..31d99b2927b0 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
268 struct list_head *tmp_list, 268 struct list_head *tmp_list,
269 const struct pnfs_layout_range *recall_range, 269 const struct pnfs_layout_range *recall_range,
270 u32 seq); 270 u32 seq);
271int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
272 struct list_head *lseg_list);
271bool pnfs_roc(struct inode *ino); 273bool pnfs_roc(struct inode *ino);
272void pnfs_roc_release(struct inode *ino); 274void pnfs_roc_release(struct inode *ino);
273void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 275void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode)
375 return NFS_I(inode)->layout != NULL; 377 return NFS_I(inode)->layout != NULL;
376} 378}
377 379
380static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
381{
382 return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
383}
384
378static inline struct nfs4_deviceid_node * 385static inline struct nfs4_deviceid_node *
379nfs4_get_deviceid(struct nfs4_deviceid_node *d) 386nfs4_get_deviceid(struct nfs4_deviceid_node *d)
380{ 387{
@@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end)
545 return 1 + end - offset; 552 return 1 + end - offset;
546} 553}
547 554
548/**
549 * pnfs_mark_layout_returned_if_empty - marks the layout as returned
550 * @lo: layout header
551 *
552 * Note: Caller must hold inode->i_lock
553 */
554static inline void
555pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
556{
557 if (list_empty(&lo->plh_segs))
558 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
559}
560
561static inline void 555static inline void
562pnfs_copy_range(struct pnfs_layout_range *dst, 556pnfs_copy_range(struct pnfs_layout_range *dst,
563 const struct pnfs_layout_range *src) 557 const struct pnfs_layout_range *src)
@@ -629,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync)
629} 623}
630 624
631static inline bool 625static inline bool
626pnfs_layoutcommit_outstanding(struct inode *inode)
627{
628 return false;
629}
630
631
632static inline bool
632pnfs_roc(struct inode *ino) 633pnfs_roc(struct inode *ino)
633{ 634{
634 return false; 635 return false;
@@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
716 return false; 717 return false;
717} 718}
718 719
719static inline bool
720pnfs_layoutcommit_outstanding(struct inode *inode)
721{
722 return false;
723}
724
725
726static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 720static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
727{ 721{
728 return NULL; 722 return NULL;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index b38e3c0dc790..f3468b57a32a 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
595} 595}
596 596
597static struct nfs_client *(*get_v3_ds_connect)( 597static struct nfs_client *(*get_v3_ds_connect)(
598 struct nfs_client *mds_clp, 598 struct nfs_server *mds_srv,
599 const struct sockaddr *ds_addr, 599 const struct sockaddr *ds_addr,
600 int ds_addrlen, 600 int ds_addrlen,
601 int ds_proto, 601 int ds_proto,
@@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
654 rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, 654 rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
655 rpc_clnt_test_and_add_xprt, NULL); 655 rpc_clnt_test_and_add_xprt, NULL);
656 } else 656 } else
657 clp = get_v3_ds_connect(mds_srv->nfs_client, 657 clp = get_v3_ds_connect(mds_srv,
658 (struct sockaddr *)&da->da_addr, 658 (struct sockaddr *)&da->da_addr,
659 da->da_addrlen, IPPROTO_TCP, 659 da->da_addrlen, IPPROTO_TCP,
660 timeo, retrans, au_flavor); 660 timeo, retrans, au_flavor);
@@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
690 dprintk("%s: DS %s: trying address %s\n", 690 dprintk("%s: DS %s: trying address %s\n",
691 __func__, ds->ds_remotestr, da->da_remotestr); 691 __func__, ds->ds_remotestr, da->da_remotestr);
692 692
693 clp = nfs4_set_ds_client(mds_srv->nfs_client, 693 clp = nfs4_set_ds_client(mds_srv,
694 (struct sockaddr *)&da->da_addr, 694 (struct sockaddr *)&da->da_addr,
695 da->da_addrlen, IPPROTO_TCP, 695 da->da_addrlen, IPPROTO_TCP,
696 timeo, retrans, minor_version, 696 timeo, retrans, minor_version,
@@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
940int 940int
941pnfs_nfs_generic_sync(struct inode *inode, bool datasync) 941pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
942{ 942{
943 int ret;
944
945 if (!pnfs_layoutcommit_outstanding(inode))
946 return 0;
947 ret = nfs_commit_inode(inode, FLUSH_SYNC);
948 if (ret < 0)
949 return ret;
943 if (datasync) 950 if (datasync)
944 return 0; 951 return 0;
945 return pnfs_layoutcommit_inode(inode, true); 952 return pnfs_layoutcommit_inode(inode, true);
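
The pnfs_nfs_generic_sync() hunk above makes the sync path return early when no LAYOUTCOMMIT is outstanding, and otherwise flush the commit lists before issuing the LAYOUTCOMMIT. A rough, runnable sketch of the resulting ordering — the helpers are stand-ins for the NFS internals, not the kernel API:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins (assumptions): model pnfs_layoutcommit_outstanding(),
     * nfs_commit_inode(inode, FLUSH_SYNC) and pnfs_layoutcommit_inode(). */
    static bool layoutcommit_outstanding(void) { return true; }
    static int  commit_dirty_data(void)        { puts("COMMIT");       return 0; }
    static int  send_layoutcommit(void)        { puts("LAYOUTCOMMIT"); return 0; }

    static int generic_sync(bool datasync)
    {
            if (!layoutcommit_outstanding())
                    return 0;                 /* nothing pNFS-specific to do */
            if (commit_dirty_data() < 0)      /* data must be stable first */
                    return -1;
            if (datasync)
                    return 0;                 /* fdatasync(): metadata update can wait */
            return send_layoutcommit();
    }

    int main(void)
    {
            return generic_sync(false);       /* prints COMMIT, then LAYOUTCOMMIT */
    }
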
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2137e0202f25..18d446e1a82b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
1684{ 1684{
1685 rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; 1685 rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
1686 unsigned int i; 1686 unsigned int i;
1687 int use_auth_null = false;
1687 1688
1688 /* 1689 /*
1689 * If the sec= mount option is used, the specified flavor or AUTH_NULL 1690 * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
1691 * 1692 *
1692 * AUTH_NULL has a special meaning when it's in the server list - it 1693 * AUTH_NULL has a special meaning when it's in the server list - it
1693 * means that the server will ignore the rpc creds, so any flavor 1694 * means that the server will ignore the rpc creds, so any flavor
1694 * can be used. 1695 * can be used but still use the sec= that was specified.
1695 */ 1696 */
1696 for (i = 0; i < count; i++) { 1697 for (i = 0; i < count; i++) {
1697 flavor = server_authlist[i]; 1698 flavor = server_authlist[i];
1698 1699
1699 if (nfs_auth_info_match(&args->auth_info, flavor) || 1700 if (nfs_auth_info_match(&args->auth_info, flavor))
1700 flavor == RPC_AUTH_NULL)
1701 goto out; 1701 goto out;
1702
1703 if (flavor == RPC_AUTH_NULL)
1704 use_auth_null = true;
1705 }
1706
1707 if (use_auth_null) {
1708 flavor = RPC_AUTH_NULL;
1709 goto out;
1702 } 1710 }
1703 1711
1704 dfprintk(MOUNT, 1712 dfprintk(MOUNT,
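
The nfs_verify_authflavors() hunk above stops treating AUTH_NULL in the server's flavour list as an immediate match: the whole list is first scanned for the flavour that sec= asked for, and AUTH_NULL is only used as a fallback afterwards. A simplified model of that selection order (the flavour constants are placeholders, not the SUNRPC definitions):

    #include <stdio.h>

    #define FLAVOR_NULL 0
    #define FLAVOR_KRB5 390003

    /* Returns the flavour to use, or -1 if the server offers nothing usable. */
    static int pick_flavor(const int *server_list, int n, int wanted)
    {
            int saw_null = 0, i;

            for (i = 0; i < n; i++) {
                    if (server_list[i] == wanted)
                            return wanted;          /* exact match for sec= wins */
                    if (server_list[i] == FLAVOR_NULL)
                            saw_null = 1;           /* remember it, keep scanning */
            }
            return saw_null ? FLAVOR_NULL : -1;     /* AUTH_NULL: server ignores creds */
    }

    int main(void)
    {
            int list[] = { FLAVOR_NULL, FLAVOR_KRB5 };

            /* Previously the loop stopped at AUTH_NULL; now sec=krb5 still matches krb5. */
            printf("%d\n", pick_flavor(list, 2, FLAVOR_KRB5));
            return 0;
    }
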
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 593fa21a02c0..3a6724c6eb5f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
625 int err; 625 int err;
626 626
627 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 627 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
628 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), 628 nfs_pageio_init_write(&pgio, inode, 0,
629 false, &nfs_async_write_completion_ops); 629 false, &nfs_async_write_completion_ops);
630 err = nfs_do_writepage(page, wbc, &pgio, launder); 630 err = nfs_do_writepage(page, wbc, &pgio, launder);
631 nfs_pageio_complete(&pgio); 631 nfs_pageio_complete(&pgio);
@@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
657int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 657int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
658{ 658{
659 struct inode *inode = mapping->host; 659 struct inode *inode = mapping->host;
660 unsigned long *bitlock = &NFS_I(inode)->flags;
661 struct nfs_pageio_descriptor pgio; 660 struct nfs_pageio_descriptor pgio;
662 int err; 661 int err;
663 662
664 /* Stop dirtying of new pages while we sync */
665 err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
666 nfs_wait_bit_killable, TASK_KILLABLE);
667 if (err)
668 goto out_err;
669
670 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 663 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
671 664
672 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, 665 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
674 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 667 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
675 nfs_pageio_complete(&pgio); 668 nfs_pageio_complete(&pgio);
676 669
677 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
678 smp_mb__after_atomic();
679 wake_up_bit(bitlock, NFS_INO_FLUSHING);
680
681 if (err < 0) 670 if (err < 0)
682 goto out_err; 671 goto out_err;
683 err = pgio.pg_error; 672 err = pgio.pg_error;
@@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
1195/* 1184/*
1196 * Test if the open context credential key is marked to expire soon. 1185 * Test if the open context credential key is marked to expire soon.
1197 */ 1186 */
1198bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) 1187bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
1199{ 1188{
1200 return rpcauth_cred_key_to_expire(ctx->cred); 1189 struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
1190
1191 return rpcauth_cred_key_to_expire(auth, ctx->cred);
1201} 1192}
1202 1193
1203/* 1194/*
@@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
1289 dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", 1280 dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
1290 file, count, (long long)(page_file_offset(page) + offset)); 1281 file, count, (long long)(page_file_offset(page) + offset));
1291 1282
1283 if (!count)
1284 goto out;
1285
1292 if (nfs_can_extend_write(file, page, inode)) { 1286 if (nfs_can_extend_write(file, page, inode)) {
1293 count = max(count + offset, nfs_page_length(page)); 1287 count = max(count + offset, nfs_page_length(page));
1294 offset = 0; 1288 offset = 0;
@@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
1299 nfs_set_pageerror(page); 1293 nfs_set_pageerror(page);
1300 else 1294 else
1301 __set_page_dirty_nobuffers(page); 1295 __set_page_dirty_nobuffers(page);
1302 1296out:
1303 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 1297 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
1304 status, (long long)i_size_read(inode)); 1298 status, (long long)i_size_read(inode));
1305 return status; 1299 return status;
@@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1800 1794
1801 /* Okay, COMMIT succeeded, apparently. Check the verifier 1795 /* Okay, COMMIT succeeded, apparently. Check the verifier
1802 * returned by the server against all stored verfs. */ 1796 * returned by the server against all stored verfs. */
1803 if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { 1797 if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
1804 /* We have a match */ 1798 /* We have a match */
1805 nfs_inode_remove_request(req); 1799 nfs_inode_remove_request(req);
1806 dprintk(" OK\n"); 1800 dprintk(" OK\n");
@@ -1924,6 +1918,24 @@ out_mark_dirty:
1924EXPORT_SYMBOL_GPL(nfs_write_inode); 1918EXPORT_SYMBOL_GPL(nfs_write_inode);
1925 1919
1926/* 1920/*
1921 * Wrapper for filemap_write_and_wait_range()
1922 *
1923 * Needed for pNFS in order to ensure data becomes visible to the
1924 * client.
1925 */
1926int nfs_filemap_write_and_wait_range(struct address_space *mapping,
1927 loff_t lstart, loff_t lend)
1928{
1929 int ret;
1930
1931 ret = filemap_write_and_wait_range(mapping, lstart, lend);
1932 if (ret == 0)
1933 ret = pnfs_sync_inode(mapping->host, true);
1934 return ret;
1935}
1936EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
1937
1938/*
1927 * flush the inode to disk. 1939 * flush the inode to disk.
1928 */ 1940 */
1929int nfs_wb_all(struct inode *inode) 1941int nfs_wb_all(struct inode *inode)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d71278c3c5bd..810124b33327 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -205,12 +205,12 @@ struct nfs_inode {
205#define NFS_INO_STALE (1) /* possible stale inode */ 205#define NFS_INO_STALE (1) /* possible stale inode */
206#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ 206#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
207#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ 207#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
208#define NFS_INO_FLUSHING (4) /* inode is flushing out data */
209#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ 208#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
210#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ 209#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */
211#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ 210#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
212#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ 211#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
213#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ 212#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
213#define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */
214 214
215static inline struct nfs_inode *NFS_I(const struct inode *inode) 215static inline struct nfs_inode *NFS_I(const struct inode *inode)
216{ 216{
@@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino
351extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); 351extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
352extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); 352extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
353extern int nfs_revalidate_mapping_rcu(struct inode *inode); 353extern int nfs_revalidate_mapping_rcu(struct inode *inode);
354extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping);
355extern int nfs_setattr(struct dentry *, struct iattr *); 354extern int nfs_setattr(struct dentry *, struct iattr *);
356extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); 355extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
357extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, 356extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c304a11b5b1a..82b81a1c2438 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1596,9 +1596,8 @@ struct nfs_rpc_ops {
1596 int (*have_delegation)(struct inode *, fmode_t); 1596 int (*have_delegation)(struct inode *, fmode_t);
1597 int (*return_delegation)(struct inode *); 1597 int (*return_delegation)(struct inode *);
1598 struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); 1598 struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *);
1599 struct nfs_client * 1599 struct nfs_client *(*init_client) (struct nfs_client *,
1600 (*init_client) (struct nfs_client *, const struct rpc_timeout *, 1600 const struct nfs_client_initdata *);
1601 const char *);
1602 void (*free_client) (struct nfs_client *); 1601 void (*free_client) (struct nfs_client *);
1603 struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); 1602 struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *);
1604 struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, 1603 struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *,
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 899791573a40..4ccf184e971f 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -37,7 +37,6 @@ struct rpcsec_gss_info;
37 37
38/* auth_cred ac_flags bits */ 38/* auth_cred ac_flags bits */
39enum { 39enum {
40 RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */
41 RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ 40 RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */
42 RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying 41 RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying
43 key will expire soon */ 42 key will expire soon */
@@ -82,6 +81,9 @@ struct rpc_cred {
82 81
83#define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 82#define RPCAUTH_CRED_MAGIC 0x0f4aa4f0
84 83
84/* rpc_auth au_flags */
85#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */
86
85/* 87/*
86 * Client authentication handle 88 * Client authentication handle
87 */ 89 */
@@ -107,6 +109,9 @@ struct rpc_auth {
107 /* per-flavor data */ 109 /* per-flavor data */
108}; 110};
109 111
112/* rpc_auth au_flags */
113#define RPCAUTH_AUTH_DATATOUCH 0x00000002
114
110struct rpc_auth_create_args { 115struct rpc_auth_create_args {
111 rpc_authflavor_t pseudoflavor; 116 rpc_authflavor_t pseudoflavor;
112 const char *target_name; 117 const char *target_name;
@@ -196,7 +201,7 @@ void rpcauth_destroy_credcache(struct rpc_auth *);
196void rpcauth_clear_credcache(struct rpc_cred_cache *); 201void rpcauth_clear_credcache(struct rpc_cred_cache *);
197int rpcauth_key_timeout_notify(struct rpc_auth *, 202int rpcauth_key_timeout_notify(struct rpc_auth *,
198 struct rpc_cred *); 203 struct rpc_cred *);
199bool rpcauth_cred_key_to_expire(struct rpc_cred *); 204bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *);
200char * rpcauth_stringify_acceptor(struct rpc_cred *); 205char * rpcauth_stringify_acceptor(struct rpc_cred *);
201 206
202static inline 207static inline
diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h
index 1f911ccb2a75..68ec78c1aa48 100644
--- a/include/linux/sunrpc/gss_api.h
+++ b/include/linux/sunrpc/gss_api.h
@@ -73,6 +73,7 @@ u32 gss_delete_sec_context(
73rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, 73rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop,
74 u32 service); 74 u32 service);
75u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); 75u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor);
76bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor);
76char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); 77char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service);
77 78
78struct pf_desc { 79struct pf_desc {
@@ -81,6 +82,7 @@ struct pf_desc {
81 u32 service; 82 u32 service;
82 char *name; 83 char *name;
83 char *auth_domain_name; 84 char *auth_domain_name;
85 bool datatouch;
84}; 86};
85 87
86/* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and 88/* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 05a1809c44d9..817af0b4385e 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -230,6 +230,10 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *,
230 struct rpc_task *); 230 struct rpc_task *);
231void rpc_wake_up(struct rpc_wait_queue *); 231void rpc_wake_up(struct rpc_wait_queue *);
232struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); 232struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
233struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
234 struct rpc_wait_queue *,
235 bool (*)(struct rpc_task *, void *),
236 void *);
233struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, 237struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
234 bool (*)(struct rpc_task *, void *), 238 bool (*)(struct rpc_task *, void *),
235 void *); 239 void *);
@@ -247,6 +251,7 @@ void rpc_show_tasks(struct net *);
247int rpc_init_mempool(void); 251int rpc_init_mempool(void);
248void rpc_destroy_mempool(void); 252void rpc_destroy_mempool(void);
249extern struct workqueue_struct *rpciod_workqueue; 253extern struct workqueue_struct *rpciod_workqueue;
254extern struct workqueue_struct *xprtiod_workqueue;
250void rpc_prepare_task(struct rpc_task *task); 255void rpc_prepare_task(struct rpc_task *task);
251 256
252static inline int rpc_wait_for_completion_task(struct rpc_task *task) 257static inline int rpc_wait_for_completion_task(struct rpc_task *task)
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 0ece4ba06f06..bef3fb0abb8f 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -80,6 +80,7 @@ struct sock_xprt {
80#define TCP_RPC_REPLY (1UL << 6) 80#define TCP_RPC_REPLY (1UL << 6)
81 81
82#define XPRT_SOCK_CONNECTING 1U 82#define XPRT_SOCK_CONNECTING 1U
83#define XPRT_SOCK_DATA_READY (2)
83 84
84#endif /* __KERNEL__ */ 85#endif /* __KERNEL__ */
85 86
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 040ff627c18a..a7e42f9a405c 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
51 ret = kstrtoul(val, 0, &num); 51 ret = kstrtoul(val, 0, &num);
52 if (ret == -EINVAL) 52 if (ret == -EINVAL)
53 goto out_inval; 53 goto out_inval;
54 nbits = fls(num); 54 nbits = fls(num - 1);
55 if (num > (1U << nbits))
56 nbits++;
57 if (nbits > MAX_HASHTABLE_BITS || nbits < 2) 55 if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
58 goto out_inval; 56 goto out_inval;
59 *(unsigned int *)kp->arg = nbits; 57 *(unsigned int *)kp->arg = nbits;
@@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
359EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); 357EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
360 358
361bool 359bool
362rpcauth_cred_key_to_expire(struct rpc_cred *cred) 360rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
363{ 361{
362 if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
363 return false;
364 if (!cred->cr_ops->crkey_to_expire) 364 if (!cred->cr_ops->crkey_to_expire)
365 return false; 365 return false;
366 return cred->cr_ops->crkey_to_expire(cred); 366 return cred->cr_ops->crkey_to_expire(cred);
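
The param_set_hashtbl_sz() hunk above replaces the fls(num)-then-maybe-increment rounding with fls(num - 1), the usual round-up-to-a-power-of-two idiom; the old form doubled the table whenever the requested size was already an exact power of two. A quick user-space check of the two formulas (fls() is open-coded here because it is a kernel helper):

    #include <stdio.h>

    /* Same semantics as the kernel's fls(): highest set bit, 1-based; fls(0) == 0. */
    static int fls_(unsigned int x)
    {
            int r = 0;

            while (x) {
                    x >>= 1;
                    r++;
            }
            return r;
    }

    static int old_nbits(unsigned int num)
    {
            int nbits = fls_(num);

            if (num > (1U << nbits))
                    nbits++;
            return nbits;
    }

    static int new_nbits(unsigned int num)
    {
            return fls_(num - 1);
    }

    int main(void)
    {
            printf("num=64:  old=%d new=%d\n", old_nbits(64), new_nbits(64));   /* 7 vs 6 */
            printf("num=100: old=%d new=%d\n", old_nbits(100), new_nbits(100)); /* 7 vs 7 */
            return 0;
    }
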
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 54dd3fdead54..168219535a34 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
224 224
225 225
226 /* Fast track for non crkey_timeout (no key) underlying credentials */ 226 /* Fast track for non crkey_timeout (no key) underlying credentials */
227 if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) 227 if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
228 return 0; 228 return 0;
229 229
230 /* Fast track for the normal case */ 230 /* Fast track for the normal case */
@@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
236 if (IS_ERR(tcred)) 236 if (IS_ERR(tcred))
237 return -EACCES; 237 return -EACCES;
238 238
239 if (!tcred->cr_ops->crkey_timeout) {
240 set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
241 ret = 0;
242 goto out_put;
243 }
244
245 /* Test for the almost error case */ 239 /* Test for the almost error case */
246 ret = tcred->cr_ops->crkey_timeout(tcred); 240 ret = tcred->cr_ops->crkey_timeout(tcred);
247 if (ret != 0) { 241 if (ret != 0) {
@@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
257 set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); 251 set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
258 } 252 }
259 253
260out_put:
261 put_rpccred(tcred); 254 put_rpccred(tcred);
262 return ret; 255 return ret;
263} 256}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index e64ae93d5b4f..23c8e7c39656 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
1015 auth = &gss_auth->rpc_auth; 1015 auth = &gss_auth->rpc_auth;
1016 auth->au_cslack = GSS_CRED_SLACK >> 2; 1016 auth->au_cslack = GSS_CRED_SLACK >> 2;
1017 auth->au_rslack = GSS_VERF_SLACK >> 2; 1017 auth->au_rslack = GSS_VERF_SLACK >> 2;
1018 auth->au_flags = 0;
1018 auth->au_ops = &authgss_ops; 1019 auth->au_ops = &authgss_ops;
1019 auth->au_flavor = flavor; 1020 auth->au_flavor = flavor;
1021 if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
1022 auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
1020 atomic_set(&auth->au_count, 1); 1023 atomic_set(&auth->au_count, 1);
1021 kref_init(&gss_auth->kref); 1024 kref_init(&gss_auth->kref);
1022 1025
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 65427492b1c9..60595835317a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
745 .qop = GSS_C_QOP_DEFAULT, 745 .qop = GSS_C_QOP_DEFAULT,
746 .service = RPC_GSS_SVC_INTEGRITY, 746 .service = RPC_GSS_SVC_INTEGRITY,
747 .name = "krb5i", 747 .name = "krb5i",
748 .datatouch = true,
748 }, 749 },
749 [2] = { 750 [2] = {
750 .pseudoflavor = RPC_AUTH_GSS_KRB5P, 751 .pseudoflavor = RPC_AUTH_GSS_KRB5P,
751 .qop = GSS_C_QOP_DEFAULT, 752 .qop = GSS_C_QOP_DEFAULT,
752 .service = RPC_GSS_SVC_PRIVACY, 753 .service = RPC_GSS_SVC_PRIVACY,
753 .name = "krb5p", 754 .name = "krb5p",
755 .datatouch = true,
754 }, 756 },
755}; 757};
756 758
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 7063d856a598..5fec3abbe19b 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
361} 361}
362EXPORT_SYMBOL(gss_pseudoflavor_to_service); 362EXPORT_SYMBOL(gss_pseudoflavor_to_service);
363 363
364bool
365gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
366{
367 int i;
368
369 for (i = 0; i < gm->gm_pf_num; i++) {
370 if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
371 return gm->gm_pfs[i].datatouch;
372 }
373 return false;
374}
375
364char * 376char *
365gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) 377gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
366{ 378{
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 8d9eb4d5ddd8..4d17376b2acb 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -115,6 +115,7 @@ static
115struct rpc_auth null_auth = { 115struct rpc_auth null_auth = {
116 .au_cslack = NUL_CALLSLACK, 116 .au_cslack = NUL_CALLSLACK,
117 .au_rslack = NUL_REPLYSLACK, 117 .au_rslack = NUL_REPLYSLACK,
118 .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
118 .au_ops = &authnull_ops, 119 .au_ops = &authnull_ops,
119 .au_flavor = RPC_AUTH_NULL, 120 .au_flavor = RPC_AUTH_NULL,
120 .au_count = ATOMIC_INIT(0), 121 .au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 9f65452b7cbc..a99278c984e8 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -228,6 +228,7 @@ static
228struct rpc_auth unix_auth = { 228struct rpc_auth unix_auth = {
229 .au_cslack = UNX_CALLSLACK, 229 .au_cslack = UNX_CALLSLACK,
230 .au_rslack = NUL_REPLYSLACK, 230 .au_rslack = NUL_REPLYSLACK,
231 .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
231 .au_ops = &authunix_ops, 232 .au_ops = &authunix_ops,
232 .au_flavor = RPC_AUTH_UNIX, 233 .au_flavor = RPC_AUTH_UNIX,
233 .au_count = ATOMIC_INIT(0), 234 .au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2808d550d273..cb49898a5a58 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
2577 kfree(data); 2577 kfree(data);
2578} 2578}
2579 2579
2580const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { 2580static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
2581 .rpc_call_done = rpc_cb_add_xprt_done, 2581 .rpc_call_done = rpc_cb_add_xprt_done,
2582 .rpc_release = rpc_cb_add_xprt_release, 2582 .rpc_release = rpc_cb_add_xprt_release,
2583}; 2583};
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index fcfd48d263f6..9ae588511aaf 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue;
54/* 54/*
55 * rpciod-related stuff 55 * rpciod-related stuff
56 */ 56 */
57struct workqueue_struct *rpciod_workqueue; 57struct workqueue_struct *rpciod_workqueue __read_mostly;
58struct workqueue_struct *xprtiod_workqueue __read_mostly;
58 59
59/* 60/*
60 * Disable the timer for a given RPC task. Should be called with 61 * Disable the timer for a given RPC task. Should be called with
@@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
329 * lockless RPC_IS_QUEUED() test) before we've had a chance to test 330 * lockless RPC_IS_QUEUED() test) before we've had a chance to test
330 * the RPC_TASK_RUNNING flag. 331 * the RPC_TASK_RUNNING flag.
331 */ 332 */
332static void rpc_make_runnable(struct rpc_task *task) 333static void rpc_make_runnable(struct workqueue_struct *wq,
334 struct rpc_task *task)
333{ 335{
334 bool need_wakeup = !rpc_test_and_set_running(task); 336 bool need_wakeup = !rpc_test_and_set_running(task);
335 337
@@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task)
338 return; 340 return;
339 if (RPC_IS_ASYNC(task)) { 341 if (RPC_IS_ASYNC(task)) {
340 INIT_WORK(&task->u.tk_work, rpc_async_schedule); 342 INIT_WORK(&task->u.tk_work, rpc_async_schedule);
341 queue_work(rpciod_workqueue, &task->u.tk_work); 343 queue_work(wq, &task->u.tk_work);
342 } else 344 } else
343 wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); 345 wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
344} 346}
@@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
407EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); 409EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
408 410
409/** 411/**
410 * __rpc_do_wake_up_task - wake up a single rpc_task 412 * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task
413 * @wq: workqueue on which to run task
411 * @queue: wait queue 414 * @queue: wait queue
412 * @task: task to be woken up 415 * @task: task to be woken up
413 * 416 *
414 * Caller must hold queue->lock, and have cleared the task queued flag. 417 * Caller must hold queue->lock, and have cleared the task queued flag.
415 */ 418 */
416static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) 419static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
420 struct rpc_wait_queue *queue,
421 struct rpc_task *task)
417{ 422{
418 dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", 423 dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
419 task->tk_pid, jiffies); 424 task->tk_pid, jiffies);
@@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
428 433
429 __rpc_remove_wait_queue(queue, task); 434 __rpc_remove_wait_queue(queue, task);
430 435
431 rpc_make_runnable(task); 436 rpc_make_runnable(wq, task);
432 437
433 dprintk("RPC: __rpc_wake_up_task done\n"); 438 dprintk("RPC: __rpc_wake_up_task done\n");
434} 439}
@@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
436/* 441/*
437 * Wake up a queued task while the queue lock is being held 442 * Wake up a queued task while the queue lock is being held
438 */ 443 */
439static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) 444static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
445 struct rpc_wait_queue *queue, struct rpc_task *task)
440{ 446{
441 if (RPC_IS_QUEUED(task)) { 447 if (RPC_IS_QUEUED(task)) {
442 smp_rmb(); 448 smp_rmb();
443 if (task->tk_waitqueue == queue) 449 if (task->tk_waitqueue == queue)
444 __rpc_do_wake_up_task(queue, task); 450 __rpc_do_wake_up_task_on_wq(wq, queue, task);
445 } 451 }
446} 452}
447 453
448/* 454/*
455 * Wake up a queued task while the queue lock is being held
456 */
457static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
458{
459 rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
460}
461
462/*
449 * Wake up a task on a specific queue 463 * Wake up a task on a specific queue
450 */ 464 */
451void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) 465void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
@@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
518/* 532/*
519 * Wake up the first task on the wait queue. 533 * Wake up the first task on the wait queue.
520 */ 534 */
521struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, 535struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
536 struct rpc_wait_queue *queue,
522 bool (*func)(struct rpc_task *, void *), void *data) 537 bool (*func)(struct rpc_task *, void *), void *data)
523{ 538{
524 struct rpc_task *task = NULL; 539 struct rpc_task *task = NULL;
@@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
529 task = __rpc_find_next_queued(queue); 544 task = __rpc_find_next_queued(queue);
530 if (task != NULL) { 545 if (task != NULL) {
531 if (func(task, data)) 546 if (func(task, data))
532 rpc_wake_up_task_queue_locked(queue, task); 547 rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
533 else 548 else
534 task = NULL; 549 task = NULL;
535 } 550 }
@@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
537 552
538 return task; 553 return task;
539} 554}
555
556/*
557 * Wake up the first task on the wait queue.
558 */
559struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
560 bool (*func)(struct rpc_task *, void *), void *data)
561{
562 return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data);
563}
540EXPORT_SYMBOL_GPL(rpc_wake_up_first); 564EXPORT_SYMBOL_GPL(rpc_wake_up_first);
541 565
542static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) 566static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
@@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task)
814 bool is_async = RPC_IS_ASYNC(task); 838 bool is_async = RPC_IS_ASYNC(task);
815 839
816 rpc_set_active(task); 840 rpc_set_active(task);
817 rpc_make_runnable(task); 841 rpc_make_runnable(rpciod_workqueue, task);
818 if (!is_async) 842 if (!is_async)
819 __rpc_execute(task); 843 __rpc_execute(task);
820} 844}
@@ -1071,10 +1095,22 @@ static int rpciod_start(void)
1071 * Create the rpciod thread and wait for it to start. 1095 * Create the rpciod thread and wait for it to start.
1072 */ 1096 */
1073 dprintk("RPC: creating workqueue rpciod\n"); 1097 dprintk("RPC: creating workqueue rpciod\n");
1074 /* Note: highpri because network receive is latency sensitive */ 1098 wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
1075 wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); 1099 if (!wq)
1100 goto out_failed;
1076 rpciod_workqueue = wq; 1101 rpciod_workqueue = wq;
1077 return rpciod_workqueue != NULL; 1102 /* Note: highpri because network receive is latency sensitive */
1103 wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1104 if (!wq)
1105 goto free_rpciod;
1106 xprtiod_workqueue = wq;
1107 return 1;
1108free_rpciod:
1109 wq = rpciod_workqueue;
1110 rpciod_workqueue = NULL;
1111 destroy_workqueue(wq);
1112out_failed:
1113 return 0;
1078} 1114}
1079 1115
1080static void rpciod_stop(void) 1116static void rpciod_stop(void)
@@ -1088,6 +1124,9 @@ static void rpciod_stop(void)
1088 wq = rpciod_workqueue; 1124 wq = rpciod_workqueue;
1089 rpciod_workqueue = NULL; 1125 rpciod_workqueue = NULL;
1090 destroy_workqueue(wq); 1126 destroy_workqueue(wq);
1127 wq = xprtiod_workqueue;
1128 xprtiod_workqueue = NULL;
1129 destroy_workqueue(wq);
1091} 1130}
1092 1131
1093void 1132void
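
The sched.c hunks above split the single rpciod workqueue into rpciod (RPC task execution) and a dedicated high-priority xprtiod queue for transport work, with rpciod_start() unwinding the first allocation if the second fails. A stripped-down model of that create-both-or-none startup pattern — the names and workqueue type are illustrative, not the kernel API:

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-ins for alloc_workqueue()/destroy_workqueue(). */
    struct workqueue { const char *name; };

    static struct workqueue *wq_create(const char *name)
    {
            struct workqueue *wq = malloc(sizeof(*wq));

            if (wq)
                    wq->name = name;
            return wq;
    }

    static void wq_destroy(struct workqueue *wq) { free(wq); }

    static struct workqueue *rpciod_wq, *xprtiod_wq;

    static int workqueues_start(void)
    {
            rpciod_wq = wq_create("rpciod");
            if (!rpciod_wq)
                    return 0;
            xprtiod_wq = wq_create("xprtiod");      /* transport work: latency sensitive */
            if (!xprtiod_wq) {
                    wq_destroy(rpciod_wq);          /* unwind: end up with both or neither */
                    rpciod_wq = NULL;
                    return 0;
            }
            return 1;
    }

    int main(void)
    {
            printf("started: %d\n", workqueues_start());
            return 0;
    }
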
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index cc9852897395..c5b0cb4f4056 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
1188 *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 1188 *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
1189 1189
1190 /* Encode reply */ 1190 /* Encode reply */
1191 if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { 1191 if (*statp == rpc_drop_reply ||
1192 test_bit(RQ_DROPME, &rqstp->rq_flags)) {
1192 if (procp->pc_release) 1193 if (procp->pc_release)
1193 procp->pc_release(rqstp, NULL, rqstp->rq_resp); 1194 procp->pc_release(rqstp, NULL, rqstp->rq_resp);
1194 goto dropit; 1195 goto dropit;
1195 } 1196 }
1197 if (*statp == rpc_autherr_badcred) {
1198 if (procp->pc_release)
1199 procp->pc_release(rqstp, NULL, rqstp->rq_resp);
1200 goto err_bad_auth;
1201 }
1196 if (*statp == rpc_success && 1202 if (*statp == rpc_success &&
1197 (xdr = procp->pc_encode) && 1203 (xdr = procp->pc_encode) &&
1198 !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { 1204 !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 216a1385718a..8313960cac52 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
220 clear_bit(XPRT_LOCKED, &xprt->state); 220 clear_bit(XPRT_LOCKED, &xprt->state);
221 smp_mb__after_atomic(); 221 smp_mb__after_atomic();
222 } else 222 } else
223 queue_work(rpciod_workqueue, &xprt->task_cleanup); 223 queue_work(xprtiod_workqueue, &xprt->task_cleanup);
224} 224}
225 225
226/* 226/*
@@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
295 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) 295 if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
296 return; 296 return;
297 297
298 if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) 298 if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
299 __xprt_lock_write_func, xprt))
299 return; 300 return;
300 xprt_clear_locked(xprt); 301 xprt_clear_locked(xprt);
301} 302}
@@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
324 return; 325 return;
325 if (RPCXPRT_CONGESTED(xprt)) 326 if (RPCXPRT_CONGESTED(xprt))
326 goto out_unlock; 327 goto out_unlock;
327 if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) 328 if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
329 __xprt_lock_write_cong_func, xprt))
328 return; 330 return;
329out_unlock: 331out_unlock:
330 xprt_clear_locked(xprt); 332 xprt_clear_locked(xprt);
@@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
645 set_bit(XPRT_CLOSE_WAIT, &xprt->state); 647 set_bit(XPRT_CLOSE_WAIT, &xprt->state);
646 /* Try to schedule an autoclose RPC call */ 648 /* Try to schedule an autoclose RPC call */
647 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) 649 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
648 queue_work(rpciod_workqueue, &xprt->task_cleanup); 650 queue_work(xprtiod_workqueue, &xprt->task_cleanup);
649 xprt_wake_pending_tasks(xprt, -EAGAIN); 651 xprt_wake_pending_tasks(xprt, -EAGAIN);
650 spin_unlock_bh(&xprt->transport_lock); 652 spin_unlock_bh(&xprt->transport_lock);
651} 653}
@@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
672 set_bit(XPRT_CLOSE_WAIT, &xprt->state); 674 set_bit(XPRT_CLOSE_WAIT, &xprt->state);
673 /* Try to schedule an autoclose RPC call */ 675 /* Try to schedule an autoclose RPC call */
674 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) 676 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
675 queue_work(rpciod_workqueue, &xprt->task_cleanup); 677 queue_work(xprtiod_workqueue, &xprt->task_cleanup);
676 xprt_wake_pending_tasks(xprt, -EAGAIN); 678 xprt_wake_pending_tasks(xprt, -EAGAIN);
677out: 679out:
678 spin_unlock_bh(&xprt->transport_lock); 680 spin_unlock_bh(&xprt->transport_lock);
@@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data)
689 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) 691 if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
690 goto out_abort; 692 goto out_abort;
691 spin_unlock(&xprt->transport_lock); 693 spin_unlock(&xprt->transport_lock);
692 queue_work(rpciod_workqueue, &xprt->task_cleanup); 694 queue_work(xprtiod_workqueue, &xprt->task_cleanup);
693 return; 695 return;
694out_abort: 696out_abort:
695 spin_unlock(&xprt->transport_lock); 697 spin_unlock(&xprt->transport_lock);
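The xprt.c hunks above retarget the transport's cleanup/autoclose work and the sending-queue wakeups from rpciod onto the new xprtiod queue; the surrounding pattern is unchanged: take the XPRT_LOCKED bit, and only the winner queues the cleanup work. A small sketch of that lock-bit-then-queue idiom, with hypothetical names (example_wq stands in for xprtiod_workqueue):

#include <linux/bitops.h>
#include <linux/workqueue.h>

/* Hypothetical transport with a lock bit guarding one cleanup work item. */
struct example_xprt {
	unsigned long state;
	struct work_struct cleanup;
};
#define EX_LOCKED 0

static struct workqueue_struct *example_wq;	/* stands in for xprtiod_workqueue */

/* Queue cleanup only if we won the lock bit; the work owns it until done. */
static void example_schedule_cleanup(struct example_xprt *xprt)
{
	if (test_and_set_bit(EX_LOCKED, &xprt->state) == 0)
		queue_work(example_wq, &xprt->cleanup);
}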
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index e7fd76975d86..66c9d63f4797 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
271 xprt_switch_find_xprt_t find_next) 271 xprt_switch_find_xprt_t find_next)
272{ 272{
273 struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); 273 struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
274 struct list_head *head;
275 274
276 if (xps == NULL) 275 if (xps == NULL)
277 return NULL; 276 return NULL;
278 head = &xps->xps_xprt_list; 277 return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
279 if (xps->xps_nxprts < 2) 278 &xpi->xpi_cursor,
280 return xprt_switch_find_first_entry(head); 279 find_next);
281 return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next);
282} 280}
283 281
284static 282static
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index dc9f3b513a05..ef19fa42c50f 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
2 2
3rpcrdma-y := transport.o rpc_rdma.o verbs.o \ 3rpcrdma-y := transport.o rpc_rdma.o verbs.o \
4 fmr_ops.o frwr_ops.o physical_ops.o \ 4 fmr_ops.o frwr_ops.o \
5 svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ 5 svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
6 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ 6 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
7 module.o 7 module.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 6326ebe8b595..21cb3b150b37 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -19,13 +19,6 @@
19 * verb (fmr_op_unmap). 19 * verb (fmr_op_unmap).
20 */ 20 */
21 21
22/* Transport recovery
23 *
24 * After a transport reconnect, fmr_op_map re-uses the MR already
25 * allocated for the RPC, but generates a fresh rkey then maps the
26 * MR again. This process is synchronous.
27 */
28
29#include "xprt_rdma.h" 22#include "xprt_rdma.h"
30 23
31#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 24#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -35,62 +28,132 @@
35/* Maximum scatter/gather per FMR */ 28/* Maximum scatter/gather per FMR */
36#define RPCRDMA_MAX_FMR_SGES (64) 29#define RPCRDMA_MAX_FMR_SGES (64)
37 30
38static struct workqueue_struct *fmr_recovery_wq; 31/* Access mode of externally registered pages */
39 32enum {
40#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) 33 RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
34 IB_ACCESS_REMOTE_READ,
35};
41 36
42int 37bool
43fmr_alloc_recovery_wq(void) 38fmr_is_supported(struct rpcrdma_ia *ia)
44{ 39{
45 fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); 40 if (!ia->ri_device->alloc_fmr) {
46 return !fmr_recovery_wq ? -ENOMEM : 0; 41 pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
42 ia->ri_device->name);
43 return false;
44 }
45 return true;
47} 46}
48 47
49void 48static int
50fmr_destroy_recovery_wq(void) 49fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
51{ 50{
52 struct workqueue_struct *wq; 51 static struct ib_fmr_attr fmr_attr = {
52 .max_pages = RPCRDMA_MAX_FMR_SGES,
53 .max_maps = 1,
54 .page_shift = PAGE_SHIFT
55 };
53 56
54 if (!fmr_recovery_wq) 57 mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
55 return; 58 sizeof(u64), GFP_KERNEL);
59 if (!mw->fmr.fm_physaddrs)
60 goto out_free;
56 61
57 wq = fmr_recovery_wq; 62 mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
58 fmr_recovery_wq = NULL; 63 sizeof(*mw->mw_sg), GFP_KERNEL);
59 destroy_workqueue(wq); 64 if (!mw->mw_sg)
65 goto out_free;
66
67 sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
68
69 mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
70 &fmr_attr);
71 if (IS_ERR(mw->fmr.fm_mr))
72 goto out_fmr_err;
73
74 return 0;
75
76out_fmr_err:
77 dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
78 PTR_ERR(mw->fmr.fm_mr));
79
80out_free:
81 kfree(mw->mw_sg);
82 kfree(mw->fmr.fm_physaddrs);
83 return -ENOMEM;
60} 84}
61 85
62static int 86static int
63__fmr_unmap(struct rpcrdma_mw *mw) 87__fmr_unmap(struct rpcrdma_mw *mw)
64{ 88{
65 LIST_HEAD(l); 89 LIST_HEAD(l);
90 int rc;
66 91
67 list_add(&mw->fmr.fmr->list, &l); 92 list_add(&mw->fmr.fm_mr->list, &l);
68 return ib_unmap_fmr(&l); 93 rc = ib_unmap_fmr(&l);
94 list_del_init(&mw->fmr.fm_mr->list);
95 return rc;
69} 96}
70 97
71/* Deferred reset of a single FMR. Generate a fresh rkey by
72 * replacing the MR. There's no recovery if this fails.
73 */
74static void 98static void
75__fmr_recovery_worker(struct work_struct *work) 99fmr_op_release_mr(struct rpcrdma_mw *r)
76{ 100{
77 struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, 101 LIST_HEAD(unmap_list);
78 mw_work); 102 int rc;
79 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
80 103
81 __fmr_unmap(mw); 104 /* Ensure MW is not on any rl_registered list */
82 rpcrdma_put_mw(r_xprt, mw); 105 if (!list_empty(&r->mw_list))
83 return; 106 list_del(&r->mw_list);
107
108 kfree(r->fmr.fm_physaddrs);
109 kfree(r->mw_sg);
110
111 /* In case this one was left mapped, try to unmap it
112 * to prevent dealloc_fmr from failing with EBUSY
113 */
114 rc = __fmr_unmap(r);
115 if (rc)
116 pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
117 r, rc);
118
119 rc = ib_dealloc_fmr(r->fmr.fm_mr);
120 if (rc)
121 pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
122 r, rc);
123
124 kfree(r);
84} 125}
85 126
86/* A broken MR was discovered in a context that can't sleep. 127/* Reset of a single FMR.
87 * Defer recovery to the recovery worker.
88 */ 128 */
89static void 129static void
90__fmr_queue_recovery(struct rpcrdma_mw *mw) 130fmr_op_recover_mr(struct rpcrdma_mw *mw)
91{ 131{
92 INIT_WORK(&mw->mw_work, __fmr_recovery_worker); 132 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
93 queue_work(fmr_recovery_wq, &mw->mw_work); 133 int rc;
134
135 /* ORDER: invalidate first */
136 rc = __fmr_unmap(mw);
137
138 /* ORDER: then DMA unmap */
139 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
140 mw->mw_sg, mw->mw_nents, mw->mw_dir);
141 if (rc)
142 goto out_release;
143
144 rpcrdma_put_mw(r_xprt, mw);
145 r_xprt->rx_stats.mrs_recovered++;
146 return;
147
148out_release:
149 pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
150 r_xprt->rx_stats.mrs_orphaned++;
151
152 spin_lock(&r_xprt->rx_buf.rb_mwlock);
153 list_del(&mw->mw_all);
154 spin_unlock(&r_xprt->rx_buf.rb_mwlock);
155
156 fmr_op_release_mr(mw);
94} 157}
95 158
96static int 159static int
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
112 RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); 175 RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
113} 176}
114 177
115static int
116fmr_op_init(struct rpcrdma_xprt *r_xprt)
117{
118 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
119 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
120 struct ib_fmr_attr fmr_attr = {
121 .max_pages = RPCRDMA_MAX_FMR_SGES,
122 .max_maps = 1,
123 .page_shift = PAGE_SHIFT
124 };
125 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
126 struct rpcrdma_mw *r;
127 int i, rc;
128
129 spin_lock_init(&buf->rb_mwlock);
130 INIT_LIST_HEAD(&buf->rb_mws);
131 INIT_LIST_HEAD(&buf->rb_all);
132
133 i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
134 i += 2; /* head + tail */
135 i *= buf->rb_max_requests; /* one set for each RPC slot */
136 dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
137
138 rc = -ENOMEM;
139 while (i--) {
140 r = kzalloc(sizeof(*r), GFP_KERNEL);
141 if (!r)
142 goto out;
143
144 r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
145 sizeof(u64), GFP_KERNEL);
146 if (!r->fmr.physaddrs)
147 goto out_free;
148
149 r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
150 if (IS_ERR(r->fmr.fmr))
151 goto out_fmr_err;
152
153 r->mw_xprt = r_xprt;
154 list_add(&r->mw_list, &buf->rb_mws);
155 list_add(&r->mw_all, &buf->rb_all);
156 }
157 return 0;
158
159out_fmr_err:
160 rc = PTR_ERR(r->fmr.fmr);
161 dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
162 kfree(r->fmr.physaddrs);
163out_free:
164 kfree(r);
165out:
166 return rc;
167}
168
169/* Use the ib_map_phys_fmr() verb to register a memory region 178/* Use the ib_map_phys_fmr() verb to register a memory region
170 * for remote access via RDMA READ or RDMA WRITE. 179 * for remote access via RDMA READ or RDMA WRITE.
171 */ 180 */
172static int 181static int
173fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 182fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
174 int nsegs, bool writing) 183 int nsegs, bool writing, struct rpcrdma_mw **out)
175{ 184{
176 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
177 struct ib_device *device = ia->ri_device;
178 enum dma_data_direction direction = rpcrdma_data_dir(writing);
179 struct rpcrdma_mr_seg *seg1 = seg; 185 struct rpcrdma_mr_seg *seg1 = seg;
180 int len, pageoff, i, rc; 186 int len, pageoff, i, rc;
181 struct rpcrdma_mw *mw; 187 struct rpcrdma_mw *mw;
188 u64 *dma_pages;
182 189
183 mw = seg1->rl_mw; 190 mw = rpcrdma_get_mw(r_xprt);
184 seg1->rl_mw = NULL; 191 if (!mw)
185 if (!mw) { 192 return -ENOBUFS;
186 mw = rpcrdma_get_mw(r_xprt);
187 if (!mw)
188 return -ENOMEM;
189 } else {
190 /* this is a retransmit; generate a fresh rkey */
191 rc = __fmr_unmap(mw);
192 if (rc)
193 return rc;
194 }
195 193
196 pageoff = offset_in_page(seg1->mr_offset); 194 pageoff = offset_in_page(seg1->mr_offset);
197 seg1->mr_offset -= pageoff; /* start of page */ 195 seg1->mr_offset -= pageoff; /* start of page */
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
200 if (nsegs > RPCRDMA_MAX_FMR_SGES) 198 if (nsegs > RPCRDMA_MAX_FMR_SGES)
201 nsegs = RPCRDMA_MAX_FMR_SGES; 199 nsegs = RPCRDMA_MAX_FMR_SGES;
202 for (i = 0; i < nsegs;) { 200 for (i = 0; i < nsegs;) {
203 rpcrdma_map_one(device, seg, direction); 201 if (seg->mr_page)
204 mw->fmr.physaddrs[i] = seg->mr_dma; 202 sg_set_page(&mw->mw_sg[i],
203 seg->mr_page,
204 seg->mr_len,
205 offset_in_page(seg->mr_offset));
206 else
207 sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
208 seg->mr_len);
205 len += seg->mr_len; 209 len += seg->mr_len;
206 ++seg; 210 ++seg;
207 ++i; 211 ++i;
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
210 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 214 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
211 break; 215 break;
212 } 216 }
213 217 mw->mw_nents = i;
214 rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, 218 mw->mw_dir = rpcrdma_data_dir(writing);
215 i, seg1->mr_dma); 219 if (i == 0)
220 goto out_dmamap_err;
221
222 if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
223 mw->mw_sg, mw->mw_nents, mw->mw_dir))
224 goto out_dmamap_err;
225
226 for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
227 dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
228 rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
229 dma_pages[0]);
216 if (rc) 230 if (rc)
217 goto out_maperr; 231 goto out_maperr;
218 232
219 seg1->rl_mw = mw; 233 mw->mw_handle = mw->fmr.fm_mr->rkey;
220 seg1->mr_rkey = mw->fmr.fmr->rkey; 234 mw->mw_length = len;
221 seg1->mr_base = seg1->mr_dma + pageoff; 235 mw->mw_offset = dma_pages[0] + pageoff;
222 seg1->mr_nsegs = i;
223 seg1->mr_len = len;
224 return i;
225 236
226out_maperr: 237 *out = mw;
227 dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", 238 return mw->mw_nents;
228 __func__, len, (unsigned long long)seg1->mr_dma,
229 pageoff, i, rc);
230 while (i--)
231 rpcrdma_unmap_one(device, --seg);
232 return rc;
233}
234 239
235static void 240out_dmamap_err:
236__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) 241 pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
237{ 242 mw->mw_sg, mw->mw_nents);
238 struct ib_device *device = r_xprt->rx_ia.ri_device; 243 rpcrdma_defer_mr_recovery(mw);
239 int nsegs = seg->mr_nsegs; 244 return -EIO;
240 245
241 while (nsegs--) 246out_maperr:
242 rpcrdma_unmap_one(device, seg++); 247 pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
248 len, (unsigned long long)dma_pages[0],
249 pageoff, mw->mw_nents, rc);
250 rpcrdma_defer_mr_recovery(mw);
251 return -EIO;
243} 252}
244 253
245/* Invalidate all memory regions that were registered for "req". 254/* Invalidate all memory regions that were registered for "req".
246 * 255 *
247 * Sleeps until it is safe for the host CPU to access the 256 * Sleeps until it is safe for the host CPU to access the
248 * previously mapped memory regions. 257 * previously mapped memory regions.
258 *
259 * Caller ensures that req->rl_registered is not empty.
249 */ 260 */
250static void 261static void
251fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 262fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
252{ 263{
253 struct rpcrdma_mr_seg *seg; 264 struct rpcrdma_mw *mw, *tmp;
254 unsigned int i, nchunks;
255 struct rpcrdma_mw *mw;
256 LIST_HEAD(unmap_list); 265 LIST_HEAD(unmap_list);
257 int rc; 266 int rc;
258 267
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
261 /* ORDER: Invalidate all of the req's MRs first 270 /* ORDER: Invalidate all of the req's MRs first
262 * 271 *
263 * ib_unmap_fmr() is slow, so use a single call instead 272 * ib_unmap_fmr() is slow, so use a single call instead
264 * of one call per mapped MR. 273 * of one call per mapped FMR.
265 */ 274 */
266 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 275 list_for_each_entry(mw, &req->rl_registered, mw_list)
267 seg = &req->rl_segments[i]; 276 list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
268 mw = seg->rl_mw;
269
270 list_add(&mw->fmr.fmr->list, &unmap_list);
271
272 i += seg->mr_nsegs;
273 }
274 rc = ib_unmap_fmr(&unmap_list); 277 rc = ib_unmap_fmr(&unmap_list);
275 if (rc) 278 if (rc)
276 pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); 279 goto out_reset;
277 280
278 /* ORDER: Now DMA unmap all of the req's MRs, and return 281 /* ORDER: Now DMA unmap all of the req's MRs, and return
279 * them to the free MW list. 282 * them to the free MW list.
280 */ 283 */
281 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 284 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
282 seg = &req->rl_segments[i]; 285 list_del_init(&mw->mw_list);
286 list_del_init(&mw->fmr.fm_mr->list);
287 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
288 mw->mw_sg, mw->mw_nents, mw->mw_dir);
289 rpcrdma_put_mw(r_xprt, mw);
290 }
283 291
284 __fmr_dma_unmap(r_xprt, seg); 292 return;
285 rpcrdma_put_mw(r_xprt, seg->rl_mw);
286 293
287 i += seg->mr_nsegs; 294out_reset:
288 seg->mr_nsegs = 0; 295 pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
289 seg->rl_mw = NULL;
290 }
291 296
292 req->rl_nchunks = 0; 297 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
298 list_del_init(&mw->fmr.fm_mr->list);
299 fmr_op_recover_mr(mw);
300 }
293} 301}
294 302
295/* Use a slow, safe mechanism to invalidate all memory regions 303/* Use a slow, safe mechanism to invalidate all memory regions
296 * that were registered for "req". 304 * that were registered for "req".
297 *
298 * In the asynchronous case, DMA unmapping occurs first here
299 * because the rpcrdma_mr_seg is released immediately after this
300 * call. It's contents won't be available in __fmr_dma_unmap later.
301 * FIXME.
302 */ 305 */
303static void 306static void
304fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 307fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
305 bool sync) 308 bool sync)
306{ 309{
307 struct rpcrdma_mr_seg *seg;
308 struct rpcrdma_mw *mw; 310 struct rpcrdma_mw *mw;
309 unsigned int i;
310
311 for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
312 seg = &req->rl_segments[i];
313 mw = seg->rl_mw;
314
315 if (sync) {
316 /* ORDER */
317 __fmr_unmap(mw);
318 __fmr_dma_unmap(r_xprt, seg);
319 rpcrdma_put_mw(r_xprt, mw);
320 } else {
321 __fmr_dma_unmap(r_xprt, seg);
322 __fmr_queue_recovery(mw);
323 }
324
325 i += seg->mr_nsegs;
326 seg->mr_nsegs = 0;
327 seg->rl_mw = NULL;
328 }
329}
330
331static void
332fmr_op_destroy(struct rpcrdma_buffer *buf)
333{
334 struct rpcrdma_mw *r;
335 int rc;
336
337 while (!list_empty(&buf->rb_all)) {
338 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
339 list_del(&r->mw_all);
340 kfree(r->fmr.physaddrs);
341 311
342 rc = ib_dealloc_fmr(r->fmr.fmr); 312 while (!list_empty(&req->rl_registered)) {
343 if (rc) 313 mw = list_first_entry(&req->rl_registered,
344 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", 314 struct rpcrdma_mw, mw_list);
345 __func__, rc); 315 list_del_init(&mw->mw_list);
346 316
347 kfree(r); 317 if (sync)
318 fmr_op_recover_mr(mw);
319 else
320 rpcrdma_defer_mr_recovery(mw);
348 } 321 }
349} 322}
350 323
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
352 .ro_map = fmr_op_map, 325 .ro_map = fmr_op_map,
353 .ro_unmap_sync = fmr_op_unmap_sync, 326 .ro_unmap_sync = fmr_op_unmap_sync,
354 .ro_unmap_safe = fmr_op_unmap_safe, 327 .ro_unmap_safe = fmr_op_unmap_safe,
328 .ro_recover_mr = fmr_op_recover_mr,
355 .ro_open = fmr_op_open, 329 .ro_open = fmr_op_open,
356 .ro_maxpages = fmr_op_maxpages, 330 .ro_maxpages = fmr_op_maxpages,
357 .ro_init = fmr_op_init, 331 .ro_init_mr = fmr_op_init_mr,
358 .ro_destroy = fmr_op_destroy, 332 .ro_release_mr = fmr_op_release_mr,
359 .ro_displayname = "fmr", 333 .ro_displayname = "fmr",
360}; 334};
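The fmr_ops.c rewrite above moves per-MR setup and teardown into ro_init_mr/ro_release_mr, tracks registered MWs on the request's rl_registered list rather than in rpcrdma_mr_seg, and maps payloads through a scatterlist: segments are packed into mw->mw_sg, DMA-mapped in one call, and the resulting DMA addresses handed to ib_map_phys_fmr(). A hedged sketch of that mapping shape, not the in-tree code; the device, FMR, page array and DMA direction are assumed to be supplied by the caller, and the segment limit is shrunk so the example can live on the stack:

#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

#define EXAMPLE_MAX_SGES 16	/* the in-tree limit is RPCRDMA_MAX_FMR_SGES (64) */

static int example_fmr_map(struct ib_device *dev, struct ib_fmr *fmr,
			   struct page **pages, int npages,
			   enum dma_data_direction dir)
{
	struct scatterlist sg[EXAMPLE_MAX_SGES];
	u64 dma_pages[EXAMPLE_MAX_SGES];
	int i, nents, rc;

	if (npages > EXAMPLE_MAX_SGES)
		npages = EXAMPLE_MAX_SGES;

	/* 1. Describe the payload as a scatterlist of whole pages. */
	sg_init_table(sg, npages);
	for (i = 0; i < npages; i++)
		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);

	/* 2. DMA-map the whole list in one call (cf. out_dmamap_err above). */
	nents = ib_dma_map_sg(dev, sg, npages, dir);
	if (!nents)
		return -EIO;

	/* 3. Feed the DMA addresses to the FMR, using the first as the iova. */
	for (i = 0; i < nents; i++)
		dma_pages[i] = sg_dma_address(&sg[i]);
	rc = ib_map_phys_fmr(fmr, dma_pages, nents, dma_pages[0]);
	if (rc)
		ib_dma_unmap_sg(dev, sg, npages, dir);
	return rc ? rc : nents;
}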
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c0947544babe..892b5e1d9b09 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -73,29 +73,71 @@
73# define RPCDBG_FACILITY RPCDBG_TRANS 73# define RPCDBG_FACILITY RPCDBG_TRANS
74#endif 74#endif
75 75
76static struct workqueue_struct *frwr_recovery_wq; 76bool
77 77frwr_is_supported(struct rpcrdma_ia *ia)
78#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) 78{
79 struct ib_device_attr *attrs = &ia->ri_device->attrs;
80
81 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
82 goto out_not_supported;
83 if (attrs->max_fast_reg_page_list_len == 0)
84 goto out_not_supported;
85 return true;
86
87out_not_supported:
88 pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
89 ia->ri_device->name);
90 return false;
91}
79 92
80int 93static int
81frwr_alloc_recovery_wq(void) 94frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
82{ 95{
83 frwr_recovery_wq = alloc_workqueue("frwr_recovery", 96 unsigned int depth = ia->ri_max_frmr_depth;
84 FRWR_RECOVERY_WQ_FLAGS, 0); 97 struct rpcrdma_frmr *f = &r->frmr;
85 return !frwr_recovery_wq ? -ENOMEM : 0; 98 int rc;
99
100 f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
101 if (IS_ERR(f->fr_mr))
102 goto out_mr_err;
103
104 r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
105 if (!r->mw_sg)
106 goto out_list_err;
107
108 sg_init_table(r->mw_sg, depth);
109 init_completion(&f->fr_linv_done);
110 return 0;
111
112out_mr_err:
113 rc = PTR_ERR(f->fr_mr);
114 dprintk("RPC: %s: ib_alloc_mr status %i\n",
115 __func__, rc);
116 return rc;
117
118out_list_err:
119 rc = -ENOMEM;
120 dprintk("RPC: %s: sg allocation failure\n",
121 __func__);
122 ib_dereg_mr(f->fr_mr);
123 return rc;
86} 124}
87 125
88void 126static void
89frwr_destroy_recovery_wq(void) 127frwr_op_release_mr(struct rpcrdma_mw *r)
90{ 128{
91 struct workqueue_struct *wq; 129 int rc;
92 130
93 if (!frwr_recovery_wq) 131 /* Ensure MW is not on any rl_registered list */
94 return; 132 if (!list_empty(&r->mw_list))
133 list_del(&r->mw_list);
95 134
96 wq = frwr_recovery_wq; 135 rc = ib_dereg_mr(r->frmr.fr_mr);
97 frwr_recovery_wq = NULL; 136 if (rc)
98 destroy_workqueue(wq); 137 pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
138 r, rc);
139 kfree(r->mw_sg);
140 kfree(r);
99} 141}
100 142
101static int 143static int
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
124 return 0; 166 return 0;
125} 167}
126 168
127static void 169/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
128__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
129{
130 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
131 struct rpcrdma_frmr *f = &mw->frmr;
132 int rc;
133
134 rc = __frwr_reset_mr(ia, mw);
135 ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
136 if (rc)
137 return;
138
139 rpcrdma_put_mw(r_xprt, mw);
140}
141
142/* Deferred reset of a single FRMR. Generate a fresh rkey by
143 * replacing the MR.
144 * 170 *
145 * There's no recovery if this fails. The FRMR is abandoned, but 171 * There's no recovery if this fails. The FRMR is abandoned, but
146 * remains in rb_all. It will be cleaned up when the transport is 172 * remains in rb_all. It will be cleaned up when the transport is
147 * destroyed. 173 * destroyed.
148 */ 174 */
149static void 175static void
150__frwr_recovery_worker(struct work_struct *work) 176frwr_op_recover_mr(struct rpcrdma_mw *mw)
151{
152 struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
153 mw_work);
154
155 __frwr_reset_and_unmap(r->mw_xprt, r);
156 return;
157}
158
159/* A broken MR was discovered in a context that can't sleep.
160 * Defer recovery to the recovery worker.
161 */
162static void
163__frwr_queue_recovery(struct rpcrdma_mw *r)
164{
165 INIT_WORK(&r->mw_work, __frwr_recovery_worker);
166 queue_work(frwr_recovery_wq, &r->mw_work);
167}
168
169static int
170__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
171 unsigned int depth)
172{ 177{
173 struct rpcrdma_frmr *f = &r->frmr; 178 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
179 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
174 int rc; 180 int rc;
175 181
176 f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); 182 rc = __frwr_reset_mr(ia, mw);
177 if (IS_ERR(f->fr_mr)) 183 ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
178 goto out_mr_err; 184 if (rc)
179 185 goto out_release;
180 f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
181 if (!f->fr_sg)
182 goto out_list_err;
183
184 sg_init_table(f->fr_sg, depth);
185
186 init_completion(&f->fr_linv_done);
187
188 return 0;
189 186
190out_mr_err: 187 rpcrdma_put_mw(r_xprt, mw);
191 rc = PTR_ERR(f->fr_mr); 188 r_xprt->rx_stats.mrs_recovered++;
192 dprintk("RPC: %s: ib_alloc_mr status %i\n", 189 return;
193 __func__, rc);
194 return rc;
195 190
196out_list_err: 191out_release:
197 rc = -ENOMEM; 192 pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
198 dprintk("RPC: %s: sg allocation failure\n", 193 r_xprt->rx_stats.mrs_orphaned++;
199 __func__);
200 ib_dereg_mr(f->fr_mr);
201 return rc;
202}
203 194
204static void 195 spin_lock(&r_xprt->rx_buf.rb_mwlock);
205__frwr_release(struct rpcrdma_mw *r) 196 list_del(&mw->mw_all);
206{ 197 spin_unlock(&r_xprt->rx_buf.rb_mwlock);
207 int rc;
208 198
209 rc = ib_dereg_mr(r->frmr.fr_mr); 199 frwr_op_release_mr(mw);
210 if (rc)
211 dprintk("RPC: %s: ib_dereg_mr status %i\n",
212 __func__, rc);
213 kfree(r->frmr.fr_sg);
214} 200}
215 201
216static int 202static int
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
346 complete_all(&frmr->fr_linv_done); 332 complete_all(&frmr->fr_linv_done);
347} 333}
348 334
349static int 335/* Post a REG_MR Work Request to register a memory region
350frwr_op_init(struct rpcrdma_xprt *r_xprt)
351{
352 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
353 struct ib_device *device = r_xprt->rx_ia.ri_device;
354 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
355 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
356 int i;
357
358 spin_lock_init(&buf->rb_mwlock);
359 INIT_LIST_HEAD(&buf->rb_mws);
360 INIT_LIST_HEAD(&buf->rb_all);
361
362 i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
363 i += 2; /* head + tail */
364 i *= buf->rb_max_requests; /* one set for each RPC slot */
365 dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
366
367 while (i--) {
368 struct rpcrdma_mw *r;
369 int rc;
370
371 r = kzalloc(sizeof(*r), GFP_KERNEL);
372 if (!r)
373 return -ENOMEM;
374
375 rc = __frwr_init(r, pd, device, depth);
376 if (rc) {
377 kfree(r);
378 return rc;
379 }
380
381 r->mw_xprt = r_xprt;
382 list_add(&r->mw_list, &buf->rb_mws);
383 list_add(&r->mw_all, &buf->rb_all);
384 }
385
386 return 0;
387}
388
389/* Post a FAST_REG Work Request to register a memory region
390 * for remote access via RDMA READ or RDMA WRITE. 336 * for remote access via RDMA READ or RDMA WRITE.
391 */ 337 */
392static int 338static int
393frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 339frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
394 int nsegs, bool writing) 340 int nsegs, bool writing, struct rpcrdma_mw **out)
395{ 341{
396 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 342 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
397 struct ib_device *device = ia->ri_device;
398 enum dma_data_direction direction = rpcrdma_data_dir(writing);
399 struct rpcrdma_mr_seg *seg1 = seg;
400 struct rpcrdma_mw *mw; 343 struct rpcrdma_mw *mw;
401 struct rpcrdma_frmr *frmr; 344 struct rpcrdma_frmr *frmr;
402 struct ib_mr *mr; 345 struct ib_mr *mr;
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
405 int rc, i, n, dma_nents; 348 int rc, i, n, dma_nents;
406 u8 key; 349 u8 key;
407 350
408 mw = seg1->rl_mw; 351 mw = NULL;
409 seg1->rl_mw = NULL;
410 do { 352 do {
411 if (mw) 353 if (mw)
412 __frwr_queue_recovery(mw); 354 rpcrdma_defer_mr_recovery(mw);
413 mw = rpcrdma_get_mw(r_xprt); 355 mw = rpcrdma_get_mw(r_xprt);
414 if (!mw) 356 if (!mw)
415 return -ENOMEM; 357 return -ENOBUFS;
416 } while (mw->frmr.fr_state != FRMR_IS_INVALID); 358 } while (mw->frmr.fr_state != FRMR_IS_INVALID);
417 frmr = &mw->frmr; 359 frmr = &mw->frmr;
418 frmr->fr_state = FRMR_IS_VALID; 360 frmr->fr_state = FRMR_IS_VALID;
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
421 363
422 if (nsegs > ia->ri_max_frmr_depth) 364 if (nsegs > ia->ri_max_frmr_depth)
423 nsegs = ia->ri_max_frmr_depth; 365 nsegs = ia->ri_max_frmr_depth;
424
425 for (i = 0; i < nsegs;) { 366 for (i = 0; i < nsegs;) {
426 if (seg->mr_page) 367 if (seg->mr_page)
427 sg_set_page(&frmr->fr_sg[i], 368 sg_set_page(&mw->mw_sg[i],
428 seg->mr_page, 369 seg->mr_page,
429 seg->mr_len, 370 seg->mr_len,
430 offset_in_page(seg->mr_offset)); 371 offset_in_page(seg->mr_offset));
431 else 372 else
432 sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, 373 sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
433 seg->mr_len); 374 seg->mr_len);
434 375
435 ++seg; 376 ++seg;
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
440 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 381 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
441 break; 382 break;
442 } 383 }
443 frmr->fr_nents = i; 384 mw->mw_nents = i;
444 frmr->fr_dir = direction; 385 mw->mw_dir = rpcrdma_data_dir(writing);
445 386 if (i == 0)
446 dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); 387 goto out_dmamap_err;
447 if (!dma_nents) {
448 pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
449 __func__, frmr->fr_sg, frmr->fr_nents);
450 return -ENOMEM;
451 }
452 388
453 n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); 389 dma_nents = ib_dma_map_sg(ia->ri_device,
454 if (unlikely(n != frmr->fr_nents)) { 390 mw->mw_sg, mw->mw_nents, mw->mw_dir);
455 pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", 391 if (!dma_nents)
456 __func__, frmr->fr_mr, n, frmr->fr_nents); 392 goto out_dmamap_err;
457 rc = n < 0 ? n : -EINVAL; 393
458 goto out_senderr; 394 n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
459 } 395 if (unlikely(n != mw->mw_nents))
396 goto out_mapmr_err;
460 397
461 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", 398 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
462 __func__, mw, frmr->fr_nents, mr->length); 399 __func__, mw, mw->mw_nents, mr->length);
463 400
464 key = (u8)(mr->rkey & 0x000000FF); 401 key = (u8)(mr->rkey & 0x000000FF);
465 ib_update_fast_reg_key(mr, ++key); 402 ib_update_fast_reg_key(mr, ++key);
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
481 if (rc) 418 if (rc)
482 goto out_senderr; 419 goto out_senderr;
483 420
484 seg1->rl_mw = mw; 421 mw->mw_handle = mr->rkey;
485 seg1->mr_rkey = mr->rkey; 422 mw->mw_length = mr->length;
486 seg1->mr_base = mr->iova; 423 mw->mw_offset = mr->iova;
487 seg1->mr_nsegs = frmr->fr_nents; 424
488 seg1->mr_len = mr->length; 425 *out = mw;
426 return mw->mw_nents;
489 427
490 return frmr->fr_nents; 428out_dmamap_err:
429 pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
430 mw->mw_sg, mw->mw_nents);
431 rpcrdma_defer_mr_recovery(mw);
432 return -EIO;
433
434out_mapmr_err:
435 pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
436 frmr->fr_mr, n, mw->mw_nents);
437 rpcrdma_defer_mr_recovery(mw);
438 return -EIO;
491 439
492out_senderr: 440out_senderr:
493 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); 441 pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
494 __frwr_queue_recovery(mw); 442 rpcrdma_defer_mr_recovery(mw);
495 return rc; 443 return -ENOTCONN;
496} 444}
497 445
498static struct ib_send_wr * 446static struct ib_send_wr *
499__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) 447__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
500{ 448{
501 struct rpcrdma_mw *mw = seg->rl_mw;
502 struct rpcrdma_frmr *f = &mw->frmr; 449 struct rpcrdma_frmr *f = &mw->frmr;
503 struct ib_send_wr *invalidate_wr; 450 struct ib_send_wr *invalidate_wr;
504 451
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
518 * 465 *
519 * Sleeps until it is safe for the host CPU to access the 466 * Sleeps until it is safe for the host CPU to access the
520 * previously mapped memory regions. 467 * previously mapped memory regions.
468 *
469 * Caller ensures that req->rl_registered is not empty.
521 */ 470 */
522static void 471static void
523frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 472frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
524{ 473{
525 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; 474 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
526 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 475 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
527 struct rpcrdma_mr_seg *seg; 476 struct rpcrdma_mw *mw, *tmp;
528 unsigned int i, nchunks;
529 struct rpcrdma_frmr *f; 477 struct rpcrdma_frmr *f;
530 struct rpcrdma_mw *mw;
531 int rc; 478 int rc;
532 479
533 dprintk("RPC: %s: req %p\n", __func__, req); 480 dprintk("RPC: %s: req %p\n", __func__, req);
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
537 * Chain the LOCAL_INV Work Requests and post them with 484 * Chain the LOCAL_INV Work Requests and post them with
538 * a single ib_post_send() call. 485 * a single ib_post_send() call.
539 */ 486 */
487 f = NULL;
540 invalidate_wrs = pos = prev = NULL; 488 invalidate_wrs = pos = prev = NULL;
541 seg = NULL; 489 list_for_each_entry(mw, &req->rl_registered, mw_list) {
542 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 490 pos = __frwr_prepare_linv_wr(mw);
543 seg = &req->rl_segments[i];
544
545 pos = __frwr_prepare_linv_wr(seg);
546 491
547 if (!invalidate_wrs) 492 if (!invalidate_wrs)
548 invalidate_wrs = pos; 493 invalidate_wrs = pos;
549 else 494 else
550 prev->next = pos; 495 prev->next = pos;
551 prev = pos; 496 prev = pos;
552 497 f = &mw->frmr;
553 i += seg->mr_nsegs;
554 } 498 }
555 f = &seg->rl_mw->frmr;
556 499
557 /* Strong send queue ordering guarantees that when the 500 /* Strong send queue ordering guarantees that when the
558 * last WR in the chain completes, all WRs in the chain 501 * last WR in the chain completes, all WRs in the chain
@@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
577 * them to the free MW list. 520 * them to the free MW list.
578 */ 521 */
579unmap: 522unmap:
580 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 523 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
581 seg = &req->rl_segments[i]; 524 list_del_init(&mw->mw_list);
582 mw = seg->rl_mw; 525 ib_dma_unmap_sg(ia->ri_device,
583 seg->rl_mw = NULL; 526 mw->mw_sg, mw->mw_nents, mw->mw_dir);
584
585 ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
586 f->fr_dir);
587 rpcrdma_put_mw(r_xprt, mw); 527 rpcrdma_put_mw(r_xprt, mw);
588
589 i += seg->mr_nsegs;
590 seg->mr_nsegs = 0;
591 } 528 }
592
593 req->rl_nchunks = 0;
594 return; 529 return;
595 530
596reset_mrs: 531reset_mrs:
597 pr_warn("%s: ib_post_send failed %i\n", __func__, rc); 532 pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
533 rdma_disconnect(ia->ri_id);
598 534
599 /* Find and reset the MRs in the LOCAL_INV WRs that did not 535 /* Find and reset the MRs in the LOCAL_INV WRs that did not
600 * get posted. This is synchronous, and slow. 536 * get posted. This is synchronous, and slow.
601 */ 537 */
602 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 538 list_for_each_entry(mw, &req->rl_registered, mw_list) {
603 seg = &req->rl_segments[i];
604 mw = seg->rl_mw;
605 f = &mw->frmr; 539 f = &mw->frmr;
606
607 if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { 540 if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
608 __frwr_reset_mr(ia, mw); 541 __frwr_reset_mr(ia, mw);
609 bad_wr = bad_wr->next; 542 bad_wr = bad_wr->next;
610 } 543 }
611
612 i += seg->mr_nsegs;
613 } 544 }
614 goto unmap; 545 goto unmap;
615} 546}
@@ -621,38 +552,17 @@ static void
621frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 552frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
622 bool sync) 553 bool sync)
623{ 554{
624 struct rpcrdma_mr_seg *seg;
625 struct rpcrdma_mw *mw; 555 struct rpcrdma_mw *mw;
626 unsigned int i;
627 556
628 for (i = 0; req->rl_nchunks; req->rl_nchunks--) { 557 while (!list_empty(&req->rl_registered)) {
629 seg = &req->rl_segments[i]; 558 mw = list_first_entry(&req->rl_registered,
630 mw = seg->rl_mw; 559 struct rpcrdma_mw, mw_list);
560 list_del_init(&mw->mw_list);
631 561
632 if (sync) 562 if (sync)
633 __frwr_reset_and_unmap(r_xprt, mw); 563 frwr_op_recover_mr(mw);
634 else 564 else
635 __frwr_queue_recovery(mw); 565 rpcrdma_defer_mr_recovery(mw);
636
637 i += seg->mr_nsegs;
638 seg->mr_nsegs = 0;
639 seg->rl_mw = NULL;
640 }
641}
642
643static void
644frwr_op_destroy(struct rpcrdma_buffer *buf)
645{
646 struct rpcrdma_mw *r;
647
648 /* Ensure stale MWs for "buf" are no longer in flight */
649 flush_workqueue(frwr_recovery_wq);
650
651 while (!list_empty(&buf->rb_all)) {
652 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
653 list_del(&r->mw_all);
654 __frwr_release(r);
655 kfree(r);
656 } 566 }
657} 567}
658 568
@@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
660 .ro_map = frwr_op_map, 570 .ro_map = frwr_op_map,
661 .ro_unmap_sync = frwr_op_unmap_sync, 571 .ro_unmap_sync = frwr_op_unmap_sync,
662 .ro_unmap_safe = frwr_op_unmap_safe, 572 .ro_unmap_safe = frwr_op_unmap_safe,
573 .ro_recover_mr = frwr_op_recover_mr,
663 .ro_open = frwr_op_open, 574 .ro_open = frwr_op_open,
664 .ro_maxpages = frwr_op_maxpages, 575 .ro_maxpages = frwr_op_maxpages,
665 .ro_init = frwr_op_init, 576 .ro_init_mr = frwr_op_init_mr,
666 .ro_destroy = frwr_op_destroy, 577 .ro_release_mr = frwr_op_release_mr,
667 .ro_displayname = "frwr", 578 .ro_displayname = "frwr",
668}; 579};
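frwr_ops.c gets the same restructuring, plus a capability probe (frwr_is_supported) so the transport can fall back to FMR when the device lacks fast-registration support. A brief sketch of that probe and of the per-MR allocation it gates; the names here are illustrative, not the in-tree helpers:

#include <rdma/ib_verbs.h>

/* Can this device handle fast-registration work requests at all? */
static bool example_frwr_usable(struct ib_device *dev)
{
	struct ib_device_attr *attrs = &dev->attrs;

	return (attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) &&
	       attrs->max_fast_reg_page_list_len > 0;
}

/* Allocate one fast-registration MR of the chosen depth (cf. frwr_op_init_mr). */
static struct ib_mr *example_frwr_alloc_mr(struct ib_pd *pd, unsigned int depth)
{
	return ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
}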
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
deleted file mode 100644
index 3750596cc432..000000000000
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ /dev/null
@@ -1,122 +0,0 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* No-op chunk preparation. All client memory is pre-registered.
7 * Sometimes referred to as ALLPHYSICAL mode.
8 *
9 * Physical registration is simple because all client memory is
10 * pre-registered and never deregistered. This mode is good for
11 * adapter bring up, but is considered not safe: the server is
12 * trusted not to abuse its access to client memory not involved
13 * in RDMA I/O.
14 */
15
16#include "xprt_rdma.h"
17
18#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
19# define RPCDBG_FACILITY RPCDBG_TRANS
20#endif
21
22static int
23physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
24 struct rpcrdma_create_data_internal *cdata)
25{
26 struct ib_mr *mr;
27
28 /* Obtain an rkey to use for RPC data payloads.
29 */
30 mr = ib_get_dma_mr(ia->ri_pd,
31 IB_ACCESS_LOCAL_WRITE |
32 IB_ACCESS_REMOTE_WRITE |
33 IB_ACCESS_REMOTE_READ);
34 if (IS_ERR(mr)) {
35 pr_err("%s: ib_get_dma_mr for failed with %lX\n",
36 __func__, PTR_ERR(mr));
37 return -ENOMEM;
38 }
39 ia->ri_dma_mr = mr;
40
41 rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
42 RPCRDMA_MAX_DATA_SEGS,
43 RPCRDMA_MAX_HDR_SEGS));
44 return 0;
45}
46
47/* PHYSICAL memory registration conveys one page per chunk segment.
48 */
49static size_t
50physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
51{
52 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
53 RPCRDMA_MAX_HDR_SEGS);
54}
55
56static int
57physical_op_init(struct rpcrdma_xprt *r_xprt)
58{
59 return 0;
60}
61
62/* The client's physical memory is already exposed for
63 * remote access via RDMA READ or RDMA WRITE.
64 */
65static int
66physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
67 int nsegs, bool writing)
68{
69 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
70
71 rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
72 seg->mr_rkey = ia->ri_dma_mr->rkey;
73 seg->mr_base = seg->mr_dma;
74 return 1;
75}
76
77/* DMA unmap all memory regions that were mapped for "req".
78 */
79static void
80physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
81{
82 struct ib_device *device = r_xprt->rx_ia.ri_device;
83 unsigned int i;
84
85 for (i = 0; req->rl_nchunks; --req->rl_nchunks)
86 rpcrdma_unmap_one(device, &req->rl_segments[i++]);
87}
88
89/* Use a slow, safe mechanism to invalidate all memory regions
90 * that were registered for "req".
91 *
92 * For physical memory registration, there is no good way to
93 * fence a single MR that has been advertised to the server. The
94 * client has already handed the server an R_key that cannot be
95 * invalidated and is shared by all MRs on this connection.
96 * Tearing down the PD might be the only safe choice, but it's
97 * not clear that a freshly acquired DMA R_key would be different
98 * than the one used by the PD that was just destroyed.
99 * FIXME.
100 */
101static void
102physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
103 bool sync)
104{
105 physical_op_unmap_sync(r_xprt, req);
106}
107
108static void
109physical_op_destroy(struct rpcrdma_buffer *buf)
110{
111}
112
113const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
114 .ro_map = physical_op_map,
115 .ro_unmap_sync = physical_op_unmap_sync,
116 .ro_unmap_safe = physical_op_unmap_safe,
117 .ro_open = physical_op_open,
118 .ro_maxpages = physical_op_maxpages,
119 .ro_init = physical_op_init,
120 .ro_destroy = physical_op_destroy,
121 .ro_displayname = "physical",
122};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 35a81096e83d..a47f170b20ef 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
196 * MR when they can. 196 * MR when they can.
197 */ 197 */
198static int 198static int
199rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 199rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
200 int n, int nsegs)
201{ 200{
202 size_t page_offset; 201 size_t page_offset;
203 u32 remaining; 202 u32 remaining;
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
206 base = vec->iov_base; 205 base = vec->iov_base;
207 page_offset = offset_in_page(base); 206 page_offset = offset_in_page(base);
208 remaining = vec->iov_len; 207 remaining = vec->iov_len;
209 while (remaining && n < nsegs) { 208 while (remaining && n < RPCRDMA_MAX_SEGS) {
210 seg[n].mr_page = NULL; 209 seg[n].mr_page = NULL;
211 seg[n].mr_offset = base; 210 seg[n].mr_offset = base;
212 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 211 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
230 229
231static int 230static int
232rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, 231rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
233 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) 232 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
234{ 233{
235 int len, n = 0, p; 234 int len, n, p, page_base;
236 int page_base;
237 struct page **ppages; 235 struct page **ppages;
238 236
237 n = 0;
239 if (pos == 0) { 238 if (pos == 0) {
240 n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); 239 n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
241 if (n == nsegs) 240 if (n == RPCRDMA_MAX_SEGS)
242 return -EIO; 241 goto out_overflow;
243 } 242 }
244 243
245 len = xdrbuf->page_len; 244 len = xdrbuf->page_len;
246 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 245 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
247 page_base = xdrbuf->page_base & ~PAGE_MASK; 246 page_base = xdrbuf->page_base & ~PAGE_MASK;
248 p = 0; 247 p = 0;
249 while (len && n < nsegs) { 248 while (len && n < RPCRDMA_MAX_SEGS) {
250 if (!ppages[p]) { 249 if (!ppages[p]) {
251 /* alloc the pagelist for receiving buffer */ 250 /* alloc the pagelist for receiving buffer */
252 ppages[p] = alloc_page(GFP_ATOMIC); 251 ppages[p] = alloc_page(GFP_ATOMIC);
253 if (!ppages[p]) 252 if (!ppages[p])
254 return -ENOMEM; 253 return -EAGAIN;
255 } 254 }
256 seg[n].mr_page = ppages[p]; 255 seg[n].mr_page = ppages[p];
257 seg[n].mr_offset = (void *)(unsigned long) page_base; 256 seg[n].mr_offset = (void *)(unsigned long) page_base;
258 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); 257 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
259 if (seg[n].mr_len > PAGE_SIZE) 258 if (seg[n].mr_len > PAGE_SIZE)
260 return -EIO; 259 goto out_overflow;
261 len -= seg[n].mr_len; 260 len -= seg[n].mr_len;
262 ++n; 261 ++n;
263 ++p; 262 ++p;
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
265 } 264 }
266 265
267 /* Message overflows the seg array */ 266 /* Message overflows the seg array */
268 if (len && n == nsegs) 267 if (len && n == RPCRDMA_MAX_SEGS)
269 return -EIO; 268 goto out_overflow;
270 269
271 /* When encoding the read list, the tail is always sent inline */ 270 /* When encoding the read list, the tail is always sent inline */
272 if (type == rpcrdma_readch) 271 if (type == rpcrdma_readch)
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
277 * xdr pad bytes, saving the server an RDMA operation. */ 276 * xdr pad bytes, saving the server an RDMA operation. */
278 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) 277 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
279 return n; 278 return n;
280 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); 279 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
281 if (n == nsegs) 280 if (n == RPCRDMA_MAX_SEGS)
282 return -EIO; 281 goto out_overflow;
283 } 282 }
284 283
285 return n; 284 return n;
285
286out_overflow:
287 pr_err("rpcrdma: segment array overflow\n");
288 return -EIO;
286} 289}
287 290
288static inline __be32 * 291static inline __be32 *
289xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) 292xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
290{ 293{
291 *iptr++ = cpu_to_be32(seg->mr_rkey); 294 *iptr++ = cpu_to_be32(mw->mw_handle);
292 *iptr++ = cpu_to_be32(seg->mr_len); 295 *iptr++ = cpu_to_be32(mw->mw_length);
293 return xdr_encode_hyper(iptr, seg->mr_base); 296 return xdr_encode_hyper(iptr, mw->mw_offset);
294} 297}
295 298
296/* XDR-encode the Read list. Supports encoding a list of read 299/* XDR-encode the Read list. Supports encoding a list of read
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
310 struct rpcrdma_req *req, struct rpc_rqst *rqst, 313 struct rpcrdma_req *req, struct rpc_rqst *rqst,
311 __be32 *iptr, enum rpcrdma_chunktype rtype) 314 __be32 *iptr, enum rpcrdma_chunktype rtype)
312{ 315{
313 struct rpcrdma_mr_seg *seg = req->rl_nextseg; 316 struct rpcrdma_mr_seg *seg;
317 struct rpcrdma_mw *mw;
314 unsigned int pos; 318 unsigned int pos;
315 int n, nsegs; 319 int n, nsegs;
316 320
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
322 pos = rqst->rq_snd_buf.head[0].iov_len; 326 pos = rqst->rq_snd_buf.head[0].iov_len;
323 if (rtype == rpcrdma_areadch) 327 if (rtype == rpcrdma_areadch)
324 pos = 0; 328 pos = 0;
325 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, 329 seg = req->rl_segments;
326 RPCRDMA_MAX_SEGS - req->rl_nchunks); 330 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
327 if (nsegs < 0) 331 if (nsegs < 0)
328 return ERR_PTR(nsegs); 332 return ERR_PTR(nsegs);
329 333
330 do { 334 do {
331 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); 335 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
332 if (n <= 0) 336 false, &mw);
337 if (n < 0)
333 return ERR_PTR(n); 338 return ERR_PTR(n);
339 list_add(&mw->mw_list, &req->rl_registered);
334 340
335 *iptr++ = xdr_one; /* item present */ 341 *iptr++ = xdr_one; /* item present */
336 342
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
338 * have the same "position". 344 * have the same "position".
339 */ 345 */
340 *iptr++ = cpu_to_be32(pos); 346 *iptr++ = cpu_to_be32(pos);
341 iptr = xdr_encode_rdma_segment(iptr, seg); 347 iptr = xdr_encode_rdma_segment(iptr, mw);
342 348
343 dprintk("RPC: %5u %s: read segment pos %u " 349 dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
344 "%d@0x%016llx:0x%08x (%s)\n",
345 rqst->rq_task->tk_pid, __func__, pos, 350 rqst->rq_task->tk_pid, __func__, pos,
346 seg->mr_len, (unsigned long long)seg->mr_base, 351 mw->mw_length, (unsigned long long)mw->mw_offset,
347 seg->mr_rkey, n < nsegs ? "more" : "last"); 352 mw->mw_handle, n < nsegs ? "more" : "last");
348 353
349 r_xprt->rx_stats.read_chunk_count++; 354 r_xprt->rx_stats.read_chunk_count++;
350 req->rl_nchunks++;
351 seg += n; 355 seg += n;
352 nsegs -= n; 356 nsegs -= n;
353 } while (nsegs); 357 } while (nsegs);
354 req->rl_nextseg = seg;
355 358
356 /* Finish Read list */ 359 /* Finish Read list */
357 *iptr++ = xdr_zero; /* Next item not present */ 360 *iptr++ = xdr_zero; /* Next item not present */
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
375 struct rpc_rqst *rqst, __be32 *iptr, 378 struct rpc_rqst *rqst, __be32 *iptr,
376 enum rpcrdma_chunktype wtype) 379 enum rpcrdma_chunktype wtype)
377{ 380{
378 struct rpcrdma_mr_seg *seg = req->rl_nextseg; 381 struct rpcrdma_mr_seg *seg;
382 struct rpcrdma_mw *mw;
379 int n, nsegs, nchunks; 383 int n, nsegs, nchunks;
380 __be32 *segcount; 384 __be32 *segcount;
381 385
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
384 return iptr; 388 return iptr;
385 } 389 }
386 390
391 seg = req->rl_segments;
387 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 392 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
388 rqst->rq_rcv_buf.head[0].iov_len, 393 rqst->rq_rcv_buf.head[0].iov_len,
389 wtype, seg, 394 wtype, seg);
390 RPCRDMA_MAX_SEGS - req->rl_nchunks);
391 if (nsegs < 0) 395 if (nsegs < 0)
392 return ERR_PTR(nsegs); 396 return ERR_PTR(nsegs);
393 397
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
396 400
397 nchunks = 0; 401 nchunks = 0;
398 do { 402 do {
399 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); 403 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
400 if (n <= 0) 404 true, &mw);
405 if (n < 0)
401 return ERR_PTR(n); 406 return ERR_PTR(n);
407 list_add(&mw->mw_list, &req->rl_registered);
402 408
403 iptr = xdr_encode_rdma_segment(iptr, seg); 409 iptr = xdr_encode_rdma_segment(iptr, mw);
404 410
405 dprintk("RPC: %5u %s: write segment " 411 dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
406 "%d@0x016%llx:0x%08x (%s)\n",
407 rqst->rq_task->tk_pid, __func__, 412 rqst->rq_task->tk_pid, __func__,
408 seg->mr_len, (unsigned long long)seg->mr_base, 413 mw->mw_length, (unsigned long long)mw->mw_offset,
409 seg->mr_rkey, n < nsegs ? "more" : "last"); 414 mw->mw_handle, n < nsegs ? "more" : "last");
410 415
411 r_xprt->rx_stats.write_chunk_count++; 416 r_xprt->rx_stats.write_chunk_count++;
412 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 417 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
413 req->rl_nchunks++;
414 nchunks++; 418 nchunks++;
415 seg += n; 419 seg += n;
416 nsegs -= n; 420 nsegs -= n;
417 } while (nsegs); 421 } while (nsegs);
418 req->rl_nextseg = seg;
419 422
420 /* Update count of segments in this Write chunk */ 423 /* Update count of segments in this Write chunk */
421 *segcount = cpu_to_be32(nchunks); 424 *segcount = cpu_to_be32(nchunks);
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
442 struct rpcrdma_req *req, struct rpc_rqst *rqst, 445 struct rpcrdma_req *req, struct rpc_rqst *rqst,
443 __be32 *iptr, enum rpcrdma_chunktype wtype) 446 __be32 *iptr, enum rpcrdma_chunktype wtype)
444{ 447{
445 struct rpcrdma_mr_seg *seg = req->rl_nextseg; 448 struct rpcrdma_mr_seg *seg;
449 struct rpcrdma_mw *mw;
446 int n, nsegs, nchunks; 450 int n, nsegs, nchunks;
447 __be32 *segcount; 451 __be32 *segcount;
448 452
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
451 return iptr; 455 return iptr;
452 } 456 }
453 457
454 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, 458 seg = req->rl_segments;
455 RPCRDMA_MAX_SEGS - req->rl_nchunks); 459 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
456 if (nsegs < 0) 460 if (nsegs < 0)
457 return ERR_PTR(nsegs); 461 return ERR_PTR(nsegs);
458 462
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
461 465
462 nchunks = 0; 466 nchunks = 0;
463 do { 467 do {
464 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); 468 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
465 if (n <= 0) 469 true, &mw);
470 if (n < 0)
466 return ERR_PTR(n); 471 return ERR_PTR(n);
472 list_add(&mw->mw_list, &req->rl_registered);
467 473
468 iptr = xdr_encode_rdma_segment(iptr, seg); 474 iptr = xdr_encode_rdma_segment(iptr, mw);
469 475
470 dprintk("RPC: %5u %s: reply segment " 476 dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
471 "%d@0x%016llx:0x%08x (%s)\n",
472 rqst->rq_task->tk_pid, __func__, 477 rqst->rq_task->tk_pid, __func__,
473 seg->mr_len, (unsigned long long)seg->mr_base, 478 mw->mw_length, (unsigned long long)mw->mw_offset,
474 seg->mr_rkey, n < nsegs ? "more" : "last"); 479 mw->mw_handle, n < nsegs ? "more" : "last");
475 480
476 r_xprt->rx_stats.reply_chunk_count++; 481 r_xprt->rx_stats.reply_chunk_count++;
477 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 482 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
478 req->rl_nchunks++;
479 nchunks++; 483 nchunks++;
480 seg += n; 484 seg += n;
481 nsegs -= n; 485 nsegs -= n;
482 } while (nsegs); 486 } while (nsegs);
483 req->rl_nextseg = seg;
484 487
485 /* Update count of segments in the Reply chunk */ 488 /* Update count of segments in the Reply chunk */
486 *segcount = cpu_to_be32(nchunks); 489 *segcount = cpu_to_be32(nchunks);
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
567 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 570 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
568 enum rpcrdma_chunktype rtype, wtype; 571 enum rpcrdma_chunktype rtype, wtype;
569 struct rpcrdma_msg *headerp; 572 struct rpcrdma_msg *headerp;
573 bool ddp_allowed;
570 ssize_t hdrlen; 574 ssize_t hdrlen;
571 size_t rpclen; 575 size_t rpclen;
572 __be32 *iptr; 576 __be32 *iptr;
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
583 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 587 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
584 headerp->rm_type = rdma_msg; 588 headerp->rm_type = rdma_msg;
585 589
590 /* When the ULP employs a GSS flavor that guarantees integrity
591 * or privacy, direct data placement of individual data items
592 * is not allowed.
593 */
594 ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
595 RPCAUTH_AUTH_DATATOUCH);
596
586 /* 597 /*
587 * Chunks needed for results? 598 * Chunks needed for results?
588 * 599 *
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
594 */ 605 */
595 if (rpcrdma_results_inline(r_xprt, rqst)) 606 if (rpcrdma_results_inline(r_xprt, rqst))
596 wtype = rpcrdma_noch; 607 wtype = rpcrdma_noch;
597 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 608 else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
598 wtype = rpcrdma_writech; 609 wtype = rpcrdma_writech;
599 else 610 else
600 wtype = rpcrdma_replych; 611 wtype = rpcrdma_replych;
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
617 rtype = rpcrdma_noch; 628 rtype = rpcrdma_noch;
618 rpcrdma_inline_pullup(rqst); 629 rpcrdma_inline_pullup(rqst);
619 rpclen = rqst->rq_svec[0].iov_len; 630 rpclen = rqst->rq_svec[0].iov_len;
620 } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 631 } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
621 rtype = rpcrdma_readch; 632 rtype = rpcrdma_readch;
622 rpclen = rqst->rq_svec[0].iov_len; 633 rpclen = rqst->rq_svec[0].iov_len;
623 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); 634 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
650 * send a Call message with a Position Zero Read chunk and a 661 * send a Call message with a Position Zero Read chunk and a
651 * regular Read chunk at the same time. 662 * regular Read chunk at the same time.
652 */ 663 */
653 req->rl_nchunks = 0;
654 req->rl_nextseg = req->rl_segments;
655 iptr = headerp->rm_body.rm_chunks; 664 iptr = headerp->rm_body.rm_chunks;
656 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); 665 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
657 if (IS_ERR(iptr)) 666 if (IS_ERR(iptr))
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
690out_overflow: 699out_overflow:
691 pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", 700 pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
692 hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); 701 hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
693 /* Terminate this RPC. Chunks registered above will be 702 iptr = ERR_PTR(-EIO);
694 * released by xprt_release -> xprt_rmda_free .
695 */
696 return -EIO;
697 703
698out_unmap: 704out_unmap:
699 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 705 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -705,15 +711,13 @@ out_unmap:
705 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) 711 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
706 */ 712 */
707static int 713static int
708rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) 714rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
709{ 715{
710 unsigned int i, total_len; 716 unsigned int i, total_len;
711 struct rpcrdma_write_chunk *cur_wchunk; 717 struct rpcrdma_write_chunk *cur_wchunk;
712 char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); 718 char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
713 719
714 i = be32_to_cpu(**iptrp); 720 i = be32_to_cpu(**iptrp);
715 if (i > max)
716 return -1;
717 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); 721 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
718 total_len = 0; 722 total_len = 0;
719 while (i--) { 723 while (i--) {
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
744 return total_len; 748 return total_len;
745} 749}
746 750
747/* 751/**
748 * Scatter inline received data back into provided iov's. 752 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
753 * @rqst: controlling RPC request
754 * @srcp: points to RPC message payload in receive buffer
755 * @copy_len: remaining length of receive buffer content
756 * @pad: Write chunk pad bytes needed (zero for pure inline)
757 *
758 * The upper layer has set the maximum number of bytes it can
759 * receive in each component of rq_rcv_buf. These values are set in
760 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
761 *
762 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
763 * many cases this function simply updates iov_base pointers in
764 * rq_rcv_buf to point directly to the received reply data, to
765 * avoid copying reply data.
766 *
767 * Returns the count of bytes which had to be memcopied.
749 */ 768 */
750static void 769static unsigned long
751rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) 770rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
752{ 771{
753 int i, npages, curlen, olen; 772 unsigned long fixup_copy_count;
773 int i, npages, curlen;
754 char *destp; 774 char *destp;
755 struct page **ppages; 775 struct page **ppages;
756 int page_base; 776 int page_base;
757 777
778 /* The head iovec is redirected to the RPC reply message
779 * in the receive buffer, to avoid a memcopy.
780 */
781 rqst->rq_rcv_buf.head[0].iov_base = srcp;
782 rqst->rq_private_buf.head[0].iov_base = srcp;
783
784 /* The contents of the receive buffer that follow
785 * head.iov_len bytes are copied into the page list.
786 */
758 curlen = rqst->rq_rcv_buf.head[0].iov_len; 787 curlen = rqst->rq_rcv_buf.head[0].iov_len;
759 if (curlen > copy_len) { /* write chunk header fixup */ 788 if (curlen > copy_len)
760 curlen = copy_len; 789 curlen = copy_len;
761 rqst->rq_rcv_buf.head[0].iov_len = curlen;
762 }
763
764 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", 790 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
765 __func__, srcp, copy_len, curlen); 791 __func__, srcp, copy_len, curlen);
766
767 /* Shift pointer for first receive segment only */
768 rqst->rq_rcv_buf.head[0].iov_base = srcp;
769 srcp += curlen; 792 srcp += curlen;
770 copy_len -= curlen; 793 copy_len -= curlen;
771 794
772 olen = copy_len;
773 i = 0;
774 rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
775 page_base = rqst->rq_rcv_buf.page_base; 795 page_base = rqst->rq_rcv_buf.page_base;
776 ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); 796 ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
777 page_base &= ~PAGE_MASK; 797 page_base &= ~PAGE_MASK;
778 798 fixup_copy_count = 0;
779 if (copy_len && rqst->rq_rcv_buf.page_len) { 799 if (copy_len && rqst->rq_rcv_buf.page_len) {
780 npages = PAGE_ALIGN(page_base + 800 int pagelist_len;
781 rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; 801
782 for (; i < npages; i++) { 802 pagelist_len = rqst->rq_rcv_buf.page_len;
803 if (pagelist_len > copy_len)
804 pagelist_len = copy_len;
805 npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
806 for (i = 0; i < npages; i++) {
783 curlen = PAGE_SIZE - page_base; 807 curlen = PAGE_SIZE - page_base;
784 if (curlen > copy_len) 808 if (curlen > pagelist_len)
785 curlen = copy_len; 809 curlen = pagelist_len;
810
786 dprintk("RPC: %s: page %d" 811 dprintk("RPC: %s: page %d"
787 " srcp 0x%p len %d curlen %d\n", 812 " srcp 0x%p len %d curlen %d\n",
788 __func__, i, srcp, copy_len, curlen); 813 __func__, i, srcp, copy_len, curlen);
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
792 kunmap_atomic(destp); 817 kunmap_atomic(destp);
793 srcp += curlen; 818 srcp += curlen;
794 copy_len -= curlen; 819 copy_len -= curlen;
795 if (copy_len == 0) 820 fixup_copy_count += curlen;
821 pagelist_len -= curlen;
822 if (!pagelist_len)
796 break; 823 break;
797 page_base = 0; 824 page_base = 0;
798 } 825 }
799 }
800 826
801 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { 827 /* Implicit padding for the last segment in a Write
802 curlen = copy_len; 828 * chunk is inserted inline at the front of the tail
803 if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) 829 * iovec. The upper layer ignores the content of
804 curlen = rqst->rq_rcv_buf.tail[0].iov_len; 830 * the pad. Simply ensure inline content in the tail
805 if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) 831 * that follows the Write chunk is properly aligned.
806 memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); 832 */
807 dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", 833 if (pad)
808 __func__, srcp, copy_len, curlen); 834 srcp -= pad;
809 rqst->rq_rcv_buf.tail[0].iov_len = curlen;
810 copy_len -= curlen; ++i;
811 } else
812 rqst->rq_rcv_buf.tail[0].iov_len = 0;
813
814 if (pad) {
815 /* implicit padding on terminal chunk */
816 unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
817 while (pad--)
818 p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
819 } 835 }
820 836
821 if (copy_len) 837 /* The tail iovec is redirected to the remaining data
822 dprintk("RPC: %s: %d bytes in" 838 * in the receive buffer, to avoid a memcopy.
823 " %d extra segments (%d lost)\n", 839 */
824 __func__, olen, i, copy_len); 840 if (copy_len || pad) {
841 rqst->rq_rcv_buf.tail[0].iov_base = srcp;
842 rqst->rq_private_buf.tail[0].iov_base = srcp;
843 }
825 844
826 /* TBD avoid a warning from call_decode() */ 845 return fixup_copy_count;
827 rqst->rq_private_buf = rqst->rq_rcv_buf;
828} 846}
829 847
830void 848void
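As a rough illustration of the accounting the rewritten rpcrdma_inline_fixup() now performs, the sketch below mirrors its head/page-list/tail split with plain sizes instead of real xdr_buf iovecs; only the page-list portion is memcopied, while head and tail are redirected in place (names here are illustrative):

#include <stddef.h>

struct fixup_result {
        size_t redirected;      /* bytes left in place (head + tail) */
        size_t copied;          /* bytes that had to be memcopied */
};

static struct fixup_result
inline_fixup_accounting(size_t head_len, size_t page_len, size_t copy_len)
{
        struct fixup_result r = { 0, 0 };
        size_t curlen;

        curlen = head_len < copy_len ? head_len : copy_len;
        r.redirected += curlen;         /* head iovec points at the receive buffer */
        copy_len -= curlen;

        curlen = page_len < copy_len ? page_len : copy_len;
        r.copied += curlen;             /* page list is copied page by page */
        copy_len -= curlen;

        r.redirected += copy_len;       /* remainder: tail iovec is redirected */
        return r;
}

The value returned in copied corresponds to what the reply handler now adds to rx_stats.fixup_copy_count.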
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
960 (headerp->rm_body.rm_chunks[1] == xdr_zero && 978 (headerp->rm_body.rm_chunks[1] == xdr_zero &&
961 headerp->rm_body.rm_chunks[2] != xdr_zero) || 979 headerp->rm_body.rm_chunks[2] != xdr_zero) ||
962 (headerp->rm_body.rm_chunks[1] != xdr_zero && 980 (headerp->rm_body.rm_chunks[1] != xdr_zero &&
963 req->rl_nchunks == 0)) 981 list_empty(&req->rl_registered)))
964 goto badheader; 982 goto badheader;
965 if (headerp->rm_body.rm_chunks[1] != xdr_zero) { 983 if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
966 /* count any expected write chunks in read reply */ 984 /* count any expected write chunks in read reply */
967 /* start at write chunk array count */ 985 /* start at write chunk array count */
968 iptr = &headerp->rm_body.rm_chunks[2]; 986 iptr = &headerp->rm_body.rm_chunks[2];
969 rdmalen = rpcrdma_count_chunks(rep, 987 rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
970 req->rl_nchunks, 1, &iptr);
971 /* check for validity, and no reply chunk after */ 988 /* check for validity, and no reply chunk after */
972 if (rdmalen < 0 || *iptr++ != xdr_zero) 989 if (rdmalen < 0 || *iptr++ != xdr_zero)
973 goto badheader; 990 goto badheader;
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
988 rep->rr_len -= RPCRDMA_HDRLEN_MIN; 1005 rep->rr_len -= RPCRDMA_HDRLEN_MIN;
989 status = rep->rr_len; 1006 status = rep->rr_len;
990 } 1007 }
991 /* Fix up the rpc results for upper layer */ 1008
992 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); 1009 r_xprt->rx_stats.fixup_copy_count +=
1010 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
1011 rdmalen);
993 break; 1012 break;
994 1013
995 case rdma_nomsg: 1014 case rdma_nomsg:
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
997 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 1016 if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
998 headerp->rm_body.rm_chunks[1] != xdr_zero || 1017 headerp->rm_body.rm_chunks[1] != xdr_zero ||
999 headerp->rm_body.rm_chunks[2] != xdr_one || 1018 headerp->rm_body.rm_chunks[2] != xdr_one ||
1000 req->rl_nchunks == 0) 1019 list_empty(&req->rl_registered))
1001 goto badheader; 1020 goto badheader;
1002 iptr = (__be32 *)((unsigned char *)headerp + 1021 iptr = (__be32 *)((unsigned char *)headerp +
1003 RPCRDMA_HDRLEN_MIN); 1022 RPCRDMA_HDRLEN_MIN);
1004 rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); 1023 rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
1005 if (rdmalen < 0) 1024 if (rdmalen < 0)
1006 goto badheader; 1025 goto badheader;
1007 r_xprt->rx_stats.total_rdma_reply += rdmalen; 1026 r_xprt->rx_stats.total_rdma_reply += rdmalen;
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1014 1033
1015badheader: 1034badheader:
1016 default: 1035 default:
1017 dprintk("%s: invalid rpcrdma reply header (type %d):" 1036 dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
1018 " chunks[012] == %d %d %d" 1037 rqst->rq_task->tk_pid, __func__,
1019 " expected chunks <= %d\n", 1038 be32_to_cpu(headerp->rm_type));
1020 __func__, be32_to_cpu(headerp->rm_type),
1021 headerp->rm_body.rm_chunks[0],
1022 headerp->rm_body.rm_chunks[1],
1023 headerp->rm_body.rm_chunks[2],
1024 req->rl_nchunks);
1025 status = -EIO; 1039 status = -EIO;
1026 r_xprt->rx_stats.bad_reply_count++; 1040 r_xprt->rx_stats.bad_reply_count++;
1027 break; 1041 break;
@@ -1035,7 +1049,7 @@ out:
1035 * control: waking the next RPC waits until this RPC has 1049 * control: waking the next RPC waits until this RPC has
1036 * relinquished all its Send Queue entries. 1050 * relinquished all its Send Queue entries.
1037 */ 1051 */
1038 if (req->rl_nchunks) 1052 if (!list_empty(&req->rl_registered))
1039 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); 1053 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
1040 1054
1041 spin_lock_bh(&xprt->transport_lock); 1055 spin_lock_bh(&xprt->transport_lock);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 99d2e5b72726..81f0e879f019 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -558,7 +558,6 @@ out_sendbuf:
558 558
559out_fail: 559out_fail:
560 rpcrdma_buffer_put(req); 560 rpcrdma_buffer_put(req);
561 r_xprt->rx_stats.failed_marshal_count++;
562 return NULL; 561 return NULL;
563} 562}
564 563
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
590 rpcrdma_buffer_put(req); 589 rpcrdma_buffer_put(req);
591} 590}
592 591
593/* 592/**
593 * xprt_rdma_send_request - marshal and send an RPC request
594 * @task: RPC task with an RPC message in rq_snd_buf
595 *
596 * Return values:
597 * 0: The request has been sent
598 * ENOTCONN: Caller needs to invoke connect logic then call again
599 * ENOBUFS: Call again later to send the request
 600 * EIO: A permanent error occurred. The request was not sent,
 601 * and should not be retried
602 *
594 * send_request invokes the meat of RPC RDMA. It must do the following: 603 * send_request invokes the meat of RPC RDMA. It must do the following:
604 *
595 * 1. Marshal the RPC request into an RPC RDMA request, which means 605 * 1. Marshal the RPC request into an RPC RDMA request, which means
596 * putting a header in front of data, and creating IOVs for RDMA 606 * putting a header in front of data, and creating IOVs for RDMA
597 * from those in the request. 607 * from those in the request.
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
600 * the request (rpcrdma_ep_post). 610 * the request (rpcrdma_ep_post).
601 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). 611 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
602 */ 612 */
603
604static int 613static int
605xprt_rdma_send_request(struct rpc_task *task) 614xprt_rdma_send_request(struct rpc_task *task)
606{ 615{
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
610 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 619 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
611 int rc = 0; 620 int rc = 0;
612 621
622 /* On retransmit, remove any previously registered chunks */
623 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
624
613 rc = rpcrdma_marshal_req(rqst); 625 rc = rpcrdma_marshal_req(rqst);
614 if (rc < 0) 626 if (rc < 0)
615 goto failed_marshal; 627 goto failed_marshal;
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
630 return 0; 642 return 0;
631 643
632failed_marshal: 644failed_marshal:
633 r_xprt->rx_stats.failed_marshal_count++;
634 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", 645 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
635 __func__, rc); 646 __func__, rc);
636 if (rc == -EIO) 647 if (rc == -EIO)
637 return -EIO; 648 r_xprt->rx_stats.failed_marshal_count++;
649 if (rc != -ENOTCONN)
650 return rc;
638drop_connection: 651drop_connection:
639 xprt_disconnect_done(xprt); 652 xprt_disconnect_done(xprt);
640 return -ENOTCONN; /* implies disconnect */ 653 return -ENOTCONN; /* implies disconnect */
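The failed_marshal path now distinguishes permanent from transient errors. A simplified sketch of that dispatch, with the stats counter and disconnect hook passed in so it stays self-contained, might look like this:

#include <errno.h>

static int handle_marshal_failure(int rc, unsigned long *failed_marshal_count,
                                  void (*disconnect)(void))
{
        if (rc == -EIO)
                (*failed_marshal_count)++;      /* permanent: count it, give up */
        if (rc != -ENOTCONN)
                return rc;                      /* -EIO, -ENOBUFS, ...: back to the RPC layer */
        disconnect();                           /* caller reconnects before retrying */
        return -ENOTCONN;
}

So -ENOBUFS now propagates up for a later retry instead of tearing down the connection, matching the return values documented above for xprt_rdma_send_request().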
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
660 xprt->stat.bad_xids, 673 xprt->stat.bad_xids,
661 xprt->stat.req_u, 674 xprt->stat.req_u,
662 xprt->stat.bklog_u); 675 xprt->stat.bklog_u);
663 seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", 676 seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
664 r_xprt->rx_stats.read_chunk_count, 677 r_xprt->rx_stats.read_chunk_count,
665 r_xprt->rx_stats.write_chunk_count, 678 r_xprt->rx_stats.write_chunk_count,
666 r_xprt->rx_stats.reply_chunk_count, 679 r_xprt->rx_stats.reply_chunk_count,
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
672 r_xprt->rx_stats.failed_marshal_count, 685 r_xprt->rx_stats.failed_marshal_count,
673 r_xprt->rx_stats.bad_reply_count, 686 r_xprt->rx_stats.bad_reply_count,
674 r_xprt->rx_stats.nomsg_call_count); 687 r_xprt->rx_stats.nomsg_call_count);
688 seq_printf(seq, "%lu %lu %lu\n",
689 r_xprt->rx_stats.mrs_recovered,
690 r_xprt->rx_stats.mrs_orphaned,
691 r_xprt->rx_stats.mrs_allocated);
675} 692}
676 693
677static int 694static int
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
741 __func__, rc); 758 __func__, rc);
742 759
743 rpcrdma_destroy_wq(); 760 rpcrdma_destroy_wq();
744 frwr_destroy_recovery_wq();
745 761
746 rc = xprt_unregister_transport(&xprt_rdma_bc); 762 rc = xprt_unregister_transport(&xprt_rdma_bc);
747 if (rc) 763 if (rc)
@@ -753,20 +769,13 @@ int xprt_rdma_init(void)
753{ 769{
754 int rc; 770 int rc;
755 771
756 rc = frwr_alloc_recovery_wq();
757 if (rc)
758 return rc;
759
760 rc = rpcrdma_alloc_wq(); 772 rc = rpcrdma_alloc_wq();
761 if (rc) { 773 if (rc)
762 frwr_destroy_recovery_wq();
763 return rc; 774 return rc;
764 }
765 775
766 rc = xprt_register_transport(&xprt_rdma); 776 rc = xprt_register_transport(&xprt_rdma);
767 if (rc) { 777 if (rc) {
768 rpcrdma_destroy_wq(); 778 rpcrdma_destroy_wq();
769 frwr_destroy_recovery_wq();
770 return rc; 779 return rc;
771 } 780 }
772 781
@@ -774,7 +783,6 @@ int xprt_rdma_init(void)
774 if (rc) { 783 if (rc) {
775 xprt_unregister_transport(&xprt_rdma); 784 xprt_unregister_transport(&xprt_rdma);
776 rpcrdma_destroy_wq(); 785 rpcrdma_destroy_wq();
777 frwr_destroy_recovery_wq();
778 return rc; 786 return rc;
779 } 787 }
780 788
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b044d98a1370..536d0be3f61b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
379 struct rpcrdma_ia *ia = &xprt->rx_ia; 379 struct rpcrdma_ia *ia = &xprt->rx_ia;
380 int rc; 380 int rc;
381 381
382 ia->ri_dma_mr = NULL;
383
384 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 382 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
385 if (IS_ERR(ia->ri_id)) { 383 if (IS_ERR(ia->ri_id)) {
386 rc = PTR_ERR(ia->ri_id); 384 rc = PTR_ERR(ia->ri_id);
@@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
391 ia->ri_pd = ib_alloc_pd(ia->ri_device); 389 ia->ri_pd = ib_alloc_pd(ia->ri_device);
392 if (IS_ERR(ia->ri_pd)) { 390 if (IS_ERR(ia->ri_pd)) {
393 rc = PTR_ERR(ia->ri_pd); 391 rc = PTR_ERR(ia->ri_pd);
394 dprintk("RPC: %s: ib_alloc_pd() failed %i\n", 392 pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
395 __func__, rc);
396 goto out2; 393 goto out2;
397 } 394 }
398 395
399 if (memreg == RPCRDMA_FRMR) {
400 if (!(ia->ri_device->attrs.device_cap_flags &
401 IB_DEVICE_MEM_MGT_EXTENSIONS) ||
402 (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
403 dprintk("RPC: %s: FRMR registration "
404 "not supported by HCA\n", __func__);
405 memreg = RPCRDMA_MTHCAFMR;
406 }
407 }
408 if (memreg == RPCRDMA_MTHCAFMR) {
409 if (!ia->ri_device->alloc_fmr) {
410 dprintk("RPC: %s: MTHCAFMR registration "
411 "not supported by HCA\n", __func__);
412 rc = -EINVAL;
413 goto out3;
414 }
415 }
416
417 switch (memreg) { 396 switch (memreg) {
418 case RPCRDMA_FRMR: 397 case RPCRDMA_FRMR:
419 ia->ri_ops = &rpcrdma_frwr_memreg_ops; 398 if (frwr_is_supported(ia)) {
420 break; 399 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
421 case RPCRDMA_ALLPHYSICAL: 400 break;
422 ia->ri_ops = &rpcrdma_physical_memreg_ops; 401 }
423 break; 402 /*FALLTHROUGH*/
424 case RPCRDMA_MTHCAFMR: 403 case RPCRDMA_MTHCAFMR:
425 ia->ri_ops = &rpcrdma_fmr_memreg_ops; 404 if (fmr_is_supported(ia)) {
426 break; 405 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
406 break;
407 }
408 /*FALLTHROUGH*/
427 default: 409 default:
428 printk(KERN_ERR "RPC: Unsupported memory " 410 pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
429 "registration mode: %d\n", memreg); 411 memreg);
430 rc = -ENOMEM; 412 rc = -EINVAL;
431 goto out3; 413 goto out3;
432 } 414 }
433 dprintk("RPC: %s: memory registration strategy is '%s'\n",
434 __func__, ia->ri_ops->ro_displayname);
435 415
436 return 0; 416 return 0;
437 417
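A hedged sketch of the new selection logic: FRWR is preferred and quietly falls back to FMR when the device lacks fast-registration support, and everything else (including the removed ALLPHYSICAL mode) is now rejected. The *_supported() predicates below are stand-ins for the real capability probes:

#include <stdbool.h>
#include <errno.h>

enum memreg_mode { MODE_FRWR, MODE_FMR, MODE_OTHER };

static bool frwr_supported(void) { return false; }     /* device capability probe */
static bool fmr_supported(void)  { return true; }

static int select_memreg(enum memreg_mode requested, const char **ops_name)
{
        switch (requested) {
        case MODE_FRWR:
                if (frwr_supported()) {
                        *ops_name = "frwr";
                        return 0;
                }
                /* fall through */
        case MODE_FMR:
                if (fmr_supported()) {
                        *ops_name = "fmr";
                        return 0;
                }
                /* fall through */
        default:
                return -EINVAL;         /* unsupported registration mode */
        }
}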
@@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
585out2: 565out2:
586 ib_free_cq(sendcq); 566 ib_free_cq(sendcq);
587out1: 567out1:
588 if (ia->ri_dma_mr)
589 ib_dereg_mr(ia->ri_dma_mr);
590 return rc; 568 return rc;
591} 569}
592 570
@@ -600,8 +578,6 @@ out1:
600void 578void
601rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 579rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
602{ 580{
603 int rc;
604
605 dprintk("RPC: %s: entering, connected is %d\n", 581 dprintk("RPC: %s: entering, connected is %d\n",
606 __func__, ep->rep_connected); 582 __func__, ep->rep_connected);
607 583
@@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
615 591
616 ib_free_cq(ep->rep_attr.recv_cq); 592 ib_free_cq(ep->rep_attr.recv_cq);
617 ib_free_cq(ep->rep_attr.send_cq); 593 ib_free_cq(ep->rep_attr.send_cq);
618
619 if (ia->ri_dma_mr) {
620 rc = ib_dereg_mr(ia->ri_dma_mr);
621 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
622 __func__, rc);
623 }
624} 594}
625 595
626/* 596/*
@@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
777 ib_drain_qp(ia->ri_id->qp); 747 ib_drain_qp(ia->ri_id->qp);
778} 748}
779 749
750static void
751rpcrdma_mr_recovery_worker(struct work_struct *work)
752{
753 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
754 rb_recovery_worker.work);
755 struct rpcrdma_mw *mw;
756
757 spin_lock(&buf->rb_recovery_lock);
758 while (!list_empty(&buf->rb_stale_mrs)) {
759 mw = list_first_entry(&buf->rb_stale_mrs,
760 struct rpcrdma_mw, mw_list);
761 list_del_init(&mw->mw_list);
762 spin_unlock(&buf->rb_recovery_lock);
763
764 dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
765 mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
766
767 spin_lock(&buf->rb_recovery_lock);
768 }
769 spin_unlock(&buf->rb_recovery_lock);
770}
771
772void
773rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
774{
775 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
776 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
777
778 spin_lock(&buf->rb_recovery_lock);
779 list_add(&mw->mw_list, &buf->rb_stale_mrs);
780 spin_unlock(&buf->rb_recovery_lock);
781
782 schedule_delayed_work(&buf->rb_recovery_worker, 0);
783}
784
785static void
786rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
787{
788 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
789 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
790 unsigned int count;
791 LIST_HEAD(free);
792 LIST_HEAD(all);
793
794 for (count = 0; count < 32; count++) {
795 struct rpcrdma_mw *mw;
796 int rc;
797
798 mw = kzalloc(sizeof(*mw), GFP_KERNEL);
799 if (!mw)
800 break;
801
802 rc = ia->ri_ops->ro_init_mr(ia, mw);
803 if (rc) {
804 kfree(mw);
805 break;
806 }
807
808 mw->mw_xprt = r_xprt;
809
810 list_add(&mw->mw_list, &free);
811 list_add(&mw->mw_all, &all);
812 }
813
814 spin_lock(&buf->rb_mwlock);
815 list_splice(&free, &buf->rb_mws);
816 list_splice(&all, &buf->rb_all);
817 r_xprt->rx_stats.mrs_allocated += count;
818 spin_unlock(&buf->rb_mwlock);
819
820 dprintk("RPC: %s: created %u MRs\n", __func__, count);
821}
822
823static void
824rpcrdma_mr_refresh_worker(struct work_struct *work)
825{
826 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
827 rb_refresh_worker.work);
828 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
829 rx_buf);
830
831 rpcrdma_create_mrs(r_xprt);
832}
833
780struct rpcrdma_req * 834struct rpcrdma_req *
781rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) 835rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
782{ 836{
@@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
793 spin_unlock(&buffer->rb_reqslock); 847 spin_unlock(&buffer->rb_reqslock);
794 req->rl_cqe.done = rpcrdma_wc_send; 848 req->rl_cqe.done = rpcrdma_wc_send;
795 req->rl_buffer = &r_xprt->rx_buf; 849 req->rl_buffer = &r_xprt->rx_buf;
850 INIT_LIST_HEAD(&req->rl_registered);
796 return req; 851 return req;
797} 852}
798 853
@@ -832,17 +887,23 @@ int
832rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 887rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
833{ 888{
834 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 889 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
835 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
836 int i, rc; 890 int i, rc;
837 891
838 buf->rb_max_requests = r_xprt->rx_data.max_requests; 892 buf->rb_max_requests = r_xprt->rx_data.max_requests;
839 buf->rb_bc_srv_max_requests = 0; 893 buf->rb_bc_srv_max_requests = 0;
840 spin_lock_init(&buf->rb_lock);
841 atomic_set(&buf->rb_credits, 1); 894 atomic_set(&buf->rb_credits, 1);
895 spin_lock_init(&buf->rb_mwlock);
896 spin_lock_init(&buf->rb_lock);
897 spin_lock_init(&buf->rb_recovery_lock);
898 INIT_LIST_HEAD(&buf->rb_mws);
899 INIT_LIST_HEAD(&buf->rb_all);
900 INIT_LIST_HEAD(&buf->rb_stale_mrs);
901 INIT_DELAYED_WORK(&buf->rb_refresh_worker,
902 rpcrdma_mr_refresh_worker);
903 INIT_DELAYED_WORK(&buf->rb_recovery_worker,
904 rpcrdma_mr_recovery_worker);
842 905
843 rc = ia->ri_ops->ro_init(r_xprt); 906 rpcrdma_create_mrs(r_xprt);
844 if (rc)
845 goto out;
846 907
847 INIT_LIST_HEAD(&buf->rb_send_bufs); 908 INIT_LIST_HEAD(&buf->rb_send_bufs);
848 INIT_LIST_HEAD(&buf->rb_allreqs); 909 INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
862 } 923 }
863 924
864 INIT_LIST_HEAD(&buf->rb_recv_bufs); 925 INIT_LIST_HEAD(&buf->rb_recv_bufs);
865 for (i = 0; i < buf->rb_max_requests + 2; i++) { 926 for (i = 0; i < buf->rb_max_requests; i++) {
866 struct rpcrdma_rep *rep; 927 struct rpcrdma_rep *rep;
867 928
868 rep = rpcrdma_create_rep(r_xprt); 929 rep = rpcrdma_create_rep(r_xprt);
@@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
918 kfree(req); 979 kfree(req);
919} 980}
920 981
982static void
983rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
984{
985 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
986 rx_buf);
987 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
988 struct rpcrdma_mw *mw;
989 unsigned int count;
990
991 count = 0;
992 spin_lock(&buf->rb_mwlock);
993 while (!list_empty(&buf->rb_all)) {
994 mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
995 list_del(&mw->mw_all);
996
997 spin_unlock(&buf->rb_mwlock);
998 ia->ri_ops->ro_release_mr(mw);
999 count++;
1000 spin_lock(&buf->rb_mwlock);
1001 }
1002 spin_unlock(&buf->rb_mwlock);
1003 r_xprt->rx_stats.mrs_allocated = 0;
1004
1005 dprintk("RPC: %s: released %u MRs\n", __func__, count);
1006}
1007
921void 1008void
922rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1009rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
923{ 1010{
924 struct rpcrdma_ia *ia = rdmab_to_ia(buf); 1011 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
925 1012
1013 cancel_delayed_work_sync(&buf->rb_recovery_worker);
1014
926 while (!list_empty(&buf->rb_recv_bufs)) { 1015 while (!list_empty(&buf->rb_recv_bufs)) {
927 struct rpcrdma_rep *rep; 1016 struct rpcrdma_rep *rep;
928 1017
@@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
944 } 1033 }
945 spin_unlock(&buf->rb_reqslock); 1034 spin_unlock(&buf->rb_reqslock);
946 1035
947 ia->ri_ops->ro_destroy(buf); 1036 rpcrdma_destroy_mrs(buf);
948} 1037}
949 1038
950struct rpcrdma_mw * 1039struct rpcrdma_mw *
@@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
962 spin_unlock(&buf->rb_mwlock); 1051 spin_unlock(&buf->rb_mwlock);
963 1052
964 if (!mw) 1053 if (!mw)
965 pr_err("RPC: %s: no MWs available\n", __func__); 1054 goto out_nomws;
966 return mw; 1055 return mw;
1056
1057out_nomws:
1058 dprintk("RPC: %s: no MWs available\n", __func__);
1059 schedule_delayed_work(&buf->rb_refresh_worker, 0);
1060
1061 /* Allow the reply handler and refresh worker to run */
1062 cond_resched();
1063
1064 return NULL;
967} 1065}
968 1066
969void 1067void
@@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
978 1076
979/* 1077/*
980 * Get a set of request/reply buffers. 1078 * Get a set of request/reply buffers.
981 *
982 * Reply buffer (if available) is attached to send buffer upon return.
983 */ 1079 */
984struct rpcrdma_req * 1080struct rpcrdma_req *
985rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) 1081rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
998 1094
999out_reqbuf: 1095out_reqbuf:
1000 spin_unlock(&buffers->rb_lock); 1096 spin_unlock(&buffers->rb_lock);
1001 pr_warn("RPC: %s: out of request buffers\n", __func__); 1097 pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
1002 return NULL; 1098 return NULL;
1003out_repbuf: 1099out_repbuf:
1100 list_add(&req->rl_free, &buffers->rb_send_bufs);
1004 spin_unlock(&buffers->rb_lock); 1101 spin_unlock(&buffers->rb_lock);
1005 pr_warn("RPC: %s: out of reply buffers\n", __func__); 1102 pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
1006 req->rl_reply = NULL; 1103 return NULL;
1007 return req;
1008} 1104}
1009 1105
1010/* 1106/*
@@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1060 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1156 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1061 */ 1157 */
1062 1158
1063void
1064rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
1065{
1066 dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
1067 seg->mr_offset,
1068 (unsigned long long)seg->mr_dma, seg->mr_dmalen);
1069}
1070
1071/** 1159/**
1072 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers 1160 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1073 * @ia: controlling rpcrdma_ia 1161 * @ia: controlling rpcrdma_ia
@@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1150 if (rep) { 1238 if (rep) {
1151 rc = rpcrdma_ep_post_recv(ia, ep, rep); 1239 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1152 if (rc) 1240 if (rc)
1153 goto out; 1241 return rc;
1154 req->rl_reply = NULL; 1242 req->rl_reply = NULL;
1155 } 1243 }
1156 1244
@@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1175 1263
1176 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); 1264 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1177 if (rc) 1265 if (rc)
1178 dprintk("RPC: %s: ib_post_send returned %i\n", __func__, 1266 goto out_postsend_err;
1179 rc); 1267 return 0;
1180out: 1268
1181 return rc; 1269out_postsend_err:
1270 pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
1271 return -ENOTCONN;
1182} 1272}
1183 1273
1184/* 1274/*
@@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1203 DMA_BIDIRECTIONAL); 1293 DMA_BIDIRECTIONAL);
1204 1294
1205 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 1295 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1206
1207 if (rc) 1296 if (rc)
1208 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, 1297 goto out_postrecv;
1209 rc); 1298 return 0;
1210 return rc; 1299
1300out_postrecv:
1301 pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
1302 return -ENOTCONN;
1211} 1303}
1212 1304
1213/** 1305/**
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 95cdc66225ee..670fad57153a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -68,7 +68,6 @@ struct rpcrdma_ia {
68 struct ib_device *ri_device; 68 struct ib_device *ri_device;
69 struct rdma_cm_id *ri_id; 69 struct rdma_cm_id *ri_id;
70 struct ib_pd *ri_pd; 70 struct ib_pd *ri_pd;
71 struct ib_mr *ri_dma_mr;
72 struct completion ri_done; 71 struct completion ri_done;
73 int ri_async_rc; 72 int ri_async_rc;
74 unsigned int ri_max_frmr_depth; 73 unsigned int ri_max_frmr_depth;
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
172 * o recv buffer (posted to provider) 171 * o recv buffer (posted to provider)
173 * o ib_sge (also donated to provider) 172 * o ib_sge (also donated to provider)
174 * o status of reply (length, success or not) 173 * o status of reply (length, success or not)
175 * o bookkeeping state to get run by tasklet (list, etc) 174 * o bookkeeping state to get run by reply handler (list, etc)
176 * 175 *
177 * These are allocated during initialization, per-transport instance; 176 * These are allocated during initialization, per-transport instance.
178 * however, the tasklet execution list itself is global, as it should
179 * always be pretty short.
180 * 177 *
181 * N of these are associated with a transport instance, and stored in 178 * N of these are associated with a transport instance, and stored in
182 * struct rpcrdma_buffer. N is the max number of outstanding requests. 179 * struct rpcrdma_buffer. N is the max number of outstanding requests.
183 */ 180 */
184 181
185#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
186
187/* data segments + head/tail for Call + head/tail for Reply */
188#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
189
190struct rpcrdma_buffer;
191
192struct rpcrdma_rep { 182struct rpcrdma_rep {
193 struct ib_cqe rr_cqe; 183 struct ib_cqe rr_cqe;
194 unsigned int rr_len; 184 unsigned int rr_len;
@@ -221,9 +211,6 @@ enum rpcrdma_frmr_state {
221}; 211};
222 212
223struct rpcrdma_frmr { 213struct rpcrdma_frmr {
224 struct scatterlist *fr_sg;
225 int fr_nents;
226 enum dma_data_direction fr_dir;
227 struct ib_mr *fr_mr; 214 struct ib_mr *fr_mr;
228 struct ib_cqe fr_cqe; 215 struct ib_cqe fr_cqe;
229 enum rpcrdma_frmr_state fr_state; 216 enum rpcrdma_frmr_state fr_state;
@@ -235,18 +222,23 @@ struct rpcrdma_frmr {
235}; 222};
236 223
237struct rpcrdma_fmr { 224struct rpcrdma_fmr {
238 struct ib_fmr *fmr; 225 struct ib_fmr *fm_mr;
239 u64 *physaddrs; 226 u64 *fm_physaddrs;
240}; 227};
241 228
242struct rpcrdma_mw { 229struct rpcrdma_mw {
230 struct list_head mw_list;
231 struct scatterlist *mw_sg;
232 int mw_nents;
233 enum dma_data_direction mw_dir;
243 union { 234 union {
244 struct rpcrdma_fmr fmr; 235 struct rpcrdma_fmr fmr;
245 struct rpcrdma_frmr frmr; 236 struct rpcrdma_frmr frmr;
246 }; 237 };
247 struct work_struct mw_work;
248 struct rpcrdma_xprt *mw_xprt; 238 struct rpcrdma_xprt *mw_xprt;
249 struct list_head mw_list; 239 u32 mw_handle;
240 u32 mw_length;
241 u64 mw_offset;
250 struct list_head mw_all; 242 struct list_head mw_all;
251}; 243};
252 244
@@ -266,33 +258,30 @@ struct rpcrdma_mw {
266 * of iovs for send operations. The reason is that the iovs passed to 258 * of iovs for send operations. The reason is that the iovs passed to
267 * ib_post_{send,recv} must not be modified until the work request 259 * ib_post_{send,recv} must not be modified until the work request
268 * completes. 260 * completes.
269 *
270 * NOTES:
271 * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
272 * marshal. The number needed varies depending on the iov lists that
273 * are passed to us, the memory registration mode we are in, and if
274 * physical addressing is used, the layout.
275 */ 261 */
276 262
263/* Maximum number of page-sized "segments" per chunk list to be
264 * registered or invalidated. Must handle a Reply chunk:
265 */
266enum {
267 RPCRDMA_MAX_IOV_SEGS = 3,
268 RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
269 RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS +
270 RPCRDMA_MAX_IOV_SEGS,
271};
272
277struct rpcrdma_mr_seg { /* chunk descriptors */ 273struct rpcrdma_mr_seg { /* chunk descriptors */
278 struct rpcrdma_mw *rl_mw; /* registered MR */
279 u64 mr_base; /* registration result */
280 u32 mr_rkey; /* registration result */
281 u32 mr_len; /* length of chunk or segment */ 274 u32 mr_len; /* length of chunk or segment */
282 int mr_nsegs; /* number of segments in chunk or 0 */
283 enum dma_data_direction mr_dir; /* segment mapping direction */
284 dma_addr_t mr_dma; /* segment mapping address */
285 size_t mr_dmalen; /* segment mapping length */
286 struct page *mr_page; /* owning page, if any */ 275 struct page *mr_page; /* owning page, if any */
287 char *mr_offset; /* kva if no page, else offset */ 276 char *mr_offset; /* kva if no page, else offset */
288}; 277};
289 278
290#define RPCRDMA_MAX_IOVS (2) 279#define RPCRDMA_MAX_IOVS (2)
291 280
281struct rpcrdma_buffer;
292struct rpcrdma_req { 282struct rpcrdma_req {
293 struct list_head rl_free; 283 struct list_head rl_free;
294 unsigned int rl_niovs; 284 unsigned int rl_niovs;
295 unsigned int rl_nchunks;
296 unsigned int rl_connect_cookie; 285 unsigned int rl_connect_cookie;
297 struct rpc_task *rl_task; 286 struct rpc_task *rl_task;
298 struct rpcrdma_buffer *rl_buffer; 287 struct rpcrdma_buffer *rl_buffer;
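For a sense of scale, the new enum works out as follows, assuming 4 KiB pages; the +1 on RPCRDMA_MAX_DATA_SEGS plausibly allows for a payload that is not page-aligned at its start, and the three extra IOV segments presumably cover the xdr_buf head and tail buffers:

#include <stdio.h>

#define PAGE_SIZE 4096          /* assumption for this example */

enum {
        RPCRDMA_MAX_IOV_SEGS  = 3,
        RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
        RPCRDMA_MAX_SEGS      = RPCRDMA_MAX_DATA_SEGS + RPCRDMA_MAX_IOV_SEGS,
};

int main(void)
{
        /* 256 + 1 + 3 = 260 segments for a 1 MiB payload */
        printf("max segs per chunk list: %d\n", RPCRDMA_MAX_SEGS);
        return 0;
}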
@@ -300,12 +289,13 @@ struct rpcrdma_req {
300 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; 289 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
301 struct rpcrdma_regbuf *rl_rdmabuf; 290 struct rpcrdma_regbuf *rl_rdmabuf;
302 struct rpcrdma_regbuf *rl_sendbuf; 291 struct rpcrdma_regbuf *rl_sendbuf;
303 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
304 struct rpcrdma_mr_seg *rl_nextseg;
305 292
306 struct ib_cqe rl_cqe; 293 struct ib_cqe rl_cqe;
307 struct list_head rl_all; 294 struct list_head rl_all;
308 bool rl_backchannel; 295 bool rl_backchannel;
296
297 struct list_head rl_registered; /* registered segments */
298 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
309}; 299};
310 300
311static inline struct rpcrdma_req * 301static inline struct rpcrdma_req *
@@ -341,6 +331,11 @@ struct rpcrdma_buffer {
341 struct list_head rb_allreqs; 331 struct list_head rb_allreqs;
342 332
343 u32 rb_bc_max_requests; 333 u32 rb_bc_max_requests;
334
335 spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
336 struct list_head rb_stale_mrs;
337 struct delayed_work rb_recovery_worker;
338 struct delayed_work rb_refresh_worker;
344}; 339};
345#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) 340#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
346 341
@@ -387,6 +382,9 @@ struct rpcrdma_stats {
387 unsigned long bad_reply_count; 382 unsigned long bad_reply_count;
388 unsigned long nomsg_call_count; 383 unsigned long nomsg_call_count;
389 unsigned long bcall_count; 384 unsigned long bcall_count;
385 unsigned long mrs_recovered;
386 unsigned long mrs_orphaned;
387 unsigned long mrs_allocated;
390}; 388};
391 389
392/* 390/*
@@ -395,23 +393,25 @@ struct rpcrdma_stats {
395struct rpcrdma_xprt; 393struct rpcrdma_xprt;
396struct rpcrdma_memreg_ops { 394struct rpcrdma_memreg_ops {
397 int (*ro_map)(struct rpcrdma_xprt *, 395 int (*ro_map)(struct rpcrdma_xprt *,
398 struct rpcrdma_mr_seg *, int, bool); 396 struct rpcrdma_mr_seg *, int, bool,
397 struct rpcrdma_mw **);
399 void (*ro_unmap_sync)(struct rpcrdma_xprt *, 398 void (*ro_unmap_sync)(struct rpcrdma_xprt *,
400 struct rpcrdma_req *); 399 struct rpcrdma_req *);
401 void (*ro_unmap_safe)(struct rpcrdma_xprt *, 400 void (*ro_unmap_safe)(struct rpcrdma_xprt *,
402 struct rpcrdma_req *, bool); 401 struct rpcrdma_req *, bool);
402 void (*ro_recover_mr)(struct rpcrdma_mw *);
403 int (*ro_open)(struct rpcrdma_ia *, 403 int (*ro_open)(struct rpcrdma_ia *,
404 struct rpcrdma_ep *, 404 struct rpcrdma_ep *,
405 struct rpcrdma_create_data_internal *); 405 struct rpcrdma_create_data_internal *);
406 size_t (*ro_maxpages)(struct rpcrdma_xprt *); 406 size_t (*ro_maxpages)(struct rpcrdma_xprt *);
407 int (*ro_init)(struct rpcrdma_xprt *); 407 int (*ro_init_mr)(struct rpcrdma_ia *,
408 void (*ro_destroy)(struct rpcrdma_buffer *); 408 struct rpcrdma_mw *);
409 void (*ro_release_mr)(struct rpcrdma_mw *);
409 const char *ro_displayname; 410 const char *ro_displayname;
410}; 411};
411 412
412extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; 413extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
413extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; 414extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
414extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
415 415
416/* 416/*
417 * RPCRDMA transport -- encapsulates the structures above for 417 * RPCRDMA transport -- encapsulates the structures above for
@@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize;
446 */ 446 */
447int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); 447int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
448void rpcrdma_ia_close(struct rpcrdma_ia *); 448void rpcrdma_ia_close(struct rpcrdma_ia *);
449bool frwr_is_supported(struct rpcrdma_ia *);
450bool fmr_is_supported(struct rpcrdma_ia *);
449 451
450/* 452/*
451 * Endpoint calls - xprtrdma/verbs.c 453 * Endpoint calls - xprtrdma/verbs.c
@@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
477void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 479void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
478void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 480void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
479 481
482void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
483
480struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, 484struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
481 size_t, gfp_t); 485 size_t, gfp_t);
482void rpcrdma_free_regbuf(struct rpcrdma_ia *, 486void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
484 488
485int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); 489int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
486 490
487int frwr_alloc_recovery_wq(void);
488void frwr_destroy_recovery_wq(void);
489
490int rpcrdma_alloc_wq(void); 491int rpcrdma_alloc_wq(void);
491void rpcrdma_destroy_wq(void); 492void rpcrdma_destroy_wq(void);
492 493
@@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void);
494 * Wrappers for chunk registration, shared by read/write chunk code. 495 * Wrappers for chunk registration, shared by read/write chunk code.
495 */ 496 */
496 497
497void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
498
499static inline enum dma_data_direction 498static inline enum dma_data_direction
500rpcrdma_data_dir(bool writing) 499rpcrdma_data_dir(bool writing)
501{ 500{
502 return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; 501 return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
503} 502}
504 503
505static inline void
506rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
507 enum dma_data_direction direction)
508{
509 seg->mr_dir = direction;
510 seg->mr_dmalen = seg->mr_len;
511
512 if (seg->mr_page)
513 seg->mr_dma = ib_dma_map_page(device,
514 seg->mr_page, offset_in_page(seg->mr_offset),
515 seg->mr_dmalen, seg->mr_dir);
516 else
517 seg->mr_dma = ib_dma_map_single(device,
518 seg->mr_offset,
519 seg->mr_dmalen, seg->mr_dir);
520
521 if (ib_dma_mapping_error(device, seg->mr_dma))
522 rpcrdma_mapping_error(seg);
523}
524
525static inline void
526rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
527{
528 if (seg->mr_page)
529 ib_dma_unmap_page(device,
530 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
531 else
532 ib_dma_unmap_single(device,
533 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
534}
535
536/* 504/*
537 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c 505 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
538 */ 506 */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7e2b2fa189c3..111767ab124a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = {
124 .mode = 0644, 124 .mode = 0644,
125 .proc_handler = proc_dointvec_minmax, 125 .proc_handler = proc_dointvec_minmax,
126 .extra1 = &xprt_min_resvport_limit, 126 .extra1 = &xprt_min_resvport_limit,
127 .extra2 = &xprt_max_resvport_limit 127 .extra2 = &xprt_max_resvport
128 }, 128 },
129 { 129 {
130 .procname = "max_resvport", 130 .procname = "max_resvport",
@@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = {
132 .maxlen = sizeof(unsigned int), 132 .maxlen = sizeof(unsigned int),
133 .mode = 0644, 133 .mode = 0644,
134 .proc_handler = proc_dointvec_minmax, 134 .proc_handler = proc_dointvec_minmax,
135 .extra1 = &xprt_min_resvport_limit, 135 .extra1 = &xprt_min_resvport,
136 .extra2 = &xprt_max_resvport_limit 136 .extra2 = &xprt_max_resvport_limit
137 }, 137 },
138 { 138 {
@@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
642 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 642 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
643 struct xdr_buf *xdr = &req->rq_snd_buf; 643 struct xdr_buf *xdr = &req->rq_snd_buf;
644 bool zerocopy = true; 644 bool zerocopy = true;
645 bool vm_wait = false;
645 int status; 646 int status;
646 int sent; 647 int sent;
647 648
@@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
677 return 0; 678 return 0;
678 } 679 }
679 680
681 WARN_ON_ONCE(sent == 0 && status == 0);
682
683 if (status == -EAGAIN ) {
684 /*
685 * Return EAGAIN if we're sure we're hitting the
686 * socket send buffer limits.
687 */
688 if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
689 break;
690 /*
691 * Did we hit a memory allocation failure?
692 */
693 if (sent == 0) {
694 status = -ENOBUFS;
695 if (vm_wait)
696 break;
697 /* Retry, knowing now that we're below the
698 * socket send buffer limit
699 */
700 vm_wait = true;
701 }
702 continue;
703 }
680 if (status < 0) 704 if (status < 0)
681 break; 705 break;
682 if (sent == 0) { 706 vm_wait = false;
683 status = -EAGAIN;
684 break;
685 }
686 } 707 }
687 if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
688 status = -ENOBUFS;
689 708
690 switch (status) { 709 switch (status) {
691 case -ENOTSOCK: 710 case -ENOTSOCK:
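The reworked -EAGAIN handling in xs_tcp_send_request() separates a genuinely full socket send buffer (report -EAGAIN and wait for write space) from transient memory pressure (report -ENOBUFS, but allow one retry). A simplified sketch of that classification, with the SOCK_NOSPACE test and loop state passed in as plain values:

#include <errno.h>
#include <stdbool.h>

enum send_action { SEND_RETRY, SEND_STOP };

static enum send_action classify_eagain(bool sock_nospace, int sent,
                                        bool *vm_wait, int *status)
{
        if (sock_nospace)
                return SEND_STOP;       /* really out of send-buffer space: -EAGAIN */

        if (sent == 0) {
                *status = -ENOBUFS;     /* looks like an allocation failure */
                if (*vm_wait)
                        return SEND_STOP;
                *vm_wait = true;        /* retry once under memory pressure */
        }
        return SEND_RETRY;
}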
@@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
755 sk->sk_error_report = transport->old_error_report; 774 sk->sk_error_report = transport->old_error_report;
756} 775}
757 776
777static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
778{
779 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
780
781 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
782}
783
758static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) 784static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
759{ 785{
760 smp_mb__before_atomic(); 786 smp_mb__before_atomic();
761 clear_bit(XPRT_CLOSE_WAIT, &xprt->state); 787 clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
762 clear_bit(XPRT_CLOSING, &xprt->state); 788 clear_bit(XPRT_CLOSING, &xprt->state);
789 xs_sock_reset_state_flags(xprt);
763 smp_mb__after_atomic(); 790 smp_mb__after_atomic();
764} 791}
765 792
@@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport)
962 goto out; 989 goto out;
963 for (;;) { 990 for (;;) {
964 skb = skb_recv_datagram(sk, 0, 1, &err); 991 skb = skb_recv_datagram(sk, 0, 1, &err);
965 if (skb == NULL) 992 if (skb != NULL) {
993 xs_local_data_read_skb(&transport->xprt, sk, skb);
994 skb_free_datagram(sk, skb);
995 continue;
996 }
997 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
966 break; 998 break;
967 xs_local_data_read_skb(&transport->xprt, sk, skb);
968 skb_free_datagram(sk, skb);
969 } 999 }
970out: 1000out:
971 mutex_unlock(&transport->recv_mutex); 1001 mutex_unlock(&transport->recv_mutex);
@@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
1043 goto out; 1073 goto out;
1044 for (;;) { 1074 for (;;) {
1045 skb = skb_recv_datagram(sk, 0, 1, &err); 1075 skb = skb_recv_datagram(sk, 0, 1, &err);
1046 if (skb == NULL) 1076 if (skb != NULL) {
1077 xs_udp_data_read_skb(&transport->xprt, sk, skb);
1078 skb_free_datagram(sk, skb);
1079 continue;
1080 }
1081 if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1047 break; 1082 break;
1048 xs_udp_data_read_skb(&transport->xprt, sk, skb);
1049 skb_free_datagram(sk, skb);
1050 } 1083 }
1051out: 1084out:
1052 mutex_unlock(&transport->recv_mutex); 1085 mutex_unlock(&transport->recv_mutex);
@@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk)
1074 if (xprt != NULL) { 1107 if (xprt != NULL) {
1075 struct sock_xprt *transport = container_of(xprt, 1108 struct sock_xprt *transport = container_of(xprt,
1076 struct sock_xprt, xprt); 1109 struct sock_xprt, xprt);
1077 queue_work(rpciod_workqueue, &transport->recv_worker); 1110 transport->old_data_ready(sk);
1111 /* Any data means we had a useful conversation, so
1112 * then we don't need to delay the next reconnect
1113 */
1114 if (xprt->reestablish_timeout)
1115 xprt->reestablish_timeout = 0;
1116 if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1117 queue_work(xprtiod_workqueue, &transport->recv_worker);
1078 } 1118 }
1079 read_unlock_bh(&sk->sk_callback_lock); 1119 read_unlock_bh(&sk->sk_callback_lock);
1080} 1120}
@@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
1474 for (;;) { 1514 for (;;) {
1475 lock_sock(sk); 1515 lock_sock(sk);
1476 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); 1516 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
1477 release_sock(sk); 1517 if (read <= 0) {
1478 if (read <= 0) 1518 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
1479 break; 1519 release_sock(sk);
1480 total += read; 1520 if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
1521 break;
1522 } else {
1523 release_sock(sk);
1524 total += read;
1525 }
1481 rd_desc.count = 65536; 1526 rd_desc.count = 65536;
1482 } 1527 }
1483out: 1528out:
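Taken together, the data_ready and receive-worker changes implement a small flag handshake: the socket callback queues the worker only on a 0 -> 1 transition of XPRT_SOCK_DATA_READY, and the worker, once the socket runs dry, stops only if no new notification arrived in the meantime. A userspace analogue using C11 atomics in place of test_and_set_bit()/test_and_clear_bit():

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool data_ready;

/* sk_data_ready side: queue the receive worker only when the flag
 * was previously clear, avoiding redundant queue_work() calls. */
static bool should_queue_worker(void)
{
        return !atomic_exchange(&data_ready, true);
}

/* worker side, after the socket returned no data: stop only if no
 * new "data ready" notification arrived while we were reading. */
static bool should_stop_worker(void)
{
        return !atomic_exchange(&data_ready, false);
}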
@@ -1493,34 +1538,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work)
1493} 1538}
1494 1539
1495/** 1540/**
1496 * xs_tcp_data_ready - "data ready" callback for TCP sockets
1497 * @sk: socket with data to read
1498 *
1499 */
1500static void xs_tcp_data_ready(struct sock *sk)
1501{
1502 struct sock_xprt *transport;
1503 struct rpc_xprt *xprt;
1504
1505 dprintk("RPC: xs_tcp_data_ready...\n");
1506
1507 read_lock_bh(&sk->sk_callback_lock);
1508 if (!(xprt = xprt_from_sock(sk)))
1509 goto out;
1510 transport = container_of(xprt, struct sock_xprt, xprt);
1511
1512 /* Any data means we had a useful conversation, so
1513 * the we don't need to delay the next reconnect
1514 */
1515 if (xprt->reestablish_timeout)
1516 xprt->reestablish_timeout = 0;
1517 queue_work(rpciod_workqueue, &transport->recv_worker);
1518
1519out:
1520 read_unlock_bh(&sk->sk_callback_lock);
1521}
1522
1523/**
1524 * xs_tcp_state_change - callback to handle TCP socket state changes 1541 * xs_tcp_state_change - callback to handle TCP socket state changes
1525 * @sk: socket whose state has changed 1542 * @sk: socket whose state has changed
1526 * 1543 *
@@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
1714 1731
1715static unsigned short xs_get_random_port(void) 1732static unsigned short xs_get_random_port(void)
1716{ 1733{
1717 unsigned short range = xprt_max_resvport - xprt_min_resvport; 1734 unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
1718 unsigned short rand = (unsigned short) prandom_u32() % range; 1735 unsigned short rand = (unsigned short) prandom_u32() % range;
1719 return rand + xprt_min_resvport; 1736 return rand + xprt_min_resvport;
1720} 1737}
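The xs_get_random_port() change fixes an off-by-one: the reserved-port range is inclusive at both ends, so it contains (max - min + 1) candidates, and with the old arithmetic the maximum port was never chosen. With the default limits (665 and 1023 at the time), for example:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned short min = 665, max = 1023;   /* default resvport limits */
        unsigned short range = max - min + 1;   /* 359 ports; 1023 is now reachable */
        unsigned short port = (unsigned short)(rand() % range) + min;

        printf("picked reserved port %u\n", port);
        return 0;
}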
@@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
2241 xs_save_old_callbacks(transport, sk); 2258 xs_save_old_callbacks(transport, sk);
2242 2259
2243 sk->sk_user_data = xprt; 2260 sk->sk_user_data = xprt;
2244 sk->sk_data_ready = xs_tcp_data_ready; 2261 sk->sk_data_ready = xs_data_ready;
2245 sk->sk_state_change = xs_tcp_state_change; 2262 sk->sk_state_change = xs_tcp_state_change;
2246 sk->sk_write_space = xs_tcp_write_space; 2263 sk->sk_write_space = xs_tcp_write_space;
2247 sock_set_flag(sk, SOCK_FASYNC); 2264 sock_set_flag(sk, SOCK_FASYNC);
@@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
2380 /* Start by resetting any existing state */ 2397 /* Start by resetting any existing state */
2381 xs_reset_transport(transport); 2398 xs_reset_transport(transport);
2382 2399
2383 queue_delayed_work(rpciod_workqueue, 2400 queue_delayed_work(xprtiod_workqueue,
2384 &transport->connect_worker, 2401 &transport->connect_worker,
2385 xprt->reestablish_timeout); 2402 xprt->reestablish_timeout);
2386 xprt->reestablish_timeout <<= 1; 2403 xprt->reestablish_timeout <<= 1;
@@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
2390 xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; 2407 xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
2391 } else { 2408 } else {
2392 dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); 2409 dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
2393 queue_delayed_work(rpciod_workqueue, 2410 queue_delayed_work(xprtiod_workqueue,
2394 &transport->connect_worker, 0); 2411 &transport->connect_worker, 0);
2395 } 2412 }
2396} 2413}
@@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val,
3153 3170
3154static int param_set_portnr(const char *val, const struct kernel_param *kp) 3171static int param_set_portnr(const char *val, const struct kernel_param *kp)
3155{ 3172{
3156 return param_set_uint_minmax(val, kp, 3173 if (kp->arg == &xprt_min_resvport)
3174 return param_set_uint_minmax(val, kp,
3157 RPC_MIN_RESVPORT, 3175 RPC_MIN_RESVPORT,
3176 xprt_max_resvport);
3177 return param_set_uint_minmax(val, kp,
3178 xprt_min_resvport,
3158 RPC_MAX_RESVPORT); 3179 RPC_MAX_RESVPORT);
3159} 3180}
3160 3181
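Both the sysctl table change above and this module-parameter change enforce the same invariant: min_resvport can never be raised above the current max_resvport, and max_resvport can never be lowered below the current min. A hedged sketch of that clamping (bound constants are simplified stand-ins):

#include <stdbool.h>

static unsigned int xprt_min_resvport = 665;
static unsigned int xprt_max_resvport = 1023;

#define RPC_MIN_RESVPORT 1
#define RPC_MAX_RESVPORT 65535

static bool set_port_limit(unsigned int *which, unsigned int val)
{
        unsigned int lo, hi;

        if (which == &xprt_min_resvport) {
                lo = RPC_MIN_RESVPORT;
                hi = xprt_max_resvport;         /* cannot exceed the current max */
        } else {
                lo = xprt_min_resvport;         /* cannot go below the current min */
                hi = RPC_MAX_RESVPORT;
        }
        if (val < lo || val > hi)
                return false;
        *which = val;
        return true;
}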