author     Linus Torvalds <torvalds@linux-foundation.org>   2016-07-30 19:33:25 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-07-30 19:33:25 -0400
commit     7f155c702677d057d03b192ce652311de5434697 (patch)
tree       dcee0fbb463ec3e55cb50181180c7d175d5895c3
parent     d761f3ed6e71bcca724a6e9e39efcac65b7b4ac1 (diff)
parent     944171cbf499d3445c749f7c13c46de0a564a905 (diff)
Merge tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
"Highlights include:
Stable bugfixes:
- nfs: don't create zero-length requests
- several LAYOUTGET bugfixes
Features:
- several performance related features
- more aggressive caching when we can rely on close-to-open
cache consistency
- remove serialisation of O_DIRECT reads and writes
- optimise several code paths to not flush to disk unnecessarily.
However, allow for the idiosyncrasies of pNFS for those layout
types that need to issue a LAYOUTCOMMIT before the metadata can
be updated on the server.
- SUNRPC updates to the client data receive path
- pNFS/SCSI: support for RH/Fedora dm-mpath device nodes
- pNFS files/flexfiles can now use unprivileged ports when
the generic NFS mount options allow it.
Bugfixes:
- Don't use RDMA direct data placement together with data
integrity or privacy security flavours
- Remove the RDMA ALLPHYSICAL memory registration mode as
it has potential security holes.
- Several layout recall fixes to improve NFSv4.1 protocol
compliance.
- Fix an Oops in the pNFS files and flexfiles connection
setup to the DS
- Allow retry of operations that used a returned delegation
stateid
- Don't mark the inode as revalidated if a LAYOUTCOMMIT is
outstanding
- Fix writeback races in nfs4_copy_range() and
nfs42_proc_deallocate()"
* tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits)
pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding
NFSv4: Clean up lookup of SECINFO_NO_NAME
NFSv4.2: Fix warning "variable ‘stateids’ set but not used"
NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’"
SUNRPC: Fix a compiler warning in fs/nfs/clnt.c
pNFS: Remove redundant smp_mb() from pnfs_init_lseg()
pNFS: Cleanup - do layout segment initialisation in one place
pNFS: Remove redundant stateid invalidation
pNFS: Remove redundant pnfs_mark_layout_returned_if_empty()
pNFS: Clear the layout metadata if the server changed the layout stateid
pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid()
NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id
pNFS: Do not set plh_return_seq for non-callback related layoutreturns
pNFS: Ensure layoutreturn acts as a completion for layout callbacks
pNFS: Fix CB_LAYOUTRECALL stateid verification
pNFS: Always update the layout barrier seqid on LAYOUTGET
pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set
pNFS: Clear the layout return tracking on layout reinitialisation
pNFS: LAYOUTRETURN should only update the stateid if the layout is valid
nfs: don't create zero-length requests
...
55 files changed, 1748 insertions, 1500 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8664417955a2..6abdda209642 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o | |||
6 | 6 | ||
7 | CFLAGS_nfstrace.o += -I$(src) | 7 | CFLAGS_nfstrace.o += -I$(src) |
8 | nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ | 8 | nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ |
9 | direct.o pagelist.o read.o symlink.o unlink.o \ | 9 | io.o direct.o pagelist.o read.o symlink.o unlink.o \ |
10 | write.o namespace.o mount_clnt.o nfstrace.o | 10 | write.o namespace.o mount_clnt.o nfstrace.o |
11 | nfs-$(CONFIG_ROOT_NFS) += nfsroot.o | 11 | nfs-$(CONFIG_ROOT_NFS) += nfsroot.o |
12 | nfs-$(CONFIG_SYSCTL) += sysctl.o | 12 | nfs-$(CONFIG_SYSCTL) += sysctl.o |
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e5b89675263e..a69ef4e9c24c 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) | |||
65 | if (!p) | 65 | if (!p) |
66 | return -EIO; | 66 | return -EIO; |
67 | b->simple.nr_sigs = be32_to_cpup(p++); | 67 | b->simple.nr_sigs = be32_to_cpup(p++); |
68 | if (!b->simple.nr_sigs) { | 68 | if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { |
69 | dprintk("no signature\n"); | 69 | dprintk("Bad signature count: %d\n", b->simple.nr_sigs); |
70 | return -EIO; | 70 | return -EIO; |
71 | } | 71 | } |
72 | 72 | ||
@@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) | |||
89 | memcpy(&b->simple.sigs[i].sig, p, | 89 | memcpy(&b->simple.sigs[i].sig, p, |
90 | b->simple.sigs[i].sig_len); | 90 | b->simple.sigs[i].sig_len); |
91 | 91 | ||
92 | b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; | 92 | b->simple.len += 8 + 4 + \ |
93 | (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2); | ||
93 | } | 94 | } |
94 | break; | 95 | break; |
95 | case PNFS_BLOCK_VOLUME_SLICE: | 96 | case PNFS_BLOCK_VOLUME_SLICE: |
@@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) | |||
104 | p = xdr_inline_decode(xdr, 4); | 105 | p = xdr_inline_decode(xdr, 4); |
105 | if (!p) | 106 | if (!p) |
106 | return -EIO; | 107 | return -EIO; |
108 | |||
107 | b->concat.volumes_count = be32_to_cpup(p++); | 109 | b->concat.volumes_count = be32_to_cpup(p++); |
110 | if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { | ||
111 | dprintk("Too many volumes: %d\n", b->concat.volumes_count); | ||
112 | return -EIO; | ||
113 | } | ||
108 | 114 | ||
109 | p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); | 115 | p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); |
110 | if (!p) | 116 | if (!p) |
@@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) | |||
116 | p = xdr_inline_decode(xdr, 8 + 4); | 122 | p = xdr_inline_decode(xdr, 8 + 4); |
117 | if (!p) | 123 | if (!p) |
118 | return -EIO; | 124 | return -EIO; |
125 | |||
119 | p = xdr_decode_hyper(p, &b->stripe.chunk_size); | 126 | p = xdr_decode_hyper(p, &b->stripe.chunk_size); |
120 | b->stripe.volumes_count = be32_to_cpup(p++); | 127 | b->stripe.volumes_count = be32_to_cpup(p++); |
128 | if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { | ||
129 | dprintk("Too many volumes: %d\n", b->stripe.volumes_count); | ||
130 | return -EIO; | ||
131 | } | ||
121 | 132 | ||
122 | p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); | 133 | p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); |
123 | if (!p) | 134 | if (!p) |
@@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, | |||
224 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | 235 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) |
225 | { | 236 | { |
226 | struct pnfs_block_volume *v = &volumes[idx]; | 237 | struct pnfs_block_volume *v = &volumes[idx]; |
238 | struct block_device *bdev; | ||
227 | dev_t dev; | 239 | dev_t dev; |
228 | 240 | ||
229 | dev = bl_resolve_deviceid(server, v, gfp_mask); | 241 | dev = bl_resolve_deviceid(server, v, gfp_mask); |
230 | if (!dev) | 242 | if (!dev) |
231 | return -EIO; | 243 | return -EIO; |
232 | 244 | ||
233 | d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); | 245 | bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); |
234 | if (IS_ERR(d->bdev)) { | 246 | if (IS_ERR(bdev)) { |
235 | printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", | 247 | printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", |
236 | MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); | 248 | MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); |
237 | return PTR_ERR(d->bdev); | 249 | return PTR_ERR(bdev); |
238 | } | 250 | } |
251 | d->bdev = bdev; | ||
239 | 252 | ||
240 | 253 | ||
241 | d->len = i_size_read(d->bdev->bd_inode); | 254 | d->len = i_size_read(d->bdev->bd_inode); |
@@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v) | |||
287 | } | 300 | } |
288 | } | 301 | } |
289 | 302 | ||
303 | /* | ||
304 | * Try to open the udev path for the WWN. At least on Debian the udev | ||
305 | * by-id path will always point to the dm-multipath device if one exists. | ||
306 | */ | ||
307 | static struct block_device * | ||
308 | bl_open_udev_path(struct pnfs_block_volume *v) | ||
309 | { | ||
310 | struct block_device *bdev; | ||
311 | const char *devname; | ||
312 | |||
313 | devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", | ||
314 | v->scsi.designator_len, v->scsi.designator); | ||
315 | if (!devname) | ||
316 | return ERR_PTR(-ENOMEM); | ||
317 | |||
318 | bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); | ||
319 | if (IS_ERR(bdev)) { | ||
320 | pr_warn("pNFS: failed to open device %s (%ld)\n", | ||
321 | devname, PTR_ERR(bdev)); | ||
322 | } | ||
323 | |||
324 | kfree(devname); | ||
325 | return bdev; | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the | ||
330 | * wwn- links will only point to the first discovered SCSI device there. | ||
331 | */ | ||
332 | static struct block_device * | ||
333 | bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) | ||
334 | { | ||
335 | struct block_device *bdev; | ||
336 | const char *devname; | ||
337 | |||
338 | devname = kasprintf(GFP_KERNEL, | ||
339 | "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", | ||
340 | v->scsi.designator_type, | ||
341 | v->scsi.designator_len, v->scsi.designator); | ||
342 | if (!devname) | ||
343 | return ERR_PTR(-ENOMEM); | ||
344 | |||
345 | bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); | ||
346 | kfree(devname); | ||
347 | return bdev; | ||
348 | } | ||
349 | |||
290 | static int | 350 | static int |
291 | bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, | 351 | bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, |
292 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | 352 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) |
293 | { | 353 | { |
294 | struct pnfs_block_volume *v = &volumes[idx]; | 354 | struct pnfs_block_volume *v = &volumes[idx]; |
355 | struct block_device *bdev; | ||
295 | const struct pr_ops *ops; | 356 | const struct pr_ops *ops; |
296 | const char *devname; | ||
297 | int error; | 357 | int error; |
298 | 358 | ||
299 | if (!bl_validate_designator(v)) | 359 | if (!bl_validate_designator(v)) |
300 | return -EINVAL; | 360 | return -EINVAL; |
301 | 361 | ||
302 | switch (v->scsi.designator_len) { | 362 | bdev = bl_open_dm_mpath_udev_path(v); |
303 | case 8: | 363 | if (IS_ERR(bdev)) |
304 | devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", | 364 | bdev = bl_open_udev_path(v); |
305 | v->scsi.designator); | 365 | if (IS_ERR(bdev)) |
306 | break; | 366 | return PTR_ERR(bdev); |
307 | case 12: | 367 | d->bdev = bdev; |
308 | devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN", | ||
309 | v->scsi.designator); | ||
310 | break; | ||
311 | case 16: | ||
312 | devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN", | ||
313 | v->scsi.designator); | ||
314 | break; | ||
315 | default: | ||
316 | return -EINVAL; | ||
317 | } | ||
318 | |||
319 | d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL); | ||
320 | if (IS_ERR(d->bdev)) { | ||
321 | pr_warn("pNFS: failed to open device %s (%ld)\n", | ||
322 | devname, PTR_ERR(d->bdev)); | ||
323 | kfree(devname); | ||
324 | return PTR_ERR(d->bdev); | ||
325 | } | ||
326 | |||
327 | kfree(devname); | ||
328 | 368 | ||
329 | d->len = i_size_read(d->bdev->bd_inode); | 369 | d->len = i_size_read(d->bdev->bd_inode); |
330 | d->map = bl_map_simple; | 370 | d->map = bl_map_simple; |
@@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, | |||
352 | return 0; | 392 | return 0; |
353 | 393 | ||
354 | out_blkdev_put: | 394 | out_blkdev_put: |
355 | blkdev_put(d->bdev, FMODE_READ); | 395 | blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); |
356 | return error; | 396 | return error; |
357 | } | 397 | } |
358 | 398 | ||
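The length-accounting fix in the second hunk above (b->simple.len) hinges on XDR padding rules: an opaque<> field is transmitted padded to a 4-byte boundary, so each signature's contribution must be rounded up with XDR_QUADLEN rather than added as the raw sig_len. A minimal userspace sketch of that arithmetic follows; XDR_QUADLEN simply mirrors the kernel macro of the same name, and nothing else in the snippet is kernel code.

```c
#include <stdint.h>
#include <stdio.h>

#define XDR_QUADLEN(l) (((l) + 3) >> 2)   /* number of 4-byte XDR units */

/* offset (8 bytes) + length word (4 bytes) + opaque body padded to 4 bytes */
static uint32_t xdr_sig_size(uint32_t sig_len)
{
	return 8 + 4 + (XDR_QUADLEN(sig_len) << 2);
}

int main(void)
{
	/* A 6-byte signature occupies 8 bytes of opaque data on the wire. */
	printf("sig_len=6  -> %u bytes (the unpadded sum would be 18)\n",
	       xdr_sig_size(6));
	printf("sig_len=16 -> %u bytes\n", xdr_sig_size(16));
	return 0;
}
```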
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 720b3ff55fa9..992bcb19c11e 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) | |||
121 | return be; | 121 | return be; |
122 | } | 122 | } |
123 | 123 | ||
124 | static void __ext_put_deviceids(struct list_head *head) | ||
125 | { | ||
126 | struct pnfs_block_extent *be, *tmp; | ||
127 | |||
128 | list_for_each_entry_safe(be, tmp, head, be_list) { | ||
129 | nfs4_put_deviceid_node(be->be_device); | ||
130 | kfree(be); | ||
131 | } | ||
132 | } | ||
133 | |||
124 | static void | 134 | static void |
125 | __ext_tree_insert(struct rb_root *root, | 135 | __ext_tree_insert(struct rb_root *root, |
126 | struct pnfs_block_extent *new, bool merge_ok) | 136 | struct pnfs_block_extent *new, bool merge_ok) |
@@ -163,7 +173,8 @@ free_new: | |||
163 | } | 173 | } |
164 | 174 | ||
165 | static int | 175 | static int |
166 | __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) | 176 | __ext_tree_remove(struct rb_root *root, |
177 | sector_t start, sector_t end, struct list_head *tmp) | ||
167 | { | 178 | { |
168 | struct pnfs_block_extent *be; | 179 | struct pnfs_block_extent *be; |
169 | sector_t len1 = 0, len2 = 0; | 180 | sector_t len1 = 0, len2 = 0; |
@@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) | |||
223 | struct pnfs_block_extent *next = ext_tree_next(be); | 234 | struct pnfs_block_extent *next = ext_tree_next(be); |
224 | 235 | ||
225 | rb_erase(&be->be_node, root); | 236 | rb_erase(&be->be_node, root); |
226 | nfs4_put_deviceid_node(be->be_device); | 237 | list_add_tail(&be->be_list, tmp); |
227 | kfree(be); | ||
228 | be = next; | 238 | be = next; |
229 | } | 239 | } |
230 | 240 | ||
@@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, | |||
350 | sector_t start, sector_t end) | 360 | sector_t start, sector_t end) |
351 | { | 361 | { |
352 | int err, err2; | 362 | int err, err2; |
363 | LIST_HEAD(tmp); | ||
353 | 364 | ||
354 | spin_lock(&bl->bl_ext_lock); | 365 | spin_lock(&bl->bl_ext_lock); |
355 | err = __ext_tree_remove(&bl->bl_ext_ro, start, end); | 366 | err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); |
356 | if (rw) { | 367 | if (rw) { |
357 | err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); | 368 | err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); |
358 | if (!err) | 369 | if (!err) |
359 | err = err2; | 370 | err = err2; |
360 | } | 371 | } |
361 | spin_unlock(&bl->bl_ext_lock); | 372 | spin_unlock(&bl->bl_ext_lock); |
362 | 373 | ||
374 | __ext_put_deviceids(&tmp); | ||
363 | return err; | 375 | return err; |
364 | } | 376 | } |
365 | 377 | ||
@@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, | |||
396 | sector_t end = start + len; | 408 | sector_t end = start + len; |
397 | struct pnfs_block_extent *be; | 409 | struct pnfs_block_extent *be; |
398 | int err = 0; | 410 | int err = 0; |
411 | LIST_HEAD(tmp); | ||
399 | 412 | ||
400 | spin_lock(&bl->bl_ext_lock); | 413 | spin_lock(&bl->bl_ext_lock); |
401 | /* | 414 | /* |
402 | * First remove all COW extents or holes from written to range. | 415 | * First remove all COW extents or holes from written to range. |
403 | */ | 416 | */ |
404 | err = __ext_tree_remove(&bl->bl_ext_ro, start, end); | 417 | err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); |
405 | if (err) | 418 | if (err) |
406 | goto out; | 419 | goto out; |
407 | 420 | ||
@@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, | |||
459 | } | 472 | } |
460 | out: | 473 | out: |
461 | spin_unlock(&bl->bl_ext_lock); | 474 | spin_unlock(&bl->bl_ext_lock); |
475 | |||
476 | __ext_put_deviceids(&tmp); | ||
462 | return err; | 477 | return err; |
463 | } | 478 | } |
464 | 479 | ||
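The extent_tree.c change above is an instance of a common locking pattern: extents that need nfs4_put_deviceid_node() and kfree() are only unlinked while bl_ext_lock is held and are parked on a temporary list, and the actual release runs after the spinlock is dropped. Below is a standalone userspace sketch of the same pattern, with a pthread mutex standing in for the spinlock and a plain linked list standing in for the rb-tree; all names are illustrative, not NFS code.

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct extent {
	int id;
	struct extent *next;
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct extent *tree;		/* stand-in for the extent rb-tree */

static void remove_all(struct extent **out_tmp)
{
	pthread_mutex_lock(&tree_lock);
	*out_tmp = tree;		/* only unlink while the lock is held */
	tree = NULL;
	pthread_mutex_unlock(&tree_lock);
}

static void put_extents(struct extent *tmp)
{
	while (tmp) {			/* release only after the lock is dropped */
		struct extent *next = tmp->next;
		printf("putting extent %d\n", tmp->id);
		free(tmp);
		tmp = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct extent *e = malloc(sizeof(*e));
		e->id = i;
		e->next = tree;
		tree = e;
	}
	struct extent *tmp;
	remove_all(&tmp);
	put_extents(tmp);
	return 0;
}
```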
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index aaa2e8d3df6f..c92a75e066a6 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -119,27 +119,30 @@ out: | |||
119 | * hashed by filehandle. | 119 | * hashed by filehandle. |
120 | */ | 120 | */ |
121 | static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, | 121 | static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, |
122 | struct nfs_fh *fh, nfs4_stateid *stateid) | 122 | struct nfs_fh *fh) |
123 | { | 123 | { |
124 | struct nfs_server *server; | 124 | struct nfs_server *server; |
125 | struct nfs_inode *nfsi; | ||
125 | struct inode *ino; | 126 | struct inode *ino; |
126 | struct pnfs_layout_hdr *lo; | 127 | struct pnfs_layout_hdr *lo; |
127 | 128 | ||
129 | restart: | ||
128 | list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { | 130 | list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { |
129 | list_for_each_entry(lo, &server->layouts, plh_layouts) { | 131 | list_for_each_entry(lo, &server->layouts, plh_layouts) { |
130 | if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) | 132 | nfsi = NFS_I(lo->plh_inode); |
133 | if (nfs_compare_fh(fh, &nfsi->fh)) | ||
131 | continue; | 134 | continue; |
132 | if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) | 135 | if (nfsi->layout != lo) |
133 | continue; | 136 | continue; |
134 | ino = igrab(lo->plh_inode); | 137 | ino = igrab(lo->plh_inode); |
135 | if (!ino) | 138 | if (!ino) |
136 | break; | 139 | break; |
137 | spin_lock(&ino->i_lock); | 140 | spin_lock(&ino->i_lock); |
138 | /* Is this layout in the process of being freed? */ | 141 | /* Is this layout in the process of being freed? */ |
139 | if (NFS_I(ino)->layout != lo) { | 142 | if (nfsi->layout != lo) { |
140 | spin_unlock(&ino->i_lock); | 143 | spin_unlock(&ino->i_lock); |
141 | iput(ino); | 144 | iput(ino); |
142 | break; | 145 | goto restart; |
143 | } | 146 | } |
144 | pnfs_get_layout_hdr(lo); | 147 | pnfs_get_layout_hdr(lo); |
145 | spin_unlock(&ino->i_lock); | 148 | spin_unlock(&ino->i_lock); |
@@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, | |||
151 | } | 154 | } |
152 | 155 | ||
153 | static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, | 156 | static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, |
154 | struct nfs_fh *fh, nfs4_stateid *stateid) | 157 | struct nfs_fh *fh) |
155 | { | 158 | { |
156 | struct pnfs_layout_hdr *lo; | 159 | struct pnfs_layout_hdr *lo; |
157 | 160 | ||
158 | spin_lock(&clp->cl_lock); | 161 | spin_lock(&clp->cl_lock); |
159 | rcu_read_lock(); | 162 | rcu_read_lock(); |
160 | lo = get_layout_by_fh_locked(clp, fh, stateid); | 163 | lo = get_layout_by_fh_locked(clp, fh); |
161 | rcu_read_unlock(); | 164 | rcu_read_unlock(); |
162 | spin_unlock(&clp->cl_lock); | 165 | spin_unlock(&clp->cl_lock); |
163 | 166 | ||
@@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, | |||
167 | /* | 170 | /* |
168 | * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) | 171 | * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) |
169 | */ | 172 | */ |
170 | static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, | 173 | static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, |
171 | const nfs4_stateid *new) | 174 | const nfs4_stateid *new) |
172 | { | 175 | { |
173 | u32 oldseq, newseq; | 176 | u32 oldseq, newseq; |
174 | 177 | ||
175 | oldseq = be32_to_cpu(lo->plh_stateid.seqid); | 178 | /* Is the stateid still not initialised? */ |
179 | if (!pnfs_layout_is_valid(lo)) | ||
180 | return NFS4ERR_DELAY; | ||
181 | |||
182 | /* Mismatched stateid? */ | ||
183 | if (!nfs4_stateid_match_other(&lo->plh_stateid, new)) | ||
184 | return NFS4ERR_BAD_STATEID; | ||
185 | |||
176 | newseq = be32_to_cpu(new->seqid); | 186 | newseq = be32_to_cpu(new->seqid); |
187 | /* Are we already in a layout recall situation? */ | ||
188 | if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && | ||
189 | lo->plh_return_seq != 0) { | ||
190 | if (newseq < lo->plh_return_seq) | ||
191 | return NFS4ERR_OLD_STATEID; | ||
192 | if (newseq > lo->plh_return_seq) | ||
193 | return NFS4ERR_DELAY; | ||
194 | goto out; | ||
195 | } | ||
177 | 196 | ||
197 | /* Check that the stateid matches what we think it should be. */ | ||
198 | oldseq = be32_to_cpu(lo->plh_stateid.seqid); | ||
178 | if (newseq > oldseq + 1) | 199 | if (newseq > oldseq + 1) |
179 | return false; | 200 | return NFS4ERR_DELAY; |
180 | return true; | 201 | /* Crazy server! */ |
202 | if (newseq <= oldseq) | ||
203 | return NFS4ERR_OLD_STATEID; | ||
204 | out: | ||
205 | return NFS_OK; | ||
181 | } | 206 | } |
182 | 207 | ||
183 | static u32 initiate_file_draining(struct nfs_client *clp, | 208 | static u32 initiate_file_draining(struct nfs_client *clp, |
@@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, | |||
188 | u32 rv = NFS4ERR_NOMATCHING_LAYOUT; | 213 | u32 rv = NFS4ERR_NOMATCHING_LAYOUT; |
189 | LIST_HEAD(free_me_list); | 214 | LIST_HEAD(free_me_list); |
190 | 215 | ||
191 | lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); | 216 | lo = get_layout_by_fh(clp, &args->cbl_fh); |
192 | if (!lo) { | 217 | if (!lo) { |
193 | trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, | 218 | trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, |
194 | &args->cbl_stateid, -rv); | 219 | &args->cbl_stateid, -rv); |
@@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, | |||
196 | } | 221 | } |
197 | 222 | ||
198 | ino = lo->plh_inode; | 223 | ino = lo->plh_inode; |
224 | pnfs_layoutcommit_inode(ino, false); | ||
225 | |||
199 | 226 | ||
200 | spin_lock(&ino->i_lock); | 227 | spin_lock(&ino->i_lock); |
201 | if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { | 228 | rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); |
202 | rv = NFS4ERR_DELAY; | 229 | if (rv != NFS_OK) |
203 | goto unlock; | 230 | goto unlock; |
204 | } | ||
205 | pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); | 231 | pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); |
206 | spin_unlock(&ino->i_lock); | ||
207 | |||
208 | pnfs_layoutcommit_inode(ino, false); | ||
209 | 232 | ||
210 | spin_lock(&ino->i_lock); | ||
211 | /* | 233 | /* |
212 | * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) | 234 | * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) |
213 | */ | 235 | */ |
@@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp, | |||
223 | goto unlock; | 245 | goto unlock; |
224 | } | 246 | } |
225 | 247 | ||
248 | /* Embrace your forgetfulness! */ | ||
249 | rv = NFS4ERR_NOMATCHING_LAYOUT; | ||
250 | |||
226 | if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { | 251 | if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { |
227 | NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, | 252 | NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, |
228 | &args->cbl_range); | 253 | &args->cbl_range); |
229 | } | 254 | } |
230 | pnfs_mark_layout_returned_if_empty(lo); | ||
231 | unlock: | 255 | unlock: |
232 | spin_unlock(&ino->i_lock); | 256 | spin_unlock(&ino->i_lock); |
233 | pnfs_free_lseg_list(&free_me_list); | 257 | pnfs_free_lseg_list(&free_me_list); |
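The new pnfs_check_callback_stateid() above encodes the CB_LAYOUTRECALL sequencing rules of RFC 5661 section 12.5.5.2.1 as a chain of seqid comparisons. The same decision logic can be written as a small pure function over plain integers, which may make the cases easier to see; the NFS4ERR_* values below are the standard protocol error numbers, but the function itself is only an illustration, not the kernel code.

```c
#include <stdbool.h>
#include <stdio.h>

#define NFS_OK			0
#define NFS4ERR_DELAY		10008
#define NFS4ERR_OLD_STATEID	10024
#define NFS4ERR_BAD_STATEID	10025

static int check_callback_seqid(bool layout_valid, bool stateid_matches,
				bool return_requested, unsigned int return_seq,
				unsigned int cur_seq, unsigned int new_seq)
{
	if (!layout_valid)			/* stateid not initialised yet */
		return NFS4ERR_DELAY;
	if (!stateid_matches)			/* different layout stateid */
		return NFS4ERR_BAD_STATEID;
	if (return_requested && return_seq != 0) {
		if (new_seq < return_seq)	/* already superseded */
			return NFS4ERR_OLD_STATEID;
		if (new_seq > return_seq)	/* recall raced ahead of us */
			return NFS4ERR_DELAY;
		return NFS_OK;
	}
	if (new_seq > cur_seq + 1)		/* we missed an update */
		return NFS4ERR_DELAY;
	if (new_seq <= cur_seq)			/* replay or misbehaving server */
		return NFS4ERR_OLD_STATEID;
	return NFS_OK;
}

int main(void)
{
	printf("%d\n", check_callback_seqid(true, true, false, 0, 3, 4)); /* 0 */
	printf("%d\n", check_callback_seqid(true, true, false, 0, 3, 6)); /* 10008 */
	return 0;
}
```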
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d81f96aacd51..656f68f7fe53 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r | |||
925 | if (hdr_arg.minorversion == 0) { | 925 | if (hdr_arg.minorversion == 0) { |
926 | cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); | 926 | cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); |
927 | if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) | 927 | if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) |
928 | return rpc_drop_reply; | 928 | goto out_invalidcred; |
929 | } | 929 | } |
930 | 930 | ||
931 | cps.minorversion = hdr_arg.minorversion; | 931 | cps.minorversion = hdr_arg.minorversion; |
@@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r | |||
953 | nfs_put_client(cps.clp); | 953 | nfs_put_client(cps.clp); |
954 | dprintk("%s: done, status = %u\n", __func__, ntohl(status)); | 954 | dprintk("%s: done, status = %u\n", __func__, ntohl(status)); |
955 | return rpc_success; | 955 | return rpc_success; |
956 | |||
957 | out_invalidcred: | ||
958 | pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); | ||
959 | return rpc_autherr_badcred; | ||
956 | } | 960 | } |
957 | 961 | ||
958 | /* | 962 | /* |
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 487c5607d52f..003ebce4bbc4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, | |||
367 | */ | 367 | */ |
368 | struct nfs_client * | 368 | struct nfs_client * |
369 | nfs_get_client(const struct nfs_client_initdata *cl_init, | 369 | nfs_get_client(const struct nfs_client_initdata *cl_init, |
370 | const struct rpc_timeout *timeparms, | ||
371 | const char *ip_addr, | ||
372 | rpc_authflavor_t authflavour) | 370 | rpc_authflavor_t authflavour) |
373 | { | 371 | { |
374 | struct nfs_client *clp, *new = NULL; | 372 | struct nfs_client *clp, *new = NULL; |
@@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, | |||
399 | &nn->nfs_client_list); | 397 | &nn->nfs_client_list); |
400 | spin_unlock(&nn->nfs_client_lock); | 398 | spin_unlock(&nn->nfs_client_lock); |
401 | new->cl_flags = cl_init->init_flags; | 399 | new->cl_flags = cl_init->init_flags; |
402 | return rpc_ops->init_client(new, timeparms, ip_addr); | 400 | return rpc_ops->init_client(new, cl_init); |
403 | } | 401 | } |
404 | 402 | ||
405 | spin_unlock(&nn->nfs_client_lock); | 403 | spin_unlock(&nn->nfs_client_lock); |
@@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values); | |||
470 | * Create an RPC client handle | 468 | * Create an RPC client handle |
471 | */ | 469 | */ |
472 | int nfs_create_rpc_client(struct nfs_client *clp, | 470 | int nfs_create_rpc_client(struct nfs_client *clp, |
473 | const struct rpc_timeout *timeparms, | 471 | const struct nfs_client_initdata *cl_init, |
474 | rpc_authflavor_t flavor) | 472 | rpc_authflavor_t flavor) |
475 | { | 473 | { |
476 | struct rpc_clnt *clnt = NULL; | 474 | struct rpc_clnt *clnt = NULL; |
@@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp, | |||
479 | .protocol = clp->cl_proto, | 477 | .protocol = clp->cl_proto, |
480 | .address = (struct sockaddr *)&clp->cl_addr, | 478 | .address = (struct sockaddr *)&clp->cl_addr, |
481 | .addrsize = clp->cl_addrlen, | 479 | .addrsize = clp->cl_addrlen, |
482 | .timeout = timeparms, | 480 | .timeout = cl_init->timeparms, |
483 | .servername = clp->cl_hostname, | 481 | .servername = clp->cl_hostname, |
482 | .nodename = cl_init->nodename, | ||
484 | .program = &nfs_program, | 483 | .program = &nfs_program, |
485 | .version = clp->rpc_ops->version, | 484 | .version = clp->rpc_ops->version, |
486 | .authflavor = flavor, | 485 | .authflavor = flavor, |
@@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); | |||
591 | * nfs_init_client - Initialise an NFS2 or NFS3 client | 590 | * nfs_init_client - Initialise an NFS2 or NFS3 client |
592 | * | 591 | * |
593 | * @clp: nfs_client to initialise | 592 | * @clp: nfs_client to initialise |
594 | * @timeparms: timeout parameters for underlying RPC transport | 593 | * @cl_init: Initialisation parameters |
595 | * @ip_addr: IP presentation address (not used) | ||
596 | * | 594 | * |
597 | * Returns pointer to an NFS client, or an ERR_PTR value. | 595 | * Returns pointer to an NFS client, or an ERR_PTR value. |
598 | */ | 596 | */ |
599 | struct nfs_client *nfs_init_client(struct nfs_client *clp, | 597 | struct nfs_client *nfs_init_client(struct nfs_client *clp, |
600 | const struct rpc_timeout *timeparms, | 598 | const struct nfs_client_initdata *cl_init) |
601 | const char *ip_addr) | ||
602 | { | 599 | { |
603 | int error; | 600 | int error; |
604 | 601 | ||
@@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, | |||
612 | * Create a client RPC handle for doing FSSTAT with UNIX auth only | 609 | * Create a client RPC handle for doing FSSTAT with UNIX auth only |
613 | * - RFC 2623, sec 2.3.2 | 610 | * - RFC 2623, sec 2.3.2 |
614 | */ | 611 | */ |
615 | error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); | 612 | error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); |
616 | if (error < 0) | 613 | if (error < 0) |
617 | goto error; | 614 | goto error; |
618 | nfs_mark_client_ready(clp, NFS_CS_READY); | 615 | nfs_mark_client_ready(clp, NFS_CS_READY); |
@@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server, | |||
633 | const struct nfs_parsed_mount_data *data, | 630 | const struct nfs_parsed_mount_data *data, |
634 | struct nfs_subversion *nfs_mod) | 631 | struct nfs_subversion *nfs_mod) |
635 | { | 632 | { |
633 | struct rpc_timeout timeparms; | ||
636 | struct nfs_client_initdata cl_init = { | 634 | struct nfs_client_initdata cl_init = { |
637 | .hostname = data->nfs_server.hostname, | 635 | .hostname = data->nfs_server.hostname, |
638 | .addr = (const struct sockaddr *)&data->nfs_server.address, | 636 | .addr = (const struct sockaddr *)&data->nfs_server.address, |
@@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server, | |||
640 | .nfs_mod = nfs_mod, | 638 | .nfs_mod = nfs_mod, |
641 | .proto = data->nfs_server.protocol, | 639 | .proto = data->nfs_server.protocol, |
642 | .net = data->net, | 640 | .net = data->net, |
641 | .timeparms = &timeparms, | ||
643 | }; | 642 | }; |
644 | struct rpc_timeout timeparms; | ||
645 | struct nfs_client *clp; | 643 | struct nfs_client *clp; |
646 | int error; | 644 | int error; |
647 | 645 | ||
@@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server, | |||
653 | set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); | 651 | set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); |
654 | 652 | ||
655 | /* Allocate or find a client reference we can use */ | 653 | /* Allocate or find a client reference we can use */ |
656 | clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); | 654 | clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); |
657 | if (IS_ERR(clp)) { | 655 | if (IS_ERR(clp)) { |
658 | dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); | 656 | dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); |
659 | return PTR_ERR(clp); | 657 | return PTR_ERR(clp); |
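The client.c hunks above stop threading timeparms and ip_addr through nfs_get_client()/nfs_init_client() as separate arguments; the one-shot creation parameters (including the new nodename) now travel inside the nfs_client_initdata structure that every consumer reads from. A simplified userspace sketch of that aggregation follows; the struct and function names are stand-ins, not the kernel types.

```c
#include <stdio.h>

struct rpc_timeout { unsigned int to_initval, to_retries; };

struct client_initdata {
	const char *hostname;
	const char *nodename;			/* flows through to RPC creation */
	const struct rpc_timeout *timeparms;	/* owned by the caller's stack */
};

static void create_rpc_client(const struct client_initdata *cl_init)
{
	/* every consumer reads from the same initdata instead of extra args */
	printf("connecting to %s (node %s), timeout %u\n",
	       cl_init->hostname, cl_init->nodename,
	       cl_init->timeparms->to_initval);
}

int main(void)
{
	struct rpc_timeout timeparms = { .to_initval = 60, .to_retries = 2 };
	struct client_initdata cl_init = {
		.hostname = "server.example.com",
		.nodename = "client-node",
		.timeparms = &timeparms,
	};
	create_rpc_client(&cl_init);
	return 0;
}
```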
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index baaa38859899..177fefb26c18 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2252,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st | |||
2252 | return NULL; | 2252 | return NULL; |
2253 | } | 2253 | } |
2254 | 2254 | ||
2255 | static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) | 2255 | static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) |
2256 | { | 2256 | { |
2257 | struct nfs_inode *nfsi = NFS_I(inode); | 2257 | struct nfs_inode *nfsi = NFS_I(inode); |
2258 | struct nfs_access_entry *cache; | 2258 | struct nfs_access_entry *cache; |
2259 | int err = -ENOENT; | 2259 | bool retry = true; |
2260 | int err; | ||
2260 | 2261 | ||
2261 | spin_lock(&inode->i_lock); | 2262 | spin_lock(&inode->i_lock); |
2262 | if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) | 2263 | for(;;) { |
2263 | goto out_zap; | 2264 | if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) |
2264 | cache = nfs_access_search_rbtree(inode, cred); | 2265 | goto out_zap; |
2265 | if (cache == NULL) | 2266 | cache = nfs_access_search_rbtree(inode, cred); |
2266 | goto out; | 2267 | err = -ENOENT; |
2267 | if (!nfs_have_delegated_attributes(inode) && | 2268 | if (cache == NULL) |
2268 | !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) | 2269 | goto out; |
2269 | goto out_stale; | 2270 | /* Found an entry, is our attribute cache valid? */ |
2271 | if (!nfs_attribute_cache_expired(inode) && | ||
2272 | !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) | ||
2273 | break; | ||
2274 | err = -ECHILD; | ||
2275 | if (!may_block) | ||
2276 | goto out; | ||
2277 | if (!retry) | ||
2278 | goto out_zap; | ||
2279 | spin_unlock(&inode->i_lock); | ||
2280 | err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); | ||
2281 | if (err) | ||
2282 | return err; | ||
2283 | spin_lock(&inode->i_lock); | ||
2284 | retry = false; | ||
2285 | } | ||
2270 | res->jiffies = cache->jiffies; | 2286 | res->jiffies = cache->jiffies; |
2271 | res->cred = cache->cred; | 2287 | res->cred = cache->cred; |
2272 | res->mask = cache->mask; | 2288 | res->mask = cache->mask; |
@@ -2275,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str | |||
2275 | out: | 2291 | out: |
2276 | spin_unlock(&inode->i_lock); | 2292 | spin_unlock(&inode->i_lock); |
2277 | return err; | 2293 | return err; |
2278 | out_stale: | ||
2279 | rb_erase(&cache->rb_node, &nfsi->access_cache); | ||
2280 | list_del(&cache->lru); | ||
2281 | spin_unlock(&inode->i_lock); | ||
2282 | nfs_access_free_entry(cache); | ||
2283 | return -ENOENT; | ||
2284 | out_zap: | 2294 | out_zap: |
2285 | spin_unlock(&inode->i_lock); | 2295 | spin_unlock(&inode->i_lock); |
2286 | nfs_access_zap_cache(inode); | 2296 | nfs_access_zap_cache(inode); |
@@ -2307,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, | |||
2307 | cache = NULL; | 2317 | cache = NULL; |
2308 | if (cache == NULL) | 2318 | if (cache == NULL) |
2309 | goto out; | 2319 | goto out; |
2310 | if (!nfs_have_delegated_attributes(inode) && | 2320 | err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); |
2311 | !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) | 2321 | if (err) |
2312 | goto out; | 2322 | goto out; |
2313 | res->jiffies = cache->jiffies; | 2323 | res->jiffies = cache->jiffies; |
2314 | res->cred = cache->cred; | 2324 | res->cred = cache->cred; |
2315 | res->mask = cache->mask; | 2325 | res->mask = cache->mask; |
2316 | err = 0; | ||
2317 | out: | 2326 | out: |
2318 | rcu_read_unlock(); | 2327 | rcu_read_unlock(); |
2319 | return err; | 2328 | return err; |
@@ -2402,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask); | |||
2402 | static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) | 2411 | static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) |
2403 | { | 2412 | { |
2404 | struct nfs_access_entry cache; | 2413 | struct nfs_access_entry cache; |
2414 | bool may_block = (mask & MAY_NOT_BLOCK) == 0; | ||
2405 | int status; | 2415 | int status; |
2406 | 2416 | ||
2407 | trace_nfs_access_enter(inode); | 2417 | trace_nfs_access_enter(inode); |
2408 | 2418 | ||
2409 | status = nfs_access_get_cached_rcu(inode, cred, &cache); | 2419 | status = nfs_access_get_cached_rcu(inode, cred, &cache); |
2410 | if (status != 0) | 2420 | if (status != 0) |
2411 | status = nfs_access_get_cached(inode, cred, &cache); | 2421 | status = nfs_access_get_cached(inode, cred, &cache, may_block); |
2412 | if (status == 0) | 2422 | if (status == 0) |
2413 | goto out_cached; | 2423 | goto out_cached; |
2414 | 2424 | ||
2415 | status = -ECHILD; | 2425 | status = -ECHILD; |
2416 | if (mask & MAY_NOT_BLOCK) | 2426 | if (!may_block) |
2417 | goto out; | 2427 | goto out; |
2418 | 2428 | ||
2419 | /* Be clever: ask server to check for all possible rights */ | 2429 | /* Be clever: ask server to check for all possible rights */ |
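The nfs_access_get_cached() rework above replaces the old "drop a stale entry" path with a bounded retry: a cached ACCESS result is only trusted while the attribute cache is valid, a stale hit triggers at most one __nfs_revalidate_inode() call before the loop gives up, and callers that may not block (RCU walk) get -ECHILD instead. A rough userspace sketch of that control flow is below; every helper and field is a stand-in, not an NFS function.

```c
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct cache_result { int mask; };

static bool attrs_valid = false;			/* attribute cache state */
static bool cache_lookup(struct cache_result *res) { res->mask = 0755; return true; }
static int revalidate_inode(void) { attrs_valid = true; return 0; }

static int access_get_cached(struct cache_result *res, bool may_block)
{
	bool retry = true;

	for (;;) {
		if (!cache_lookup(res))
			return -ENOENT;
		if (attrs_valid)		/* hit and attributes still fresh */
			return 0;
		if (!may_block)			/* RCU walk: cannot sleep */
			return -ECHILD;
		if (!retry)			/* already revalidated once */
			return -ENOENT;
		int err = revalidate_inode();	/* may sleep */
		if (err)
			return err;
		retry = false;
	}
}

int main(void)
{
	struct cache_result res;
	printf("non-blocking: %d\n", access_get_cached(&res, false)); /* -ECHILD */
	printf("blocking:     %d\n", access_get_cached(&res, true));  /* 0 */
	return 0;
}
```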
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6210ead71d0..72b7d13ee3c6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, | |||
196 | WARN_ON_ONCE(verfp->committed < 0); | 196 | WARN_ON_ONCE(verfp->committed < 0); |
197 | } | 197 | } |
198 | 198 | ||
199 | static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, | ||
200 | const struct nfs_writeverf *v2) | ||
201 | { | ||
202 | return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); | ||
203 | } | ||
204 | |||
199 | /* | 205 | /* |
200 | * nfs_direct_cmp_hdr_verf - compare verifier for pgio header | 206 | * nfs_direct_cmp_hdr_verf - compare verifier for pgio header |
201 | * @dreq - direct request possibly spanning multiple servers | 207 | * @dreq - direct request possibly spanning multiple servers |
@@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, | |||
215 | nfs_direct_set_hdr_verf(dreq, hdr); | 221 | nfs_direct_set_hdr_verf(dreq, hdr); |
216 | return 0; | 222 | return 0; |
217 | } | 223 | } |
218 | return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); | 224 | return nfs_direct_cmp_verf(verfp, &hdr->verf); |
219 | } | 225 | } |
220 | 226 | ||
221 | /* | 227 | /* |
@@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, | |||
238 | if (verfp->committed < 0) | 244 | if (verfp->committed < 0) |
239 | return 1; | 245 | return 1; |
240 | 246 | ||
241 | return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); | 247 | return nfs_direct_cmp_verf(verfp, &data->verf); |
242 | } | 248 | } |
243 | 249 | ||
244 | /** | 250 | /** |
@@ -366,22 +372,10 @@ out: | |||
366 | * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust | 372 | * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust |
367 | * the iocb is still valid here if this is a synchronous request. | 373 | * the iocb is still valid here if this is a synchronous request. |
368 | */ | 374 | */ |
369 | static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) | 375 | static void nfs_direct_complete(struct nfs_direct_req *dreq) |
370 | { | 376 | { |
371 | struct inode *inode = dreq->inode; | 377 | struct inode *inode = dreq->inode; |
372 | 378 | ||
373 | if (dreq->iocb && write) { | ||
374 | loff_t pos = dreq->iocb->ki_pos + dreq->count; | ||
375 | |||
376 | spin_lock(&inode->i_lock); | ||
377 | if (i_size_read(inode) < pos) | ||
378 | i_size_write(inode, pos); | ||
379 | spin_unlock(&inode->i_lock); | ||
380 | } | ||
381 | |||
382 | if (write) | ||
383 | nfs_zap_mapping(inode, inode->i_mapping); | ||
384 | |||
385 | inode_dio_end(inode); | 379 | inode_dio_end(inode); |
386 | 380 | ||
387 | if (dreq->iocb) { | 381 | if (dreq->iocb) { |
@@ -436,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) | |||
436 | } | 430 | } |
437 | out_put: | 431 | out_put: |
438 | if (put_dreq(dreq)) | 432 | if (put_dreq(dreq)) |
439 | nfs_direct_complete(dreq, false); | 433 | nfs_direct_complete(dreq); |
440 | hdr->release(hdr); | 434 | hdr->release(hdr); |
441 | } | 435 | } |
442 | 436 | ||
@@ -542,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, | |||
542 | } | 536 | } |
543 | 537 | ||
544 | if (put_dreq(dreq)) | 538 | if (put_dreq(dreq)) |
545 | nfs_direct_complete(dreq, false); | 539 | nfs_direct_complete(dreq); |
546 | return 0; | 540 | return 0; |
547 | } | 541 | } |
548 | 542 | ||
@@ -583,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) | |||
583 | if (!count) | 577 | if (!count) |
584 | goto out; | 578 | goto out; |
585 | 579 | ||
586 | inode_lock(inode); | ||
587 | result = nfs_sync_mapping(mapping); | ||
588 | if (result) | ||
589 | goto out_unlock; | ||
590 | |||
591 | task_io_account_read(count); | 580 | task_io_account_read(count); |
592 | 581 | ||
593 | result = -ENOMEM; | 582 | result = -ENOMEM; |
594 | dreq = nfs_direct_req_alloc(); | 583 | dreq = nfs_direct_req_alloc(); |
595 | if (dreq == NULL) | 584 | if (dreq == NULL) |
596 | goto out_unlock; | 585 | goto out; |
597 | 586 | ||
598 | dreq->inode = inode; | 587 | dreq->inode = inode; |
599 | dreq->bytes_left = dreq->max_count = count; | 588 | dreq->bytes_left = dreq->max_count = count; |
@@ -608,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) | |||
608 | if (!is_sync_kiocb(iocb)) | 597 | if (!is_sync_kiocb(iocb)) |
609 | dreq->iocb = iocb; | 598 | dreq->iocb = iocb; |
610 | 599 | ||
600 | nfs_start_io_direct(inode); | ||
601 | |||
611 | NFS_I(inode)->read_io += count; | 602 | NFS_I(inode)->read_io += count; |
612 | result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); | 603 | result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); |
613 | 604 | ||
614 | inode_unlock(inode); | 605 | nfs_end_io_direct(inode); |
615 | 606 | ||
616 | if (!result) { | 607 | if (!result) { |
617 | result = nfs_direct_wait(dreq); | 608 | result = nfs_direct_wait(dreq); |
@@ -619,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) | |||
619 | iocb->ki_pos += result; | 610 | iocb->ki_pos += result; |
620 | } | 611 | } |
621 | 612 | ||
622 | nfs_direct_req_release(dreq); | ||
623 | return result; | ||
624 | |||
625 | out_release: | 613 | out_release: |
626 | nfs_direct_req_release(dreq); | 614 | nfs_direct_req_release(dreq); |
627 | out_unlock: | ||
628 | inode_unlock(inode); | ||
629 | out: | 615 | out: |
630 | return result; | 616 | return result; |
631 | } | 617 | } |
@@ -657,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) | |||
657 | nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); | 643 | nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); |
658 | 644 | ||
659 | dreq->count = 0; | 645 | dreq->count = 0; |
646 | dreq->verf.committed = NFS_INVALID_STABLE_HOW; | ||
647 | nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); | ||
660 | for (i = 0; i < dreq->mirror_count; i++) | 648 | for (i = 0; i < dreq->mirror_count; i++) |
661 | dreq->mirrors[i].count = 0; | 649 | dreq->mirrors[i].count = 0; |
662 | get_dreq(dreq); | 650 | get_dreq(dreq); |
@@ -775,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) | |||
775 | nfs_direct_write_reschedule(dreq); | 763 | nfs_direct_write_reschedule(dreq); |
776 | break; | 764 | break; |
777 | default: | 765 | default: |
778 | nfs_direct_complete(dreq, true); | 766 | nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); |
767 | nfs_direct_complete(dreq); | ||
779 | } | 768 | } |
780 | } | 769 | } |
781 | 770 | ||
@@ -991,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, | |||
991 | ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | 980 | ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) |
992 | { | 981 | { |
993 | ssize_t result = -EINVAL; | 982 | ssize_t result = -EINVAL; |
983 | size_t count; | ||
994 | struct file *file = iocb->ki_filp; | 984 | struct file *file = iocb->ki_filp; |
995 | struct address_space *mapping = file->f_mapping; | 985 | struct address_space *mapping = file->f_mapping; |
996 | struct inode *inode = mapping->host; | 986 | struct inode *inode = mapping->host; |
@@ -1001,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | |||
1001 | dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", | 991 | dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", |
1002 | file, iov_iter_count(iter), (long long) iocb->ki_pos); | 992 | file, iov_iter_count(iter), (long long) iocb->ki_pos); |
1003 | 993 | ||
1004 | nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, | 994 | result = generic_write_checks(iocb, iter); |
1005 | iov_iter_count(iter)); | 995 | if (result <= 0) |
996 | return result; | ||
997 | count = result; | ||
998 | nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); | ||
1006 | 999 | ||
1007 | pos = iocb->ki_pos; | 1000 | pos = iocb->ki_pos; |
1008 | end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; | 1001 | end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; |
1009 | 1002 | ||
1010 | inode_lock(inode); | 1003 | task_io_account_write(count); |
1011 | |||
1012 | result = nfs_sync_mapping(mapping); | ||
1013 | if (result) | ||
1014 | goto out_unlock; | ||
1015 | |||
1016 | if (mapping->nrpages) { | ||
1017 | result = invalidate_inode_pages2_range(mapping, | ||
1018 | pos >> PAGE_SHIFT, end); | ||
1019 | if (result) | ||
1020 | goto out_unlock; | ||
1021 | } | ||
1022 | |||
1023 | task_io_account_write(iov_iter_count(iter)); | ||
1024 | 1004 | ||
1025 | result = -ENOMEM; | 1005 | result = -ENOMEM; |
1026 | dreq = nfs_direct_req_alloc(); | 1006 | dreq = nfs_direct_req_alloc(); |
1027 | if (!dreq) | 1007 | if (!dreq) |
1028 | goto out_unlock; | 1008 | goto out; |
1029 | 1009 | ||
1030 | dreq->inode = inode; | 1010 | dreq->inode = inode; |
1031 | dreq->bytes_left = dreq->max_count = iov_iter_count(iter); | 1011 | dreq->bytes_left = dreq->max_count = count; |
1032 | dreq->io_start = pos; | 1012 | dreq->io_start = pos; |
1033 | dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); | 1013 | dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); |
1034 | l_ctx = nfs_get_lock_context(dreq->ctx); | 1014 | l_ctx = nfs_get_lock_context(dreq->ctx); |
@@ -1040,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | |||
1040 | if (!is_sync_kiocb(iocb)) | 1020 | if (!is_sync_kiocb(iocb)) |
1041 | dreq->iocb = iocb; | 1021 | dreq->iocb = iocb; |
1042 | 1022 | ||
1023 | nfs_start_io_direct(inode); | ||
1024 | |||
1043 | result = nfs_direct_write_schedule_iovec(dreq, iter, pos); | 1025 | result = nfs_direct_write_schedule_iovec(dreq, iter, pos); |
1044 | 1026 | ||
1045 | if (mapping->nrpages) { | 1027 | if (mapping->nrpages) { |
@@ -1047,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | |||
1047 | pos >> PAGE_SHIFT, end); | 1029 | pos >> PAGE_SHIFT, end); |
1048 | } | 1030 | } |
1049 | 1031 | ||
1050 | inode_unlock(inode); | 1032 | nfs_end_io_direct(inode); |
1051 | 1033 | ||
1052 | if (!result) { | 1034 | if (!result) { |
1053 | result = nfs_direct_wait(dreq); | 1035 | result = nfs_direct_wait(dreq); |
1054 | if (result > 0) { | 1036 | if (result > 0) { |
1055 | struct inode *inode = mapping->host; | ||
1056 | |||
1057 | iocb->ki_pos = pos + result; | 1037 | iocb->ki_pos = pos + result; |
1058 | spin_lock(&inode->i_lock); | ||
1059 | if (i_size_read(inode) < iocb->ki_pos) | ||
1060 | i_size_write(inode, iocb->ki_pos); | ||
1061 | spin_unlock(&inode->i_lock); | ||
1062 | |||
1063 | /* XXX: should check the generic_write_sync retval */ | 1038 | /* XXX: should check the generic_write_sync retval */ |
1064 | generic_write_sync(iocb, result); | 1039 | generic_write_sync(iocb, result); |
1065 | } | 1040 | } |
1066 | } | 1041 | } |
1067 | nfs_direct_req_release(dreq); | ||
1068 | return result; | ||
1069 | |||
1070 | out_release: | 1042 | out_release: |
1071 | nfs_direct_req_release(dreq); | 1043 | nfs_direct_req_release(dreq); |
1072 | out_unlock: | 1044 | out: |
1073 | inode_unlock(inode); | ||
1074 | return result; | 1045 | return result; |
1075 | } | 1046 | } |
1076 | 1047 | ||
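The direct.c changes above replace the inode_lock() taken around O_DIRECT submission with the new nfs_start_io_direct()/nfs_end_io_direct() helpers, which is what lets direct reads and writes proceed without serialising against each other while still being fenced from buffered I/O. One way to picture that split is a readers/writer lock where direct I/O takes the shared side and buffered writers take the exclusive side; the sketch below uses a pthread rwlock purely as an illustration and is not the kernel's actual io.c implementation.

```c
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t io_lock = PTHREAD_RWLOCK_INITIALIZER;

/* O_DIRECT read or write: shared, so many can be in flight at once */
static void start_io_direct(void) { pthread_rwlock_rdlock(&io_lock); }
static void end_io_direct(void)   { pthread_rwlock_unlock(&io_lock); }

/* Buffered write: exclusive, fences out all in-flight direct I/O */
static void start_io_write(void)  { pthread_rwlock_wrlock(&io_lock); }
static void end_io_write(void)    { pthread_rwlock_unlock(&io_lock); }

int main(void)
{
	start_io_direct();
	start_io_direct();	/* a second direct request does not block */
	end_io_direct();
	end_io_direct();

	start_io_write();	/* buffered write waits for direct I/O to drain */
	printf("buffered write has exclusive access\n");
	end_io_write();
	return 0;
}
```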
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 717a8d6af52d..7d620970f2e1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) | |||
170 | iocb->ki_filp, | 170 | iocb->ki_filp, |
171 | iov_iter_count(to), (unsigned long) iocb->ki_pos); | 171 | iov_iter_count(to), (unsigned long) iocb->ki_pos); |
172 | 172 | ||
173 | result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); | 173 | nfs_start_io_read(inode); |
174 | result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); | ||
174 | if (!result) { | 175 | if (!result) { |
175 | result = generic_file_read_iter(iocb, to); | 176 | result = generic_file_read_iter(iocb, to); |
176 | if (result > 0) | 177 | if (result > 0) |
177 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); | 178 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); |
178 | } | 179 | } |
180 | nfs_end_io_read(inode); | ||
179 | return result; | 181 | return result; |
180 | } | 182 | } |
181 | EXPORT_SYMBOL_GPL(nfs_file_read); | 183 | EXPORT_SYMBOL_GPL(nfs_file_read); |
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, | |||
191 | dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", | 193 | dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", |
192 | filp, (unsigned long) count, (unsigned long long) *ppos); | 194 | filp, (unsigned long) count, (unsigned long long) *ppos); |
193 | 195 | ||
194 | res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); | 196 | nfs_start_io_read(inode); |
197 | res = nfs_revalidate_mapping(inode, filp->f_mapping); | ||
195 | if (!res) { | 198 | if (!res) { |
196 | res = generic_file_splice_read(filp, ppos, pipe, count, flags); | 199 | res = generic_file_splice_read(filp, ppos, pipe, count, flags); |
197 | if (res > 0) | 200 | if (res > 0) |
198 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); | 201 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); |
199 | } | 202 | } |
203 | nfs_end_io_read(inode); | ||
200 | return res; | 204 | return res; |
201 | } | 205 | } |
202 | EXPORT_SYMBOL_GPL(nfs_file_splice_read); | 206 | EXPORT_SYMBOL_GPL(nfs_file_splice_read); |
@@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) | |||
272 | 276 | ||
273 | trace_nfs_fsync_enter(inode); | 277 | trace_nfs_fsync_enter(inode); |
274 | 278 | ||
275 | inode_dio_wait(inode); | ||
276 | do { | 279 | do { |
277 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 280 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); |
278 | if (ret != 0) | 281 | if (ret != 0) |
279 | break; | 282 | break; |
280 | inode_lock(inode); | ||
281 | ret = nfs_file_fsync_commit(file, start, end, datasync); | 283 | ret = nfs_file_fsync_commit(file, start, end, datasync); |
282 | if (!ret) | 284 | if (!ret) |
283 | ret = pnfs_sync_inode(inode, !!datasync); | 285 | ret = pnfs_sync_inode(inode, !!datasync); |
284 | inode_unlock(inode); | ||
285 | /* | 286 | /* |
286 | * If nfs_file_fsync_commit detected a server reboot, then | 287 | * If nfs_file_fsync_commit detected a server reboot, then |
287 | * resend all dirty pages that might have been covered by | 288 | * resend all dirty pages that might have been covered by |
@@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, | |||
359 | file, mapping->host->i_ino, len, (long long) pos); | 360 | file, mapping->host->i_ino, len, (long long) pos); |
360 | 361 | ||
361 | start: | 362 | start: |
362 | /* | ||
363 | * Prevent starvation issues if someone is doing a consistency | ||
364 | * sync-to-disk | ||
365 | */ | ||
366 | ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, | ||
367 | nfs_wait_bit_killable, TASK_KILLABLE); | ||
368 | if (ret) | ||
369 | return ret; | ||
370 | /* | ||
371 | * Wait for O_DIRECT to complete | ||
372 | */ | ||
373 | inode_dio_wait(mapping->host); | ||
374 | |||
375 | page = grab_cache_page_write_begin(mapping, index, flags); | 363 | page = grab_cache_page_write_begin(mapping, index, flags); |
376 | if (!page) | 364 | if (!page) |
377 | return -ENOMEM; | 365 | return -ENOMEM; |
@@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, | |||
432 | return status; | 420 | return status; |
433 | NFS_I(mapping->host)->write_io += copied; | 421 | NFS_I(mapping->host)->write_io += copied; |
434 | 422 | ||
435 | if (nfs_ctx_key_to_expire(ctx)) { | 423 | if (nfs_ctx_key_to_expire(ctx, mapping->host)) { |
436 | status = nfs_wb_all(mapping->host); | 424 | status = nfs_wb_all(mapping->host); |
437 | if (status < 0) | 425 | if (status < 0) |
438 | return status; | 426 | return status; |
@@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, | |||
470 | */ | 458 | */ |
471 | static int nfs_release_page(struct page *page, gfp_t gfp) | 459 | static int nfs_release_page(struct page *page, gfp_t gfp) |
472 | { | 460 | { |
473 | struct address_space *mapping = page->mapping; | ||
474 | |||
475 | dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); | 461 | dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); |
476 | 462 | ||
477 | /* Always try to initiate a 'commit' if relevant, but only | ||
478 | * wait for it if the caller allows blocking. Even then, | ||
479 | * only wait 1 second and only if the 'bdi' is not congested. | ||
480 | * Waiting indefinitely can cause deadlocks when the NFS | ||
481 | * server is on this machine, when a new TCP connection is | ||
482 | * needed and in other rare cases. There is no particular | ||
483 | * need to wait extensively here. A short wait has the | ||
484 | * benefit that someone else can worry about the freezer. | ||
485 | */ | ||
486 | if (mapping) { | ||
487 | struct nfs_server *nfss = NFS_SERVER(mapping->host); | ||
488 | nfs_commit_inode(mapping->host, 0); | ||
489 | if (gfpflags_allow_blocking(gfp) && | ||
490 | !bdi_write_congested(&nfss->backing_dev_info)) { | ||
491 | wait_on_page_bit_killable_timeout(page, PG_private, | ||
492 | HZ); | ||
493 | if (PagePrivate(page)) | ||
494 | set_bdi_congested(&nfss->backing_dev_info, | ||
495 | BLK_RW_ASYNC); | ||
496 | } | ||
497 | } | ||
498 | /* If PagePrivate() is set, then the page is not freeable */ | 463 | /* If PagePrivate() is set, then the page is not freeable */ |
499 | if (PagePrivate(page)) | 464 | if (PagePrivate(page)) |
500 | return 0; | 465 | return 0; |
@@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
604 | filp, filp->f_mapping->host->i_ino, | 569 | filp, filp->f_mapping->host->i_ino, |
605 | (long long)page_offset(page)); | 570 | (long long)page_offset(page)); |
606 | 571 | ||
572 | sb_start_pagefault(inode->i_sb); | ||
573 | |||
607 | /* make sure the cache has finished storing the page */ | 574 | /* make sure the cache has finished storing the page */ |
608 | nfs_fscache_wait_on_page_write(NFS_I(inode), page); | 575 | nfs_fscache_wait_on_page_write(NFS_I(inode), page); |
609 | 576 | ||
@@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
630 | out_unlock: | 597 | out_unlock: |
631 | unlock_page(page); | 598 | unlock_page(page); |
632 | out: | 599 | out: |
600 | sb_end_pagefault(inode->i_sb); | ||
633 | return ret; | 601 | return ret; |
634 | } | 602 | } |
635 | 603 | ||
@@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode) | |||
645 | 613 | ||
646 | ctx = nfs_file_open_context(filp); | 614 | ctx = nfs_file_open_context(filp); |
647 | if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || | 615 | if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || |
648 | nfs_ctx_key_to_expire(ctx)) | 616 | nfs_ctx_key_to_expire(ctx, inode)) |
649 | return 1; | 617 | return 1; |
650 | return 0; | 618 | return 0; |
651 | } | 619 | } |
@@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) | |||
656 | struct inode *inode = file_inode(file); | 624 | struct inode *inode = file_inode(file); |
657 | unsigned long written = 0; | 625 | unsigned long written = 0; |
658 | ssize_t result; | 626 | ssize_t result; |
659 | size_t count = iov_iter_count(from); | ||
660 | 627 | ||
661 | result = nfs_key_timeout_notify(file, inode); | 628 | result = nfs_key_timeout_notify(file, inode); |
662 | if (result) | 629 | if (result) |
663 | return result; | 630 | return result; |
664 | 631 | ||
665 | if (iocb->ki_flags & IOCB_DIRECT) { | 632 | if (iocb->ki_flags & IOCB_DIRECT) |
666 | result = generic_write_checks(iocb, from); | ||
667 | if (result <= 0) | ||
668 | return result; | ||
669 | return nfs_file_direct_write(iocb, from); | 633 | return nfs_file_direct_write(iocb, from); |
670 | } | ||
671 | 634 | ||
672 | dprintk("NFS: write(%pD2, %zu@%Ld)\n", | 635 | dprintk("NFS: write(%pD2, %zu@%Ld)\n", |
673 | file, count, (long long) iocb->ki_pos); | 636 | file, iov_iter_count(from), (long long) iocb->ki_pos); |
674 | 637 | ||
675 | result = -EBUSY; | ||
676 | if (IS_SWAPFILE(inode)) | 638 | if (IS_SWAPFILE(inode)) |
677 | goto out_swapfile; | 639 | goto out_swapfile; |
678 | /* | 640 | /* |
@@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) | |||
684 | goto out; | 646 | goto out; |
685 | } | 647 | } |
686 | 648 | ||
687 | result = count; | 649 | nfs_start_io_write(inode); |
688 | if (!count) | 650 | result = generic_write_checks(iocb, from); |
651 | if (result > 0) { | ||
652 | current->backing_dev_info = inode_to_bdi(inode); | ||
653 | result = generic_perform_write(file, from, iocb->ki_pos); | ||
654 | current->backing_dev_info = NULL; | ||
655 | } | ||
656 | nfs_end_io_write(inode); | ||
657 | if (result <= 0) | ||
689 | goto out; | 658 | goto out; |
690 | 659 | ||
691 | result = generic_file_write_iter(iocb, from); | 660 | written = generic_write_sync(iocb, result); |
692 | if (result > 0) | 661 | iocb->ki_pos += written; |
693 | written = result; | ||
694 | 662 | ||
695 | /* Return error values */ | 663 | /* Return error values */ |
696 | if (result >= 0 && nfs_need_check_write(file, inode)) { | 664 | if (nfs_need_check_write(file, inode)) { |
697 | int err = vfs_fsync(file, 0); | 665 | int err = vfs_fsync(file, 0); |
698 | if (err < 0) | 666 | if (err < 0) |
699 | result = err; | 667 | result = err; |
700 | } | 668 | } |
701 | if (result > 0) | 669 | nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); |
702 | nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); | ||
703 | out: | 670 | out: |
704 | return result; | 671 | return result; |
705 | 672 | ||
706 | out_swapfile: | 673 | out_swapfile: |
707 | printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); | 674 | printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); |
708 | goto out; | 675 | return -EBUSY; |
709 | } | 676 | } |
710 | EXPORT_SYMBOL_GPL(nfs_file_write); | 677 | EXPORT_SYMBOL_GPL(nfs_file_write); |
711 | 678 | ||
@@ -780,11 +747,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) | |||
780 | } | 747 | } |
781 | 748 | ||
782 | static int | 749 | static int |
783 | is_time_granular(struct timespec *ts) { | ||
784 | return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); | ||
785 | } | ||
786 | |||
787 | static int | ||
788 | do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) | 750 | do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) |
789 | { | 751 | { |
790 | struct inode *inode = filp->f_mapping->host; | 752 | struct inode *inode = filp->f_mapping->host; |
@@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) | |||
817 | * This makes locking act as a cache coherency point. | 779 | * This makes locking act as a cache coherency point. |
818 | */ | 780 | */ |
819 | nfs_sync_mapping(filp->f_mapping); | 781 | nfs_sync_mapping(filp->f_mapping); |
820 | if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { | 782 | if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) |
821 | if (is_time_granular(&NFS_SERVER(inode)->time_delta)) | 783 | nfs_zap_mapping(inode, filp->f_mapping); |
822 | __nfs_revalidate_inode(NFS_SERVER(inode), inode); | ||
823 | else | ||
824 | nfs_zap_caches(inode); | ||
825 | } | ||
826 | out: | 784 | out: |
827 | return status; | 785 | return status; |
828 | } | 786 | } |
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index aa59757389dc..a3fc48ba4931 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c | |||
@@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task, | |||
255 | static void | 255 | static void |
256 | filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) | 256 | filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) |
257 | { | 257 | { |
258 | loff_t end_offs = 0; | ||
258 | 259 | ||
259 | if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || | 260 | if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || |
260 | hdr->res.verf->committed != NFS_DATA_SYNC) | 261 | hdr->res.verf->committed == NFS_FILE_SYNC) |
261 | return; | 262 | return; |
263 | if (hdr->res.verf->committed == NFS_DATA_SYNC) | ||
264 | end_offs = hdr->mds_offset + (loff_t)hdr->res.count; | ||
262 | 265 | ||
263 | pnfs_set_layoutcommit(hdr->inode, hdr->lseg, | 266 | /* Note: if the write is unstable, don't set end_offs until commit */ |
264 | hdr->mds_offset + hdr->res.count); | 267 | pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); |
265 | dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, | 268 | dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, |
266 | (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); | 269 | (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); |
267 | } | 270 | } |
@@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task, | |||
354 | } | 357 | } |
355 | 358 | ||
356 | filelayout_set_layoutcommit(hdr); | 359 | filelayout_set_layoutcommit(hdr); |
360 | |||
361 | /* zero out the fattr */ | ||
362 | hdr->fattr.valid = 0; | ||
363 | if (task->tk_status >= 0) | ||
364 | nfs_writeback_update_inode(hdr); | ||
365 | |||
357 | return 0; | 366 | return 0; |
358 | } | 367 | } |
359 | 368 | ||
@@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, | |||
375 | return -EAGAIN; | 384 | return -EAGAIN; |
376 | } | 385 | } |
377 | 386 | ||
378 | if (data->verf.committed == NFS_UNSTABLE) | 387 | pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); |
379 | pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); | ||
380 | 388 | ||
381 | return 0; | 389 | return 0; |
382 | } | 390 | } |
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0e8018bc9880..e6206eaf2bdf 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c | |||
@@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) | |||
1325 | * we always send layoutcommit after DS writes. | 1325 | * we always send layoutcommit after DS writes. |
1326 | */ | 1326 | */ |
1327 | static void | 1327 | static void |
1328 | ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) | 1328 | ff_layout_set_layoutcommit(struct inode *inode, |
1329 | struct pnfs_layout_segment *lseg, | ||
1330 | loff_t end_offset) | ||
1329 | { | 1331 | { |
1330 | if (!ff_layout_need_layoutcommit(hdr->lseg)) | 1332 | if (!ff_layout_need_layoutcommit(lseg)) |
1331 | return; | 1333 | return; |
1332 | 1334 | ||
1333 | pnfs_set_layoutcommit(hdr->inode, hdr->lseg, | 1335 | pnfs_set_layoutcommit(inode, lseg, end_offset); |
1334 | hdr->mds_offset + hdr->res.count); | 1336 | dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, |
1335 | dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, | 1337 | (unsigned long long) NFS_I(inode)->layout->plh_lwb); |
1336 | (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); | ||
1337 | } | 1338 | } |
1338 | 1339 | ||
1339 | static bool | 1340 | static bool |
@@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data) | |||
1469 | static int ff_layout_write_done_cb(struct rpc_task *task, | 1470 | static int ff_layout_write_done_cb(struct rpc_task *task, |
1470 | struct nfs_pgio_header *hdr) | 1471 | struct nfs_pgio_header *hdr) |
1471 | { | 1472 | { |
1473 | loff_t end_offs = 0; | ||
1472 | int err; | 1474 | int err; |
1473 | 1475 | ||
1474 | trace_nfs4_pnfs_write(hdr, task->tk_status); | 1476 | trace_nfs4_pnfs_write(hdr, task->tk_status); |
@@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task, | |||
1494 | 1496 | ||
1495 | if (hdr->res.verf->committed == NFS_FILE_SYNC || | 1497 | if (hdr->res.verf->committed == NFS_FILE_SYNC || |
1496 | hdr->res.verf->committed == NFS_DATA_SYNC) | 1498 | hdr->res.verf->committed == NFS_DATA_SYNC) |
1497 | ff_layout_set_layoutcommit(hdr); | 1499 | end_offs = hdr->mds_offset + (loff_t)hdr->res.count; |
1500 | |||
1501 | /* Note: if the write is unstable, don't set end_offs until commit */ | ||
1502 | ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); | ||
1498 | 1503 | ||
1499 | /* zero out fattr since we don't care about DS attrs at all */ | 1504 | /* zero out fattr since we don't care about DS attrs at all */ |
1500 | hdr->fattr.valid = 0; | 1505 | hdr->fattr.valid = 0; |
@@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, | |||
1530 | return -EAGAIN; | 1535 | return -EAGAIN; |
1531 | } | 1536 | } |
1532 | 1537 | ||
1533 | if (data->verf.committed == NFS_UNSTABLE | 1538 | ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); |
1534 | && ff_layout_need_layoutcommit(data->lseg)) | ||
1535 | pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); | ||
1536 | 1539 | ||
1537 | return 0; | 1540 | return 0; |
1538 | } | 1541 | } |
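As an illustration of the new end_offs handling above (the numbers are hypothetical, not from the commit): a flexfiles DS write of hdr->res.count = 4096 bytes at hdr->mds_offset = 8192 that the DS answers with NFS_DATA_SYNC yields end_offs = 8192 + 4096 = 12288, so pnfs_set_layoutcommit() can advance plh_lwb immediately; an NFS_UNSTABLE reply leaves end_offs at 0, and the layoutcommit boundary is only advanced later from data->lwb in the commit-done callbacks.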
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index dda689d7a8a7..bf4ec5ecc97e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
@@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) | |||
662 | trace_nfs_getattr_enter(inode); | 662 | trace_nfs_getattr_enter(inode); |
663 | /* Flush out writes to the server in order to update c/mtime. */ | 663 | /* Flush out writes to the server in order to update c/mtime. */ |
664 | if (S_ISREG(inode->i_mode)) { | 664 | if (S_ISREG(inode->i_mode)) { |
665 | inode_lock(inode); | 665 | err = filemap_write_and_wait(inode->i_mapping); |
666 | err = nfs_sync_inode(inode); | ||
667 | inode_unlock(inode); | ||
668 | if (err) | 666 | if (err) |
669 | goto out; | 667 | goto out; |
670 | } | 668 | } |
@@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) | |||
879 | struct nfs_inode *nfsi = NFS_I(inode); | 877 | struct nfs_inode *nfsi = NFS_I(inode); |
880 | 878 | ||
881 | spin_lock(&inode->i_lock); | 879 | spin_lock(&inode->i_lock); |
882 | list_add(&ctx->list, &nfsi->open_files); | 880 | if (ctx->mode & FMODE_WRITE) |
881 | list_add(&ctx->list, &nfsi->open_files); | ||
882 | else | ||
883 | list_add_tail(&ctx->list, &nfsi->open_files); | ||
883 | spin_unlock(&inode->i_lock); | 884 | spin_unlock(&inode->i_lock); |
884 | } | 885 | } |
885 | EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); | 886 | EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); |
@@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) | |||
972 | if (NFS_STALE(inode)) | 973 | if (NFS_STALE(inode)) |
973 | goto out; | 974 | goto out; |
974 | 975 | ||
976 | /* pNFS: Attributes aren't updated until we layoutcommit */ | ||
977 | if (S_ISREG(inode->i_mode)) { | ||
978 | status = pnfs_sync_inode(inode, false); | ||
979 | if (status) | ||
980 | goto out; | ||
981 | } | ||
982 | |||
975 | status = -ENOMEM; | 983 | status = -ENOMEM; |
976 | fattr = nfs_alloc_fattr(); | 984 | fattr = nfs_alloc_fattr(); |
977 | if (fattr == NULL) | 985 | if (fattr == NULL) |
@@ -1122,14 +1130,12 @@ out: | |||
1122 | } | 1130 | } |
1123 | 1131 | ||
1124 | /** | 1132 | /** |
1125 | * __nfs_revalidate_mapping - Revalidate the pagecache | 1133 | * nfs_revalidate_mapping - Revalidate the pagecache |
1126 | * @inode - pointer to host inode | 1134 | * @inode - pointer to host inode |
1127 | * @mapping - pointer to mapping | 1135 | * @mapping - pointer to mapping |
1128 | * @may_lock - take inode->i_mutex? | ||
1129 | */ | 1136 | */ |
1130 | static int __nfs_revalidate_mapping(struct inode *inode, | 1137 | int nfs_revalidate_mapping(struct inode *inode, |
1131 | struct address_space *mapping, | 1138 | struct address_space *mapping) |
1132 | bool may_lock) | ||
1133 | { | 1139 | { |
1134 | struct nfs_inode *nfsi = NFS_I(inode); | 1140 | struct nfs_inode *nfsi = NFS_I(inode); |
1135 | unsigned long *bitlock = &nfsi->flags; | 1141 | unsigned long *bitlock = &nfsi->flags; |
@@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode, | |||
1178 | nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; | 1184 | nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; |
1179 | spin_unlock(&inode->i_lock); | 1185 | spin_unlock(&inode->i_lock); |
1180 | trace_nfs_invalidate_mapping_enter(inode); | 1186 | trace_nfs_invalidate_mapping_enter(inode); |
1181 | if (may_lock) { | 1187 | ret = nfs_invalidate_mapping(inode, mapping); |
1182 | inode_lock(inode); | ||
1183 | ret = nfs_invalidate_mapping(inode, mapping); | ||
1184 | inode_unlock(inode); | ||
1185 | } else | ||
1186 | ret = nfs_invalidate_mapping(inode, mapping); | ||
1187 | trace_nfs_invalidate_mapping_exit(inode, ret); | 1188 | trace_nfs_invalidate_mapping_exit(inode, ret); |
1188 | 1189 | ||
1189 | clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); | 1190 | clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); |
@@ -1193,27 +1194,28 @@ out: | |||
1193 | return ret; | 1194 | return ret; |
1194 | } | 1195 | } |
1195 | 1196 | ||
1196 | /** | 1197 | static bool nfs_file_has_writers(struct nfs_inode *nfsi) |
1197 | * nfs_revalidate_mapping - Revalidate the pagecache | ||
1198 | * @inode - pointer to host inode | ||
1199 | * @mapping - pointer to mapping | ||
1200 | */ | ||
1201 | int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) | ||
1202 | { | 1198 | { |
1203 | return __nfs_revalidate_mapping(inode, mapping, false); | 1199 | struct inode *inode = &nfsi->vfs_inode; |
1200 | |||
1201 | assert_spin_locked(&inode->i_lock); | ||
1202 | |||
1203 | if (!S_ISREG(inode->i_mode)) | ||
1204 | return false; | ||
1205 | if (list_empty(&nfsi->open_files)) | ||
1206 | return false; | ||
1207 | /* Note: This relies on nfsi->open_files being ordered with writers | ||
1208 | * being placed at the head of the list. | ||
1209 | * See nfs_inode_attach_open_context() | ||
1210 | */ | ||
1211 | return (list_first_entry(&nfsi->open_files, | ||
1212 | struct nfs_open_context, | ||
1213 | list)->mode & FMODE_WRITE) == FMODE_WRITE; | ||
1204 | } | 1214 | } |
1205 | 1215 | ||
1206 | /** | 1216 | static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) |
1207 | * nfs_revalidate_mapping_protected - Revalidate the pagecache | ||
1208 | * @inode - pointer to host inode | ||
1209 | * @mapping - pointer to mapping | ||
1210 | * | ||
1211 | * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex | ||
1212 | * while invalidating the mapping. | ||
1213 | */ | ||
1214 | int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) | ||
1215 | { | 1217 | { |
1216 | return __nfs_revalidate_mapping(inode, mapping, true); | 1218 | return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); |
1217 | } | 1219 | } |
1218 | 1220 | ||
1219 | static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) | 1221 | static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) |
@@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat | |||
1280 | if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) | 1282 | if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) |
1281 | return -EIO; | 1283 | return -EIO; |
1282 | 1284 | ||
1283 | if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && | 1285 | if (!nfs_file_has_buffered_writers(nfsi)) { |
1284 | inode->i_version != fattr->change_attr) | 1286 | /* Verify a few of the more important attributes */ |
1285 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; | 1287 | if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) |
1288 | invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; | ||
1286 | 1289 | ||
1287 | /* Verify a few of the more important attributes */ | 1290 | if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) |
1288 | if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) | 1291 | invalid |= NFS_INO_INVALID_ATTR; |
1289 | invalid |= NFS_INO_INVALID_ATTR; | ||
1290 | 1292 | ||
1291 | if (fattr->valid & NFS_ATTR_FATTR_SIZE) { | 1293 | if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) |
1292 | cur_size = i_size_read(inode); | 1294 | invalid |= NFS_INO_INVALID_ATTR; |
1293 | new_isize = nfs_size_to_loff_t(fattr->size); | 1295 | |
1294 | if (cur_size != new_isize) | 1296 | if (fattr->valid & NFS_ATTR_FATTR_SIZE) { |
1295 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; | 1297 | cur_size = i_size_read(inode); |
1298 | new_isize = nfs_size_to_loff_t(fattr->size); | ||
1299 | if (cur_size != new_isize) | ||
1300 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; | ||
1301 | } | ||
1296 | } | 1302 | } |
1297 | if (nfsi->nrequests != 0) | ||
1298 | invalid &= ~NFS_INO_REVAL_PAGECACHE; | ||
1299 | 1303 | ||
1300 | /* Have any file permissions changed? */ | 1304 | /* Have any file permissions changed? */ |
1301 | if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) | 1305 | if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) |
@@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n | |||
1470 | ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); | 1474 | ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); |
1471 | } | 1475 | } |
1472 | 1476 | ||
1473 | /* | ||
1474 | * Don't trust the change_attribute, mtime, ctime or size if | ||
1475 | * a pnfs LAYOUTCOMMIT is outstanding | ||
1476 | */ | ||
1477 | static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, | ||
1478 | struct nfs_fattr *fattr) | ||
1479 | { | ||
1480 | if (pnfs_layoutcommit_outstanding(inode)) | ||
1481 | fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | | ||
1482 | NFS_ATTR_FATTR_MTIME | | ||
1483 | NFS_ATTR_FATTR_CTIME | | ||
1484 | NFS_ATTR_FATTR_SIZE); | ||
1485 | } | ||
1486 | |||
1487 | static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) | 1477 | static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) |
1488 | { | 1478 | { |
1489 | int ret; | 1479 | int ret; |
1490 | 1480 | ||
1491 | trace_nfs_refresh_inode_enter(inode); | 1481 | trace_nfs_refresh_inode_enter(inode); |
1492 | 1482 | ||
1493 | nfs_inode_attrs_handle_layoutcommit(inode, fattr); | ||
1494 | |||
1495 | if (nfs_inode_attrs_need_update(inode, fattr)) | 1483 | if (nfs_inode_attrs_need_update(inode, fattr)) |
1496 | ret = nfs_update_inode(inode, fattr); | 1484 | ret = nfs_update_inode(inode, fattr); |
1497 | else | 1485 | else |
@@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); | |||
1527 | 1515 | ||
1528 | static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) | 1516 | static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) |
1529 | { | 1517 | { |
1530 | unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; | 1518 | unsigned long invalid = NFS_INO_INVALID_ATTR; |
1531 | 1519 | ||
1532 | /* | 1520 | /* |
1533 | * Don't revalidate the pagecache if we hold a delegation, but do | 1521 | * Don't revalidate the pagecache if we hold a delegation, but do |
@@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1676 | unsigned long invalid = 0; | 1664 | unsigned long invalid = 0; |
1677 | unsigned long now = jiffies; | 1665 | unsigned long now = jiffies; |
1678 | unsigned long save_cache_validity; | 1666 | unsigned long save_cache_validity; |
1667 | bool have_writers = nfs_file_has_buffered_writers(nfsi); | ||
1679 | bool cache_revalidated = true; | 1668 | bool cache_revalidated = true; |
1680 | 1669 | ||
1681 | dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", | 1670 | dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", |
@@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1725 | /* Do atomic weak cache consistency updates */ | 1714 | /* Do atomic weak cache consistency updates */ |
1726 | invalid |= nfs_wcc_update_inode(inode, fattr); | 1715 | invalid |= nfs_wcc_update_inode(inode, fattr); |
1727 | 1716 | ||
1717 | if (pnfs_layoutcommit_outstanding(inode)) { | ||
1718 | nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; | ||
1719 | cache_revalidated = false; | ||
1720 | } | ||
1721 | |||
1728 | /* More cache consistency checks */ | 1722 | /* More cache consistency checks */ |
1729 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { | 1723 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { |
1730 | if (inode->i_version != fattr->change_attr) { | 1724 | if (inode->i_version != fattr->change_attr) { |
1731 | dprintk("NFS: change_attr change on server for file %s/%ld\n", | 1725 | dprintk("NFS: change_attr change on server for file %s/%ld\n", |
1732 | inode->i_sb->s_id, inode->i_ino); | 1726 | inode->i_sb->s_id, inode->i_ino); |
1733 | invalid |= NFS_INO_INVALID_ATTR | 1727 | /* Could it be a race with writeback? */ |
1734 | | NFS_INO_INVALID_DATA | 1728 | if (!have_writers) { |
1735 | | NFS_INO_INVALID_ACCESS | 1729 | invalid |= NFS_INO_INVALID_ATTR |
1736 | | NFS_INO_INVALID_ACL; | 1730 | | NFS_INO_INVALID_DATA |
1737 | if (S_ISDIR(inode->i_mode)) | 1731 | | NFS_INO_INVALID_ACCESS |
1738 | nfs_force_lookup_revalidate(inode); | 1732 | | NFS_INO_INVALID_ACL; |
1733 | if (S_ISDIR(inode->i_mode)) | ||
1734 | nfs_force_lookup_revalidate(inode); | ||
1735 | } | ||
1739 | inode->i_version = fattr->change_attr; | 1736 | inode->i_version = fattr->change_attr; |
1740 | } | 1737 | } |
1741 | } else { | 1738 | } else { |
@@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1768 | if (new_isize != cur_isize) { | 1765 | if (new_isize != cur_isize) { |
1769 | /* Do we perhaps have any outstanding writes, or has | 1766 | /* Do we perhaps have any outstanding writes, or has |
1770 | * the file grown beyond our last write? */ | 1767 | * the file grown beyond our last write? */ |
1771 | if ((nfsi->nrequests == 0) || new_isize > cur_isize) { | 1768 | if (nfsi->nrequests == 0 || new_isize > cur_isize) { |
1772 | i_size_write(inode, new_isize); | 1769 | i_size_write(inode, new_isize); |
1773 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; | 1770 | if (!have_writers) |
1771 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; | ||
1774 | } | 1772 | } |
1775 | dprintk("NFS: isize change on server for file %s/%ld " | 1773 | dprintk("NFS: isize change on server for file %s/%ld " |
1776 | "(%Ld to %Ld)\n", | 1774 | "(%Ld to %Ld)\n", |
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ea04d87fc65..7ce5e023c3c3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
@@ -66,13 +66,16 @@ struct nfs_clone_mount { | |||
66 | 66 | ||
67 | struct nfs_client_initdata { | 67 | struct nfs_client_initdata { |
68 | unsigned long init_flags; | 68 | unsigned long init_flags; |
69 | const char *hostname; | 69 | const char *hostname; /* Hostname of the server */ |
70 | const struct sockaddr *addr; | 70 | const struct sockaddr *addr; /* Address of the server */ |
71 | const char *nodename; /* Hostname of the client */ | ||
72 | const char *ip_addr; /* IP address of the client */ | ||
71 | size_t addrlen; | 73 | size_t addrlen; |
72 | struct nfs_subversion *nfs_mod; | 74 | struct nfs_subversion *nfs_mod; |
73 | int proto; | 75 | int proto; |
74 | u32 minorversion; | 76 | u32 minorversion; |
75 | struct net *net; | 77 | struct net *net; |
78 | const struct rpc_timeout *timeparms; | ||
76 | }; | 79 | }; |
77 | 80 | ||
78 | /* | 81 | /* |
@@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info); | |||
147 | extern const struct rpc_program nfs_program; | 150 | extern const struct rpc_program nfs_program; |
148 | extern void nfs_clients_init(struct net *net); | 151 | extern void nfs_clients_init(struct net *net); |
149 | extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); | 152 | extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); |
150 | int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); | 153 | int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); |
151 | struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, | 154 | struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, |
152 | const struct rpc_timeout *, const char *, | ||
153 | rpc_authflavor_t); | 155 | rpc_authflavor_t); |
154 | int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); | 156 | int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); |
155 | void nfs_server_insert_lists(struct nfs_server *); | 157 | void nfs_server_insert_lists(struct nfs_server *); |
@@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, | |||
184 | rpc_authflavor_t); | 186 | rpc_authflavor_t); |
185 | extern int nfs_wait_client_init_complete(const struct nfs_client *clp); | 187 | extern int nfs_wait_client_init_complete(const struct nfs_client *clp); |
186 | extern void nfs_mark_client_ready(struct nfs_client *clp, int state); | 188 | extern void nfs_mark_client_ready(struct nfs_client *clp, int state); |
187 | extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, | 189 | extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, |
188 | const struct sockaddr *ds_addr, | 190 | const struct sockaddr *ds_addr, |
189 | int ds_addrlen, int ds_proto, | 191 | int ds_addrlen, int ds_proto, |
190 | unsigned int ds_timeo, | 192 | unsigned int ds_timeo, |
@@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, | |||
193 | rpc_authflavor_t au_flavor); | 195 | rpc_authflavor_t au_flavor); |
194 | extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, | 196 | extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, |
195 | struct inode *); | 197 | struct inode *); |
196 | extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, | 198 | extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, |
197 | const struct sockaddr *ds_addr, int ds_addrlen, | 199 | const struct sockaddr *ds_addr, int ds_addrlen, |
198 | int ds_proto, unsigned int ds_timeo, | 200 | int ds_proto, unsigned int ds_timeo, |
199 | unsigned int ds_retrans, rpc_authflavor_t au_flavor); | 201 | unsigned int ds_retrans, rpc_authflavor_t au_flavor); |
@@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) | |||
338 | /* proc.c */ | 340 | /* proc.c */ |
339 | void nfs_close_context(struct nfs_open_context *ctx, int is_sync); | 341 | void nfs_close_context(struct nfs_open_context *ctx, int is_sync); |
340 | extern struct nfs_client *nfs_init_client(struct nfs_client *clp, | 342 | extern struct nfs_client *nfs_init_client(struct nfs_client *clp, |
341 | const struct rpc_timeout *timeparms, | 343 | const struct nfs_client_initdata *); |
342 | const char *ip_addr); | ||
343 | 344 | ||
344 | /* dir.c */ | 345 | /* dir.c */ |
345 | extern void nfs_force_use_readdirplus(struct inode *dir); | 346 | extern void nfs_force_use_readdirplus(struct inode *dir); |
@@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void); | |||
411 | extern bool nfs_sb_active(struct super_block *sb); | 412 | extern bool nfs_sb_active(struct super_block *sb); |
412 | extern void nfs_sb_deactive(struct super_block *sb); | 413 | extern void nfs_sb_deactive(struct super_block *sb); |
413 | 414 | ||
415 | /* io.c */ | ||
416 | extern void nfs_start_io_read(struct inode *inode); | ||
417 | extern void nfs_end_io_read(struct inode *inode); | ||
418 | extern void nfs_start_io_write(struct inode *inode); | ||
419 | extern void nfs_end_io_write(struct inode *inode); | ||
420 | extern void nfs_start_io_direct(struct inode *inode); | ||
421 | extern void nfs_end_io_direct(struct inode *inode); | ||
422 | |||
423 | static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) | ||
424 | { | ||
425 | return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; | ||
426 | } | ||
427 | |||
414 | /* namespace.c */ | 428 | /* namespace.c */ |
415 | #define NFS_PATH_CANONICAL 1 | 429 | #define NFS_PATH_CANONICAL 1 |
416 | extern char *nfs_path(char **p, struct dentry *dentry, | 430 | extern char *nfs_path(char **p, struct dentry *dentry, |
@@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, | |||
496 | struct inode *inode, | 510 | struct inode *inode, |
497 | struct nfs_direct_req *dreq); | 511 | struct nfs_direct_req *dreq); |
498 | int nfs_key_timeout_notify(struct file *filp, struct inode *inode); | 512 | int nfs_key_timeout_notify(struct file *filp, struct inode *inode); |
499 | bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); | 513 | bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); |
500 | void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); | 514 | void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); |
501 | 515 | ||
516 | int nfs_filemap_write_and_wait_range(struct address_space *mapping, | ||
517 | loff_t lstart, loff_t lend); | ||
518 | |||
519 | #ifdef CONFIG_NFS_V4_1 | ||
520 | static inline | ||
521 | void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) | ||
522 | { | ||
523 | int i; | ||
524 | |||
525 | for (i = 0; i < cinfo->nbuckets; i++) | ||
526 | cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; | ||
527 | } | ||
528 | #else | ||
529 | static inline | ||
530 | void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) | ||
531 | { | ||
532 | } | ||
533 | #endif | ||
534 | |||
535 | |||
502 | #ifdef CONFIG_MIGRATION | 536 | #ifdef CONFIG_MIGRATION |
503 | extern int nfs_migrate_page(struct address_space *, | 537 | extern int nfs_migrate_page(struct address_space *, |
504 | struct page *, struct page *, enum migrate_mode); | 538 | struct page *, struct page *, enum migrate_mode); |
@@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *, | |||
506 | #define nfs_migrate_page NULL | 540 | #define nfs_migrate_page NULL |
507 | #endif | 541 | #endif |
508 | 542 | ||
543 | static inline int | ||
544 | nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, | ||
545 | const struct nfs_write_verifier *v2) | ||
546 | { | ||
547 | return memcmp(v1->data, v2->data, sizeof(v1->data)); | ||
548 | } | ||
549 | |||
509 | /* unlink.c */ | 550 | /* unlink.c */ |
510 | extern struct rpc_task * | 551 | extern struct rpc_task * |
511 | nfs_async_rename(struct inode *old_dir, struct inode *new_dir, | 552 | nfs_async_rename(struct inode *old_dir, struct inode *new_dir, |
@@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); | |||
521 | /* nfs4proc.c */ | 562 | /* nfs4proc.c */ |
522 | extern void __nfs4_read_done_cb(struct nfs_pgio_header *); | 563 | extern void __nfs4_read_done_cb(struct nfs_pgio_header *); |
523 | extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, | 564 | extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, |
524 | const struct rpc_timeout *timeparms, | 565 | const struct nfs_client_initdata *); |
525 | const char *ip_addr); | ||
526 | extern int nfs40_walk_client_list(struct nfs_client *clp, | 566 | extern int nfs40_walk_client_list(struct nfs_client *clp, |
527 | struct nfs_client **result, | 567 | struct nfs_client **result, |
528 | struct rpc_cred *cred); | 568 | struct rpc_cred *cred); |
diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 000000000000..1fc5d1ce327e --- /dev/null +++ b/fs/nfs/io.c | |||
@@ -0,0 +1,147 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2016 Trond Myklebust | ||
3 | * | ||
4 | * I/O and data path helper functionality. | ||
5 | */ | ||
6 | |||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/bitops.h> | ||
10 | #include <linux/rwsem.h> | ||
11 | #include <linux/fs.h> | ||
12 | #include <linux/nfs_fs.h> | ||
13 | |||
14 | #include "internal.h" | ||
15 | |||
16 | /* Call with exclusively locked inode->i_rwsem */ | ||
17 | static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) | ||
18 | { | ||
19 | if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { | ||
20 | clear_bit(NFS_INO_ODIRECT, &nfsi->flags); | ||
21 | inode_dio_wait(inode); | ||
22 | } | ||
23 | } | ||
24 | |||
25 | /** | ||
26 | * nfs_start_io_read - declare the file is being used for buffered reads | ||
27 | * @inode - file inode | ||
28 | * | ||
29 | * Declare that a buffered read operation is about to start, and ensure | ||
30 | * that we block all direct I/O. | ||
31 | * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, | ||
32 | * and holds a shared lock on inode->i_rwsem to ensure that the flag | ||
33 | * cannot be changed. | ||
34 | * In practice, this means that buffered read operations are allowed to | ||
35 | * execute in parallel, thanks to the shared lock, whereas direct I/O | ||
36 | * operations need to wait to grab an exclusive lock in order to set | ||
37 | * NFS_INO_ODIRECT. | ||
38 | * Note that buffered writes and truncates both take a write lock on | ||
39 | * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. | ||
40 | */ | ||
41 | void | ||
42 | nfs_start_io_read(struct inode *inode) | ||
43 | { | ||
44 | struct nfs_inode *nfsi = NFS_I(inode); | ||
45 | /* Be an optimist! */ | ||
46 | down_read(&inode->i_rwsem); | ||
47 | if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) | ||
48 | return; | ||
49 | up_read(&inode->i_rwsem); | ||
50 | /* Slow path.... */ | ||
51 | down_write(&inode->i_rwsem); | ||
52 | nfs_block_o_direct(nfsi, inode); | ||
53 | downgrade_write(&inode->i_rwsem); | ||
54 | } | ||
55 | |||
56 | /** | ||
57 | * nfs_end_io_read - declare that the buffered read operation is done | ||
58 | * @inode - file inode | ||
59 | * | ||
60 | * Declare that a buffered read operation is done, and release the shared | ||
61 | * lock on inode->i_rwsem. | ||
62 | */ | ||
63 | void | ||
64 | nfs_end_io_read(struct inode *inode) | ||
65 | { | ||
66 | up_read(&inode->i_rwsem); | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * nfs_start_io_write - declare the file is being used for buffered writes | ||
71 | * @inode - file inode | ||
72 | * | ||
73 | * Declare that a buffered write operation is about to start, and ensure | ||
74 | * that we block all direct I/O. | ||
75 | */ | ||
76 | void | ||
77 | nfs_start_io_write(struct inode *inode) | ||
78 | { | ||
79 | down_write(&inode->i_rwsem); | ||
80 | nfs_block_o_direct(NFS_I(inode), inode); | ||
81 | } | ||
82 | |||
83 | /** | ||
84 | * nfs_end_io_write - declare that the buffered write operation is done | ||
85 | * @inode - file inode | ||
86 | * | ||
87 | * Declare that a buffered write operation is done, and release the | ||
88 | * lock on inode->i_rwsem. | ||
89 | */ | ||
90 | void | ||
91 | nfs_end_io_write(struct inode *inode) | ||
92 | { | ||
93 | up_write(&inode->i_rwsem); | ||
94 | } | ||
95 | |||
96 | /* Call with exclusively locked inode->i_rwsem */ | ||
97 | static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) | ||
98 | { | ||
99 | if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { | ||
100 | set_bit(NFS_INO_ODIRECT, &nfsi->flags); | ||
101 | nfs_wb_all(inode); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * nfs_start_io_direct - declare the file is being used for direct i/o | ||
107 | * @inode - file inode | ||
108 | * | ||
109 | * Declare that a direct I/O operation is about to start, and ensure | ||
110 | * that we block all buffered I/O. | ||
111 | * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, | ||
112 | * and holds a shared lock on inode->i_rwsem to ensure that the flag | ||
113 | * cannot be changed. | ||
114 | * In practice, this means that direct I/O operations are allowed to | ||
115 | * execute in parallel, thanks to the shared lock, whereas buffered I/O | ||
116 | * operations need to wait to grab an exclusive lock in order to clear | ||
117 | * NFS_INO_ODIRECT. | ||
118 | * Note that buffered writes and truncates both take a write lock on | ||
119 | * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. | ||
120 | */ | ||
121 | void | ||
122 | nfs_start_io_direct(struct inode *inode) | ||
123 | { | ||
124 | struct nfs_inode *nfsi = NFS_I(inode); | ||
125 | /* Be an optimist! */ | ||
126 | down_read(&inode->i_rwsem); | ||
127 | if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) | ||
128 | return; | ||
129 | up_read(&inode->i_rwsem); | ||
130 | /* Slow path.... */ | ||
131 | down_write(&inode->i_rwsem); | ||
132 | nfs_block_buffered(nfsi, inode); | ||
133 | downgrade_write(&inode->i_rwsem); | ||
134 | } | ||
135 | |||
136 | /** | ||
137 | * nfs_end_io_direct - declare that the direct i/o operation is done | ||
138 | * @inode - file inode | ||
139 | * | ||
140 | * Declare that a direct I/O operation is done, and release the shared | ||
141 | * lock on inode->i_rwsem. | ||
142 | */ | ||
143 | void | ||
144 | nfs_end_io_direct(struct inode *inode) | ||
145 | { | ||
146 | up_read(&inode->i_rwsem); | ||
147 | } | ||
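To illustrate how the new fs/nfs/io.c helpers are meant to be paired by their callers, here is a minimal sketch modelled on the nfs_file_write() hunk earlier in this diff; the function name and the reduced error handling are assumptions for illustration (and it relies on the headers already included by fs/nfs/file.c), not code from the commit:

static ssize_t example_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	ssize_t result;

	/* Exclusive inode->i_rwsem: waits for in-flight O_DIRECT and blocks new direct I/O */
	nfs_start_io_write(inode);
	result = generic_write_checks(iocb, from);
	if (result > 0)
		result = generic_perform_write(file, from, iocb->ki_pos);
	/* Drop the lock; direct writers may now grab it and set NFS_INO_ODIRECT */
	nfs_end_io_write(inode);
	return result;
}

Buffered reads use the same pattern with nfs_start_io_read()/nfs_end_io_read(), which only take the shared lock, so reads run in parallel with each other while still excluding O_DIRECT.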
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 9e9fa347a948..ee753547fb0a 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c | |||
@@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, | |||
76 | * low timeout interval so that if a connection is lost, we retry through | 76 | * low timeout interval so that if a connection is lost, we retry through |
77 | * the MDS. | 77 | * the MDS. |
78 | */ | 78 | */ |
79 | struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, | 79 | struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, |
80 | const struct sockaddr *ds_addr, int ds_addrlen, | 80 | const struct sockaddr *ds_addr, int ds_addrlen, |
81 | int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, | 81 | int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, |
82 | rpc_authflavor_t au_flavor) | 82 | rpc_authflavor_t au_flavor) |
83 | { | 83 | { |
84 | struct rpc_timeout ds_timeout; | ||
85 | struct nfs_client *mds_clp = mds_srv->nfs_client; | ||
84 | struct nfs_client_initdata cl_init = { | 86 | struct nfs_client_initdata cl_init = { |
85 | .addr = ds_addr, | 87 | .addr = ds_addr, |
86 | .addrlen = ds_addrlen, | 88 | .addrlen = ds_addrlen, |
89 | .nodename = mds_clp->cl_rpcclient->cl_nodename, | ||
90 | .ip_addr = mds_clp->cl_ipaddr, | ||
87 | .nfs_mod = &nfs_v3, | 91 | .nfs_mod = &nfs_v3, |
88 | .proto = ds_proto, | 92 | .proto = ds_proto, |
89 | .net = mds_clp->cl_net, | 93 | .net = mds_clp->cl_net, |
94 | .timeparms = &ds_timeout, | ||
90 | }; | 95 | }; |
91 | struct rpc_timeout ds_timeout; | ||
92 | struct nfs_client *clp; | 96 | struct nfs_client *clp; |
93 | char buf[INET6_ADDRSTRLEN + 1]; | 97 | char buf[INET6_ADDRSTRLEN + 1]; |
94 | 98 | ||
@@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, | |||
97 | return ERR_PTR(-EINVAL); | 101 | return ERR_PTR(-EINVAL); |
98 | cl_init.hostname = buf; | 102 | cl_init.hostname = buf; |
99 | 103 | ||
104 | if (mds_srv->flags & NFS_MOUNT_NORESVPORT) | ||
105 | set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); | ||
106 | |||
100 | /* Use the MDS nfs_client cl_ipaddr. */ | 107 | /* Use the MDS nfs_client cl_ipaddr. */ |
101 | nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); | 108 | nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); |
102 | clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, | 109 | clp = nfs_get_client(&cl_init, au_flavor); |
103 | au_flavor); | ||
104 | 110 | ||
105 | return clp; | 111 | return clp; |
106 | } | 112 | } |
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index aa03ed09ba06..33da841a21bb 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c | |||
@@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) | |||
113 | if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) | 113 | if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) |
114 | return -EOPNOTSUPP; | 114 | return -EOPNOTSUPP; |
115 | 115 | ||
116 | nfs_wb_all(inode); | ||
117 | inode_lock(inode); | 116 | inode_lock(inode); |
117 | err = nfs_sync_inode(inode); | ||
118 | if (err) | ||
119 | goto out_unlock; | ||
118 | 120 | ||
119 | err = nfs42_proc_fallocate(&msg, filep, offset, len); | 121 | err = nfs42_proc_fallocate(&msg, filep, offset, len); |
120 | if (err == 0) | 122 | if (err == 0) |
121 | truncate_pagecache_range(inode, offset, (offset + len) -1); | 123 | truncate_pagecache_range(inode, offset, (offset + len) -1); |
122 | if (err == -EOPNOTSUPP) | 124 | if (err == -EOPNOTSUPP) |
123 | NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; | 125 | NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; |
124 | 126 | out_unlock: | |
125 | inode_unlock(inode); | 127 | inode_unlock(inode); |
126 | return err; | 128 | return err; |
127 | } | 129 | } |
@@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, | |||
154 | if (status) | 156 | if (status) |
155 | return status; | 157 | return status; |
156 | 158 | ||
159 | status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, | ||
160 | pos_src, pos_src + (loff_t)count - 1); | ||
161 | if (status) | ||
162 | return status; | ||
163 | |||
157 | status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, | 164 | status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, |
158 | dst_lock, FMODE_WRITE); | 165 | dst_lock, FMODE_WRITE); |
159 | if (status) | 166 | if (status) |
160 | return status; | 167 | return status; |
161 | 168 | ||
169 | status = nfs_sync_inode(dst_inode); | ||
170 | if (status) | ||
171 | return status; | ||
172 | |||
162 | status = nfs4_call_sync(server->client, server, &msg, | 173 | status = nfs4_call_sync(server->client, server, &msg, |
163 | &args.seq_args, &res.seq_res, 0); | 174 | &args.seq_args, &res.seq_res, 0); |
164 | if (status == -ENOTSUPP) | 175 | if (status == -ENOTSUPP) |
@@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep, | |||
258 | if (status) | 269 | if (status) |
259 | return status; | 270 | return status; |
260 | 271 | ||
261 | nfs_wb_all(inode); | 272 | status = nfs_filemap_write_and_wait_range(inode->i_mapping, |
273 | offset, LLONG_MAX); | ||
274 | if (status) | ||
275 | return status; | ||
276 | |||
262 | status = nfs4_call_sync(server->client, server, &msg, | 277 | status = nfs4_call_sync(server->client, server, &msg, |
263 | &args.seq_args, &res.seq_res, 0); | 278 | &args.seq_args, &res.seq_res, 0); |
264 | if (status == -ENOTSUPP) | 279 | if (status == -ENOTSUPP) |
@@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) | |||
336 | * Mark the bad layout state as invalid, then retry | 351 | * Mark the bad layout state as invalid, then retry |
337 | * with the current stateid. | 352 | * with the current stateid. |
338 | */ | 353 | */ |
339 | set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); | 354 | pnfs_mark_layout_stateid_invalid(lo, &head); |
340 | pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); | ||
341 | spin_unlock(&inode->i_lock); | 355 | spin_unlock(&inode->i_lock); |
342 | pnfs_free_lseg_list(&head); | 356 | pnfs_free_lseg_list(&head); |
343 | } else | 357 | } else |
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6dc6f2aea0d6..8b2605882a20 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c | |||
@@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr, | |||
330 | struct nfs42_write_res *res) | 330 | struct nfs42_write_res *res) |
331 | { | 331 | { |
332 | __be32 *p; | 332 | __be32 *p; |
333 | int stateids; | ||
334 | 333 | ||
335 | p = xdr_inline_decode(xdr, 4 + 8 + 4); | 334 | p = xdr_inline_decode(xdr, 4 + 8 + 4); |
336 | if (unlikely(!p)) | 335 | if (unlikely(!p)) |
337 | goto out_overflow; | 336 | goto out_overflow; |
338 | 337 | ||
339 | stateids = be32_to_cpup(p++); | 338 | /* |
339 | * We never use asynchronous mode, so warn if a server returns | ||
340 | * a stateid. | ||
341 | */ | ||
342 | if (unlikely(*p != 0)) { | ||
343 | pr_err_once("%s: server has set unrequested " | ||
344 | "asynchronous mode\n", __func__); | ||
345 | return -EREMOTEIO; | ||
346 | } | ||
347 | p++; | ||
340 | p = xdr_decode_hyper(p, &res->count); | 348 | p = xdr_decode_hyper(p, &res->count); |
341 | res->verifier.committed = be32_to_cpup(p); | 349 | res->verifier.committed = be32_to_cpup(p); |
342 | return decode_verifier(xdr, &res->verifier.verifier); | 350 | return decode_verifier(xdr, &res->verifier.verifier); |
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 768456fa1b17..4be567a54958 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h | |||
@@ -185,6 +185,7 @@ struct nfs4_state { | |||
185 | struct nfs4_exception { | 185 | struct nfs4_exception { |
186 | struct nfs4_state *state; | 186 | struct nfs4_state *state; |
187 | struct inode *inode; | 187 | struct inode *inode; |
188 | nfs4_stateid *stateid; | ||
188 | long timeout; | 189 | long timeout; |
189 | unsigned char delay : 1, | 190 | unsigned char delay : 1, |
190 | recovering : 1, | 191 | recovering : 1, |
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 10410e8b5853..8d7d08d4f95f 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c | |||
@@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) | |||
349 | * Returns pointer to an NFS client, or an ERR_PTR value. | 349 | * Returns pointer to an NFS client, or an ERR_PTR value. |
350 | */ | 350 | */ |
351 | struct nfs_client *nfs4_init_client(struct nfs_client *clp, | 351 | struct nfs_client *nfs4_init_client(struct nfs_client *clp, |
352 | const struct rpc_timeout *timeparms, | 352 | const struct nfs_client_initdata *cl_init) |
353 | const char *ip_addr) | ||
354 | { | 353 | { |
355 | char buf[INET6_ADDRSTRLEN + 1]; | 354 | char buf[INET6_ADDRSTRLEN + 1]; |
355 | const char *ip_addr = cl_init->ip_addr; | ||
356 | struct nfs_client *old; | 356 | struct nfs_client *old; |
357 | int error; | 357 | int error; |
358 | 358 | ||
@@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, | |||
370 | __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); | 370 | __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); |
371 | __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); | 371 | __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); |
372 | 372 | ||
373 | error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); | 373 | error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); |
374 | if (error == -EINVAL) | 374 | if (error == -EINVAL) |
375 | error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); | 375 | error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); |
376 | if (error < 0) | 376 | if (error < 0) |
377 | goto error; | 377 | goto error; |
378 | 378 | ||
@@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server, | |||
793 | .hostname = hostname, | 793 | .hostname = hostname, |
794 | .addr = addr, | 794 | .addr = addr, |
795 | .addrlen = addrlen, | 795 | .addrlen = addrlen, |
796 | .ip_addr = ip_addr, | ||
796 | .nfs_mod = &nfs_v4, | 797 | .nfs_mod = &nfs_v4, |
797 | .proto = proto, | 798 | .proto = proto, |
798 | .minorversion = minorversion, | 799 | .minorversion = minorversion, |
799 | .net = net, | 800 | .net = net, |
801 | .timeparms = timeparms, | ||
800 | }; | 802 | }; |
801 | struct nfs_client *clp; | 803 | struct nfs_client *clp; |
802 | int error; | 804 | int error; |
@@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server, | |||
809 | set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); | 811 | set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); |
810 | 812 | ||
811 | /* Allocate or find a client reference we can use */ | 813 | /* Allocate or find a client reference we can use */ |
812 | clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); | 814 | clp = nfs_get_client(&cl_init, authflavour); |
813 | if (IS_ERR(clp)) { | 815 | if (IS_ERR(clp)) { |
814 | error = PTR_ERR(clp); | 816 | error = PTR_ERR(clp); |
815 | goto error; | 817 | goto error; |
@@ -842,20 +844,24 @@ error: | |||
842 | * low timeout interval so that if a connection is lost, we retry through | 844 | * low timeout interval so that if a connection is lost, we retry through |
843 | * the MDS. | 845 | * the MDS. |
844 | */ | 846 | */ |
845 | struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, | 847 | struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, |
846 | const struct sockaddr *ds_addr, int ds_addrlen, | 848 | const struct sockaddr *ds_addr, int ds_addrlen, |
847 | int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, | 849 | int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, |
848 | u32 minor_version, rpc_authflavor_t au_flavor) | 850 | u32 minor_version, rpc_authflavor_t au_flavor) |
849 | { | 851 | { |
852 | struct rpc_timeout ds_timeout; | ||
853 | struct nfs_client *mds_clp = mds_srv->nfs_client; | ||
850 | struct nfs_client_initdata cl_init = { | 854 | struct nfs_client_initdata cl_init = { |
851 | .addr = ds_addr, | 855 | .addr = ds_addr, |
852 | .addrlen = ds_addrlen, | 856 | .addrlen = ds_addrlen, |
857 | .nodename = mds_clp->cl_rpcclient->cl_nodename, | ||
858 | .ip_addr = mds_clp->cl_ipaddr, | ||
853 | .nfs_mod = &nfs_v4, | 859 | .nfs_mod = &nfs_v4, |
854 | .proto = ds_proto, | 860 | .proto = ds_proto, |
855 | .minorversion = minor_version, | 861 | .minorversion = minor_version, |
856 | .net = mds_clp->cl_net, | 862 | .net = mds_clp->cl_net, |
863 | .timeparms = &ds_timeout, | ||
857 | }; | 864 | }; |
858 | struct rpc_timeout ds_timeout; | ||
859 | struct nfs_client *clp; | 865 | struct nfs_client *clp; |
860 | char buf[INET6_ADDRSTRLEN + 1]; | 866 | char buf[INET6_ADDRSTRLEN + 1]; |
861 | 867 | ||
@@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, | |||
863 | return ERR_PTR(-EINVAL); | 869 | return ERR_PTR(-EINVAL); |
864 | cl_init.hostname = buf; | 870 | cl_init.hostname = buf; |
865 | 871 | ||
872 | if (mds_srv->flags & NFS_MOUNT_NORESVPORT) | ||
873 | __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); | ||
874 | |||
866 | /* | 875 | /* |
867 | * Set an authflavor equal to the MDS value. Use the MDS nfs_client | 876 | * Set an authflavor equal to the MDS value. Use the MDS nfs_client |
868 | * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS | 877 | * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS |
869 | * (section 13.1 RFC 5661). | 878 | * (section 13.1 RFC 5661). |
870 | */ | 879 | */ |
871 | nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); | 880 | nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); |
872 | clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, | 881 | clp = nfs_get_client(&cl_init, au_flavor); |
873 | au_flavor); | ||
874 | 882 | ||
875 | dprintk("<-- %s %p\n", __func__, clp); | 883 | dprintk("<-- %s %p\n", __func__, clp); |
876 | return clp; | 884 | return clp; |
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 014b0e41ace5..d085ad794884 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c | |||
@@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) | |||
66 | if (openflags & O_TRUNC) { | 66 | if (openflags & O_TRUNC) { |
67 | attr.ia_valid |= ATTR_SIZE; | 67 | attr.ia_valid |= ATTR_SIZE; |
68 | attr.ia_size = 0; | 68 | attr.ia_size = 0; |
69 | nfs_sync_inode(inode); | 69 | filemap_write_and_wait(inode->i_mapping); |
70 | } | 70 | } |
71 | 71 | ||
72 | inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); | 72 | inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); |
@@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, | |||
133 | struct file *file_out, loff_t pos_out, | 133 | struct file *file_out, loff_t pos_out, |
134 | size_t count, unsigned int flags) | 134 | size_t count, unsigned int flags) |
135 | { | 135 | { |
136 | struct inode *in_inode = file_inode(file_in); | 136 | if (file_inode(file_in) == file_inode(file_out)) |
137 | struct inode *out_inode = file_inode(file_out); | ||
138 | int ret; | ||
139 | |||
140 | if (in_inode == out_inode) | ||
141 | return -EINVAL; | 137 | return -EINVAL; |
142 | 138 | ||
143 | /* flush any pending writes */ | ||
144 | ret = nfs_sync_inode(in_inode); | ||
145 | if (ret) | ||
146 | return ret; | ||
147 | ret = nfs_sync_inode(out_inode); | ||
148 | if (ret) | ||
149 | return ret; | ||
150 | |||
151 | return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); | 139 | return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); |
152 | } | 140 | } |
153 | 141 | ||
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff416d0e24bc..da5c9e58e907 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
@@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, | |||
363 | { | 363 | { |
364 | struct nfs_client *clp = server->nfs_client; | 364 | struct nfs_client *clp = server->nfs_client; |
365 | struct nfs4_state *state = exception->state; | 365 | struct nfs4_state *state = exception->state; |
366 | const nfs4_stateid *stateid = exception->stateid; | ||
366 | struct inode *inode = exception->inode; | 367 | struct inode *inode = exception->inode; |
367 | int ret = errorcode; | 368 | int ret = errorcode; |
368 | 369 | ||
@@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server, | |||
376 | case -NFS4ERR_DELEG_REVOKED: | 377 | case -NFS4ERR_DELEG_REVOKED: |
377 | case -NFS4ERR_ADMIN_REVOKED: | 378 | case -NFS4ERR_ADMIN_REVOKED: |
378 | case -NFS4ERR_BAD_STATEID: | 379 | case -NFS4ERR_BAD_STATEID: |
379 | if (inode && nfs_async_inode_return_delegation(inode, | 380 | if (inode) { |
380 | NULL) == 0) | 381 | int err; |
381 | goto wait_on_recovery; | 382 | |
383 | err = nfs_async_inode_return_delegation(inode, | ||
384 | stateid); | ||
385 | if (err == 0) | ||
386 | goto wait_on_recovery; | ||
387 | if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) { | ||
388 | exception->retry = 1; | ||
389 | break; | ||
390 | } | ||
391 | } | ||
382 | if (state == NULL) | 392 | if (state == NULL) |
383 | break; | 393 | break; |
384 | ret = nfs4_schedule_stateid_recovery(server, state); | 394 | ret = nfs4_schedule_stateid_recovery(server, state); |
@@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, | |||
427 | case -NFS4ERR_DELAY: | 437 | case -NFS4ERR_DELAY: |
428 | nfs_inc_server_stats(server, NFSIOS_DELAY); | 438 | nfs_inc_server_stats(server, NFSIOS_DELAY); |
429 | case -NFS4ERR_GRACE: | 439 | case -NFS4ERR_GRACE: |
440 | case -NFS4ERR_LAYOUTTRYLATER: | ||
430 | case -NFS4ERR_RECALLCONFLICT: | 441 | case -NFS4ERR_RECALLCONFLICT: |
431 | exception->delay = 1; | 442 | exception->delay = 1; |
432 | return 0; | 443 | return 0; |
@@ -2669,28 +2680,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, | |||
2669 | return res; | 2680 | return res; |
2670 | } | 2681 | } |
2671 | 2682 | ||
2672 | static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, | 2683 | static int _nfs4_do_setattr(struct inode *inode, |
2673 | struct nfs_fattr *fattr, struct iattr *sattr, | 2684 | struct nfs_setattrargs *arg, |
2674 | struct nfs4_state *state, struct nfs4_label *ilabel, | 2685 | struct nfs_setattrres *res, |
2675 | struct nfs4_label *olabel) | 2686 | struct rpc_cred *cred, |
2687 | struct nfs4_state *state) | ||
2676 | { | 2688 | { |
2677 | struct nfs_server *server = NFS_SERVER(inode); | 2689 | struct nfs_server *server = NFS_SERVER(inode); |
2678 | struct nfs_setattrargs arg = { | ||
2679 | .fh = NFS_FH(inode), | ||
2680 | .iap = sattr, | ||
2681 | .server = server, | ||
2682 | .bitmask = server->attr_bitmask, | ||
2683 | .label = ilabel, | ||
2684 | }; | ||
2685 | struct nfs_setattrres res = { | ||
2686 | .fattr = fattr, | ||
2687 | .label = olabel, | ||
2688 | .server = server, | ||
2689 | }; | ||
2690 | struct rpc_message msg = { | 2690 | struct rpc_message msg = { |
2691 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], | 2691 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], |
2692 | .rpc_argp = &arg, | 2692 | .rpc_argp = arg, |
2693 | .rpc_resp = &res, | 2693 | .rpc_resp = res, |
2694 | .rpc_cred = cred, | 2694 | .rpc_cred = cred, |
2695 | }; | 2695 | }; |
2696 | struct rpc_cred *delegation_cred = NULL; | 2696 | struct rpc_cred *delegation_cred = NULL; |
@@ -2699,17 +2699,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, | |||
2699 | bool truncate; | 2699 | bool truncate; |
2700 | int status; | 2700 | int status; |
2701 | 2701 | ||
2702 | arg.bitmask = nfs4_bitmask(server, ilabel); | 2702 | nfs_fattr_init(res->fattr); |
2703 | if (ilabel) | ||
2704 | arg.bitmask = nfs4_bitmask(server, olabel); | ||
2705 | |||
2706 | nfs_fattr_init(fattr); | ||
2707 | 2703 | ||
2708 | /* Servers should only apply open mode checks for file size changes */ | 2704 | /* Servers should only apply open mode checks for file size changes */ |
2709 | truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; | 2705 | truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; |
2710 | fmode = truncate ? FMODE_WRITE : FMODE_READ; | 2706 | fmode = truncate ? FMODE_WRITE : FMODE_READ; |
2711 | 2707 | ||
2712 | if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { | 2708 | if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { |
2713 | /* Use that stateid */ | 2709 | /* Use that stateid */ |
2714 | } else if (truncate && state != NULL) { | 2710 | } else if (truncate && state != NULL) { |
2715 | struct nfs_lockowner lockowner = { | 2711 | struct nfs_lockowner lockowner = { |
@@ -2719,19 +2715,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, | |||
2719 | if (!nfs4_valid_open_stateid(state)) | 2715 | if (!nfs4_valid_open_stateid(state)) |
2720 | return -EBADF; | 2716 | return -EBADF; |
2721 | if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, | 2717 | if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, |
2722 | &arg.stateid, &delegation_cred) == -EIO) | 2718 | &arg->stateid, &delegation_cred) == -EIO) |
2723 | return -EBADF; | 2719 | return -EBADF; |
2724 | } else | 2720 | } else |
2725 | nfs4_stateid_copy(&arg.stateid, &zero_stateid); | 2721 | nfs4_stateid_copy(&arg->stateid, &zero_stateid); |
2726 | if (delegation_cred) | 2722 | if (delegation_cred) |
2727 | msg.rpc_cred = delegation_cred; | 2723 | msg.rpc_cred = delegation_cred; |
2728 | 2724 | ||
2729 | status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); | 2725 | status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); |
2730 | 2726 | ||
2731 | put_rpccred(delegation_cred); | 2727 | put_rpccred(delegation_cred); |
2732 | if (status == 0 && state != NULL) | 2728 | if (status == 0 && state != NULL) |
2733 | renew_lease(server, timestamp); | 2729 | renew_lease(server, timestamp); |
2734 | trace_nfs4_setattr(inode, &arg.stateid, status); | 2730 | trace_nfs4_setattr(inode, &arg->stateid, status); |
2735 | return status; | 2731 | return status; |
2736 | } | 2732 | } |
2737 | 2733 | ||
@@ -2741,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, | |||
2741 | struct nfs4_label *olabel) | 2737 | struct nfs4_label *olabel) |
2742 | { | 2738 | { |
2743 | struct nfs_server *server = NFS_SERVER(inode); | 2739 | struct nfs_server *server = NFS_SERVER(inode); |
2740 | struct nfs_setattrargs arg = { | ||
2741 | .fh = NFS_FH(inode), | ||
2742 | .iap = sattr, | ||
2743 | .server = server, | ||
2744 | .bitmask = server->attr_bitmask, | ||
2745 | .label = ilabel, | ||
2746 | }; | ||
2747 | struct nfs_setattrres res = { | ||
2748 | .fattr = fattr, | ||
2749 | .label = olabel, | ||
2750 | .server = server, | ||
2751 | }; | ||
2744 | struct nfs4_exception exception = { | 2752 | struct nfs4_exception exception = { |
2745 | .state = state, | 2753 | .state = state, |
2746 | .inode = inode, | 2754 | .inode = inode, |
2755 | .stateid = &arg.stateid, | ||
2747 | }; | 2756 | }; |
2748 | int err; | 2757 | int err; |
2758 | |||
2759 | arg.bitmask = nfs4_bitmask(server, ilabel); | ||
2760 | if (ilabel) | ||
2761 | arg.bitmask = nfs4_bitmask(server, olabel); | ||
2762 | |||
2749 | do { | 2763 | do { |
2750 | err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); | 2764 | err = _nfs4_do_setattr(inode, &arg, &res, cred, state); |
2751 | switch (err) { | 2765 | switch (err) { |
2752 | case -NFS4ERR_OPENMODE: | 2766 | case -NFS4ERR_OPENMODE: |
2753 | if (!(sattr->ia_valid & ATTR_SIZE)) { | 2767 | if (!(sattr->ia_valid & ATTR_SIZE)) { |
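The setattr hunks above move the nfs_setattrargs/nfs_setattrres blocks out of _nfs4_do_setattr() and up into nfs4_do_setattr(), so one argument block, including arg.stateid, lives across every pass of the retry loop and the exception structure can point straight at it (exception.stateid = &arg.stateid). That in turn is what lets the -NFS4ERR_DELEG_REVOKED handling in the first hunk return the delegation identified by that stateid and retry. A rough user-space sketch of that caller-owns-the-arguments retry shape; all names here (call_args, do_one_attempt, handle_exception) are illustrative stand-ins, not NFS code:

#include <stdbool.h>
#include <stdint.h>

struct call_args {
	uint32_t stateid;		/* stands in for nfs_setattrargs.stateid */
};

struct call_exception {
	const uint32_t *stateid;	/* points into the caller's argument block */
	bool retry;
};

/* Illustrative stand-ins for _nfs4_do_setattr() and nfs4_handle_exception(). */
static int do_one_attempt(struct call_args *arg)
{
	(void)arg;			/* a real attempt may rewrite arg->stateid */
	return 0;
}

static int handle_exception(int err, struct call_exception *exc)
{
	exc->retry = false;		/* a real handler may request another pass */
	return err;
}

static int setattr_with_retry(void)
{
	struct call_args arg = { .stateid = 1 };	/* built once, outside the loop */
	struct call_exception exc = { .stateid = &arg.stateid };
	int err;

	do {
		err = do_one_attempt(&arg);
		err = handle_exception(err, &exc);	/* sees the same stateid the attempt used */
	} while (exc.retry);
	return err;
}

int main(void)
{
	return setattr_with_retry();
}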
@@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, | |||
3267 | return status; | 3281 | return status; |
3268 | } | 3282 | } |
3269 | 3283 | ||
3270 | static int nfs4_do_find_root_sec(struct nfs_server *server, | ||
3271 | struct nfs_fh *fhandle, struct nfs_fsinfo *info) | ||
3272 | { | ||
3273 | int mv = server->nfs_client->cl_minorversion; | ||
3274 | return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); | ||
3275 | } | ||
3276 | |||
3277 | /** | 3284 | /** |
3278 | * nfs4_proc_get_rootfh - get file handle for server's pseudoroot | 3285 | * nfs4_proc_get_rootfh - get file handle for server's pseudoroot |
3279 | * @server: initialized nfs_server handle | 3286 | * @server: initialized nfs_server handle |
@@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, | |||
3293 | status = nfs4_lookup_root(server, fhandle, info); | 3300 | status = nfs4_lookup_root(server, fhandle, info); |
3294 | 3301 | ||
3295 | if (auth_probe || status == NFS4ERR_WRONGSEC) | 3302 | if (auth_probe || status == NFS4ERR_WRONGSEC) |
3296 | status = nfs4_do_find_root_sec(server, fhandle, info); | 3303 | status = server->nfs_client->cl_mvops->find_root_sec(server, |
3304 | fhandle, info); | ||
3297 | 3305 | ||
3298 | if (status == 0) | 3306 | if (status == 0) |
3299 | status = nfs4_server_capabilities(server, fhandle); | 3307 | status = nfs4_server_capabilities(server, fhandle); |
@@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, | |||
4392 | struct rpc_message *msg) | 4400 | struct rpc_message *msg) |
4393 | { | 4401 | { |
4394 | hdr->timestamp = jiffies; | 4402 | hdr->timestamp = jiffies; |
4395 | hdr->pgio_done_cb = nfs4_read_done_cb; | 4403 | if (!hdr->pgio_done_cb) |
4404 | hdr->pgio_done_cb = nfs4_read_done_cb; | ||
4396 | msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; | 4405 | msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; |
4397 | nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); | 4406 | nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); |
4398 | } | 4407 | } |
@@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, | |||
7869 | struct inode *inode = lgp->args.inode; | 7878 | struct inode *inode = lgp->args.inode; |
7870 | struct nfs_server *server = NFS_SERVER(inode); | 7879 | struct nfs_server *server = NFS_SERVER(inode); |
7871 | struct pnfs_layout_hdr *lo; | 7880 | struct pnfs_layout_hdr *lo; |
7872 | int status = task->tk_status; | 7881 | int nfs4err = task->tk_status; |
7882 | int err, status = 0; | ||
7883 | LIST_HEAD(head); | ||
7873 | 7884 | ||
7874 | dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); | 7885 | dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); |
7875 | 7886 | ||
7876 | switch (status) { | 7887 | switch (nfs4err) { |
7877 | case 0: | 7888 | case 0: |
7878 | goto out; | 7889 | goto out; |
7879 | 7890 | ||
@@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, | |||
7905 | status = -EOVERFLOW; | 7916 | status = -EOVERFLOW; |
7906 | goto out; | 7917 | goto out; |
7907 | } | 7918 | } |
7908 | /* Fallthrough */ | 7919 | status = -EBUSY; |
7920 | break; | ||
7909 | case -NFS4ERR_RECALLCONFLICT: | 7921 | case -NFS4ERR_RECALLCONFLICT: |
7910 | nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, | ||
7911 | exception); | ||
7912 | status = -ERECALLCONFLICT; | 7922 | status = -ERECALLCONFLICT; |
7913 | goto out; | 7923 | break; |
7914 | case -NFS4ERR_EXPIRED: | 7924 | case -NFS4ERR_EXPIRED: |
7915 | case -NFS4ERR_BAD_STATEID: | 7925 | case -NFS4ERR_BAD_STATEID: |
7916 | exception->timeout = 0; | 7926 | exception->timeout = 0; |
7917 | spin_lock(&inode->i_lock); | 7927 | spin_lock(&inode->i_lock); |
7918 | if (nfs4_stateid_match(&lgp->args.stateid, | 7928 | lo = NFS_I(inode)->layout; |
7929 | /* If the open stateid was bad, then recover it. */ | ||
7930 | if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || | ||
7931 | nfs4_stateid_match_other(&lgp->args.stateid, | ||
7919 | &lgp->args.ctx->state->stateid)) { | 7932 | &lgp->args.ctx->state->stateid)) { |
7920 | spin_unlock(&inode->i_lock); | 7933 | spin_unlock(&inode->i_lock); |
7921 | /* If the open stateid was bad, then recover it. */ | ||
7922 | exception->state = lgp->args.ctx->state; | 7934 | exception->state = lgp->args.ctx->state; |
7923 | break; | 7935 | break; |
7924 | } | 7936 | } |
7925 | lo = NFS_I(inode)->layout; | ||
7926 | if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) && | ||
7927 | nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { | ||
7928 | LIST_HEAD(head); | ||
7929 | |||
7930 | /* | ||
7931 | * Mark the bad layout state as invalid, then retry | ||
7932 | * with the current stateid. | ||
7933 | */ | ||
7934 | set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); | ||
7935 | pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); | ||
7936 | spin_unlock(&inode->i_lock); | ||
7937 | pnfs_free_lseg_list(&head); | ||
7938 | status = -EAGAIN; | ||
7939 | goto out; | ||
7940 | } else | ||
7941 | spin_unlock(&inode->i_lock); | ||
7942 | } | ||
7943 | 7937 | ||
7944 | status = nfs4_handle_exception(server, status, exception); | 7938 | /* |
7945 | if (exception->retry) | 7939 | * Mark the bad layout state as invalid, then retry |
7940 | */ | ||
7941 | pnfs_mark_layout_stateid_invalid(lo, &head); | ||
7942 | spin_unlock(&inode->i_lock); | ||
7943 | pnfs_free_lseg_list(&head); | ||
7946 | status = -EAGAIN; | 7944 | status = -EAGAIN; |
7945 | goto out; | ||
7946 | } | ||
7947 | |||
7948 | err = nfs4_handle_exception(server, nfs4err, exception); | ||
7949 | if (!status) { | ||
7950 | if (exception->retry) | ||
7951 | status = -EAGAIN; | ||
7952 | else | ||
7953 | status = err; | ||
7954 | } | ||
7947 | out: | 7955 | out: |
7948 | dprintk("<-- %s\n", __func__); | 7956 | dprintk("<-- %s\n", __func__); |
7949 | return status; | 7957 | return status; |
@@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata) | |||
8129 | spin_lock(&lo->plh_inode->i_lock); | 8137 | spin_lock(&lo->plh_inode->i_lock); |
8130 | pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, | 8138 | pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, |
8131 | be32_to_cpu(lrp->args.stateid.seqid)); | 8139 | be32_to_cpu(lrp->args.stateid.seqid)); |
8132 | pnfs_mark_layout_returned_if_empty(lo); | 8140 | if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) |
8133 | if (lrp->res.lrs_present) | ||
8134 | pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); | 8141 | pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); |
8135 | pnfs_clear_layoutreturn_waitbit(lo); | 8142 | pnfs_clear_layoutreturn_waitbit(lo); |
8136 | spin_unlock(&lo->plh_inode->i_lock); | 8143 | spin_unlock(&lo->plh_inode->i_lock); |
@@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { | |||
8835 | #endif | 8842 | #endif |
8836 | }; | 8843 | }; |
8837 | 8844 | ||
8838 | ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) | 8845 | static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) |
8839 | { | 8846 | { |
8840 | ssize_t error, error2; | 8847 | ssize_t error, error2; |
8841 | 8848 | ||
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 661e753fe1c9..7bd3a5c09d31 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c | |||
@@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr, | |||
1985 | p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ | 1985 | p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ |
1986 | *p = cpu_to_be32(0); /* reclaim */ | 1986 | *p = cpu_to_be32(0); /* reclaim */ |
1987 | encode_nfs4_stateid(xdr, &args->stateid); | 1987 | encode_nfs4_stateid(xdr, &args->stateid); |
1988 | p = reserve_space(xdr, 20); | 1988 | if (args->lastbytewritten != U64_MAX) { |
1989 | *p++ = cpu_to_be32(1); /* newoffset = TRUE */ | 1989 | p = reserve_space(xdr, 20); |
1990 | p = xdr_encode_hyper(p, args->lastbytewritten); | 1990 | *p++ = cpu_to_be32(1); /* newoffset = TRUE */ |
1991 | p = xdr_encode_hyper(p, args->lastbytewritten); | ||
1992 | } else { | ||
1993 | p = reserve_space(xdr, 12); | ||
1994 | *p++ = cpu_to_be32(0); /* newoffset = FALSE */ | ||
1995 | } | ||
1991 | *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ | 1996 | *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ |
1992 | *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ | 1997 | *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ |
1993 | 1998 | ||
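The encode_layoutcommit() change above introduces U64_MAX as a sentinel meaning "nothing was written"; the pnfs_layoutcommit_inode() hunk later in this series passes it when end_pos is 0, and in that case the request encodes newoffset = FALSE and omits the 64-bit offset entirely. A rough user-space sketch of the same branch, where emit32()/emit64() are hypothetical stand-ins for the reserve_space()/xdr_encode_hyper() pair:

#include <stdint.h>
#include <stdio.h>

#define LASTBYTE_NONE UINT64_MAX	/* sentinel: no data was written */

/* Illustrative stand-ins for the XDR stream helpers. */
static void emit32(uint32_t v) { printf("u32 %u\n", v); }
static void emit64(uint64_t v) { printf("u64 %llu\n", (unsigned long long)v); }

static void encode_newoffset(uint64_t lastbytewritten)
{
	if (lastbytewritten != LASTBYTE_NONE) {
		emit32(1);			/* newoffset = TRUE */
		emit64(lastbytewritten);	/* loca_last_write_offset */
	} else {
		emit32(0);			/* newoffset = FALSE, no offset follows */
	}
}

int main(void)
{
	encode_newoffset(4095);			/* normal commit of written data */
	encode_newoffset(LASTBYTE_NONE);	/* metadata-only LAYOUTCOMMIT */
	return 0;
}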
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 31c7763b94d5..2ca9167bc97d 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h | |||
@@ -37,7 +37,6 @@ | |||
37 | { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ | 37 | { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ |
38 | { 1 << NFS_INO_STALE, "STALE" }, \ | 38 | { 1 << NFS_INO_STALE, "STALE" }, \ |
39 | { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ | 39 | { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ |
40 | { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ | ||
41 | { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ | 40 | { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ |
42 | { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ | 41 | { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ |
43 | { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) | 42 | { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) |
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0fbe734cc38c..70806cae0d36 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c | |||
@@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) | |||
259 | * is required. | 259 | * is required. |
260 | * Note that caller must hold inode->i_lock. | 260 | * Note that caller must hold inode->i_lock. |
261 | */ | 261 | */ |
262 | static int | 262 | int |
263 | pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, | 263 | pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, |
264 | struct list_head *lseg_list) | 264 | struct list_head *lseg_list) |
265 | { | 265 | { |
@@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) | |||
334 | } | 334 | } |
335 | 335 | ||
336 | static void | 336 | static void |
337 | init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) | 337 | pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, |
338 | const struct pnfs_layout_range *range, | ||
339 | const nfs4_stateid *stateid) | ||
338 | { | 340 | { |
339 | INIT_LIST_HEAD(&lseg->pls_list); | 341 | INIT_LIST_HEAD(&lseg->pls_list); |
340 | INIT_LIST_HEAD(&lseg->pls_lc_list); | 342 | INIT_LIST_HEAD(&lseg->pls_lc_list); |
341 | atomic_set(&lseg->pls_refcount, 1); | 343 | atomic_set(&lseg->pls_refcount, 1); |
342 | smp_mb(); | ||
343 | set_bit(NFS_LSEG_VALID, &lseg->pls_flags); | 344 | set_bit(NFS_LSEG_VALID, &lseg->pls_flags); |
344 | lseg->pls_layout = lo; | 345 | lseg->pls_layout = lo; |
346 | lseg->pls_range = *range; | ||
347 | lseg->pls_seq = be32_to_cpu(stateid->seqid); | ||
345 | } | 348 | } |
346 | 349 | ||
347 | static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) | 350 | static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) |
@@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, | |||
486 | (end2 == NFS4_MAX_UINT64 || end2 > start1); | 489 | (end2 == NFS4_MAX_UINT64 || end2 > start1); |
487 | } | 490 | } |
488 | 491 | ||
489 | static bool | ||
490 | should_free_lseg(const struct pnfs_layout_range *lseg_range, | ||
491 | const struct pnfs_layout_range *recall_range) | ||
492 | { | ||
493 | return (recall_range->iomode == IOMODE_ANY || | ||
494 | lseg_range->iomode == recall_range->iomode) && | ||
495 | pnfs_lseg_range_intersecting(lseg_range, recall_range); | ||
496 | } | ||
497 | |||
498 | static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, | 492 | static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, |
499 | struct list_head *tmp_list) | 493 | struct list_head *tmp_list) |
500 | { | 494 | { |
@@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) | |||
533 | return (s32)(s1 - s2) > 0; | 527 | return (s32)(s1 - s2) > 0; |
534 | } | 528 | } |
535 | 529 | ||
530 | static bool | ||
531 | pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, | ||
532 | const struct pnfs_layout_range *recall_range) | ||
533 | { | ||
534 | return (recall_range->iomode == IOMODE_ANY || | ||
535 | lseg_range->iomode == recall_range->iomode) && | ||
536 | pnfs_lseg_range_intersecting(lseg_range, recall_range); | ||
537 | } | ||
538 | |||
539 | static bool | ||
540 | pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg, | ||
541 | const struct pnfs_layout_range *recall_range, | ||
542 | u32 seq) | ||
543 | { | ||
544 | if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq)) | ||
545 | return false; | ||
546 | if (recall_range == NULL) | ||
547 | return true; | ||
548 | return pnfs_should_free_range(&lseg->pls_range, recall_range); | ||
549 | } | ||
550 | |||
536 | /** | 551 | /** |
537 | * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later | 552 | * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later |
538 | * @lo: layout header containing the lsegs | 553 | * @lo: layout header containing the lsegs |
@@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, | |||
562 | if (list_empty(&lo->plh_segs)) | 577 | if (list_empty(&lo->plh_segs)) |
563 | return 0; | 578 | return 0; |
564 | list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) | 579 | list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) |
565 | if (!recall_range || | 580 | if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { |
566 | should_free_lseg(&lseg->pls_range, recall_range)) { | ||
567 | if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) | ||
568 | continue; | ||
569 | dprintk("%s: freeing lseg %p iomode %d seq %u" | 581 | dprintk("%s: freeing lseg %p iomode %d seq %u" |
570 | "offset %llu length %llu\n", __func__, | 582 | "offset %llu length %llu\n", __func__, |
571 | lseg, lseg->pls_range.iomode, lseg->pls_seq, | 583 | lseg, lseg->pls_range.iomode, lseg->pls_seq, |
@@ -761,24 +773,25 @@ void | |||
761 | pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, | 773 | pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, |
762 | bool update_barrier) | 774 | bool update_barrier) |
763 | { | 775 | { |
764 | u32 oldseq, newseq, new_barrier; | 776 | u32 oldseq, newseq, new_barrier = 0; |
765 | int empty = list_empty(&lo->plh_segs); | 777 | bool invalid = !pnfs_layout_is_valid(lo); |
766 | 778 | ||
767 | oldseq = be32_to_cpu(lo->plh_stateid.seqid); | 779 | oldseq = be32_to_cpu(lo->plh_stateid.seqid); |
768 | newseq = be32_to_cpu(new->seqid); | 780 | newseq = be32_to_cpu(new->seqid); |
769 | if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { | 781 | if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { |
770 | nfs4_stateid_copy(&lo->plh_stateid, new); | 782 | nfs4_stateid_copy(&lo->plh_stateid, new); |
771 | if (update_barrier) { | 783 | /* |
772 | new_barrier = be32_to_cpu(new->seqid); | 784 | * Because of wraparound, we want to keep the barrier |
773 | } else { | 785 | * "close" to the current seqids. |
774 | /* Because of wraparound, we want to keep the barrier | 786 | */ |
775 | * "close" to the current seqids. | 787 | new_barrier = newseq - atomic_read(&lo->plh_outstanding); |
776 | */ | ||
777 | new_barrier = newseq - atomic_read(&lo->plh_outstanding); | ||
778 | } | ||
779 | if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) | ||
780 | lo->plh_barrier = new_barrier; | ||
781 | } | 788 | } |
789 | if (update_barrier) | ||
790 | new_barrier = be32_to_cpu(new->seqid); | ||
791 | else if (new_barrier == 0) | ||
792 | return; | ||
793 | if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) | ||
794 | lo->plh_barrier = new_barrier; | ||
782 | } | 795 | } |
783 | 796 | ||
784 | static bool | 797 | static bool |
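The pnfs_set_layout_stateid() rework above accepts the new stateid whenever the layout is invalid or the seqid is newer, derives the default barrier from the new seqid minus the number of outstanding LAYOUTGETs, and only lets an explicit update_barrier request or a genuinely newer value move plh_barrier. A condensed user-space restatement of just that barrier decision (parameter names are illustrative; the stateid copy itself is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe comparison, as in pnfs_seqid_is_newer(). */
static bool seqid_is_newer(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

/* "invalid" mirrors !pnfs_layout_is_valid(lo), "outstanding" mirrors
 * plh_outstanding; returns the barrier the layout ends up with. */
static uint32_t pick_barrier(uint32_t oldseq, uint32_t newseq,
			     uint32_t old_barrier, uint32_t outstanding,
			     bool invalid, bool update_barrier)
{
	uint32_t new_barrier = 0;

	if (invalid || seqid_is_newer(newseq, oldseq))
		new_barrier = newseq - outstanding;	/* stateid accepted: default barrier */
	if (update_barrier)
		new_barrier = newseq;			/* explicit request pins the barrier */
	else if (new_barrier == 0)
		return old_barrier;			/* stateid rejected: leave it alone */
	if (invalid || seqid_is_newer(new_barrier, old_barrier))
		return new_barrier;
	return old_barrier;
}

int main(void)
{
	printf("%u\n", pick_barrier(5, 10, 5, 2, false, false));	/* 8: default barrier */
	printf("%u\n", pick_barrier(5, 10, 5, 2, false, true));		/* 10: pinned by the caller */
	return 0;
}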
@@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) | |||
873 | rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); | 886 | rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); |
874 | } | 887 | } |
875 | 888 | ||
889 | static void | ||
890 | pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) | ||
891 | { | ||
892 | lo->plh_return_iomode = 0; | ||
893 | lo->plh_return_seq = 0; | ||
894 | clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); | ||
895 | } | ||
896 | |||
876 | static bool | 897 | static bool |
877 | pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) | 898 | pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, |
899 | nfs4_stateid *stateid, | ||
900 | enum pnfs_iomode *iomode) | ||
878 | { | 901 | { |
879 | if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) | 902 | if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) |
880 | return false; | 903 | return false; |
881 | lo->plh_return_iomode = 0; | ||
882 | lo->plh_return_seq = 0; | ||
883 | pnfs_get_layout_hdr(lo); | 904 | pnfs_get_layout_hdr(lo); |
884 | clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); | 905 | if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { |
906 | if (stateid != NULL) { | ||
907 | nfs4_stateid_copy(stateid, &lo->plh_stateid); | ||
908 | if (lo->plh_return_seq != 0) | ||
909 | stateid->seqid = cpu_to_be32(lo->plh_return_seq); | ||
910 | } | ||
911 | if (iomode != NULL) | ||
912 | *iomode = lo->plh_return_iomode; | ||
913 | pnfs_clear_layoutreturn_info(lo); | ||
914 | return true; | ||
915 | } | ||
916 | if (stateid != NULL) | ||
917 | nfs4_stateid_copy(stateid, &lo->plh_stateid); | ||
918 | if (iomode != NULL) | ||
919 | *iomode = IOMODE_ANY; | ||
885 | return true; | 920 | return true; |
886 | } | 921 | } |
887 | 922 | ||
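pnfs_prepare_layoutreturn() now hands back the stateid and iomode to send in the same step that claims the NFS_LAYOUT_RETURN bit and clears the return-requested tracking, which is why the callers in the later hunks (pnfs_layoutreturn_before_put_layout_hdr, _pnfs_return_layout, pnfs_roc, pnfs_error_mark_layout_for_return) stop copying plh_stateid by hand. A schematic of that snapshot-then-clear pattern; the real function relies on the caller holding inode->i_lock, which this user-space sketch replaces with a mutex of its own, and all names here are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <string.h>

struct layout {
	pthread_mutex_t lock;		/* stands in for inode->i_lock */
	unsigned char stateid[16];
	int return_iomode;		/* 0 plays the role of IOMODE_ANY here */
	bool return_requested;
	bool returning;
};

/* Claim the "returning" state and snapshot what should be sent in one
 * critical section, so the caller transmits exactly the stateid and
 * iomode that were current when the return was claimed. */
static bool prepare_layoutreturn(struct layout *lo,
				 unsigned char stateid[16], int *iomode)
{
	bool send = false;

	pthread_mutex_lock(&lo->lock);
	if (!lo->returning) {
		lo->returning = true;
		memcpy(stateid, lo->stateid, sizeof(lo->stateid));
		if (lo->return_requested) {
			*iomode = lo->return_iomode;	/* targeted return */
			lo->return_requested = false;	/* clear the tracking atomically */
			lo->return_iomode = 0;
		} else {
			*iomode = 0;			/* return everything */
		}
		send = true;
	}
	pthread_mutex_unlock(&lo->lock);
	return send;
}

int main(void)
{
	static struct layout lo = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.return_requested = true,
		.return_iomode = 2,
	};
	unsigned char stateid[16];
	int iomode;

	return prepare_layoutreturn(&lo, stateid, &iomode) ? 0 : 1;
}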
@@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) | |||
949 | enum pnfs_iomode iomode; | 984 | enum pnfs_iomode iomode; |
950 | bool send; | 985 | bool send; |
951 | 986 | ||
952 | nfs4_stateid_copy(&stateid, &lo->plh_stateid); | 987 | send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); |
953 | stateid.seqid = cpu_to_be32(lo->plh_return_seq); | ||
954 | iomode = lo->plh_return_iomode; | ||
955 | send = pnfs_prepare_layoutreturn(lo); | ||
956 | spin_unlock(&inode->i_lock); | 988 | spin_unlock(&inode->i_lock); |
957 | if (send) { | 989 | if (send) { |
958 | /* Send an async layoutreturn so we dont deadlock */ | 990 | /* Send an async layoutreturn so we dont deadlock */ |
@@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino) | |||
989 | dprintk("NFS: %s no layout to return\n", __func__); | 1021 | dprintk("NFS: %s no layout to return\n", __func__); |
990 | goto out; | 1022 | goto out; |
991 | } | 1023 | } |
992 | nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); | ||
993 | /* Reference matched in nfs4_layoutreturn_release */ | 1024 | /* Reference matched in nfs4_layoutreturn_release */ |
994 | pnfs_get_layout_hdr(lo); | 1025 | pnfs_get_layout_hdr(lo); |
995 | empty = list_empty(&lo->plh_segs); | 1026 | empty = list_empty(&lo->plh_segs); |
@@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino) | |||
1012 | goto out_put_layout_hdr; | 1043 | goto out_put_layout_hdr; |
1013 | } | 1044 | } |
1014 | 1045 | ||
1015 | set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); | 1046 | send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); |
1016 | send = pnfs_prepare_layoutreturn(lo); | ||
1017 | spin_unlock(&ino->i_lock); | 1047 | spin_unlock(&ino->i_lock); |
1018 | pnfs_free_lseg_list(&tmp_list); | 1048 | pnfs_free_lseg_list(&tmp_list); |
1019 | if (send) | 1049 | if (send) |
@@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino) | |||
1080 | goto out_noroc; | 1110 | goto out_noroc; |
1081 | } | 1111 | } |
1082 | 1112 | ||
1083 | nfs4_stateid_copy(&stateid, &lo->plh_stateid); | ||
1084 | /* always send layoutreturn if being marked so */ | 1113 | /* always send layoutreturn if being marked so */ |
1085 | if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, | 1114 | if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) |
1086 | &lo->plh_flags)) | 1115 | layoutreturn = pnfs_prepare_layoutreturn(lo, |
1087 | layoutreturn = pnfs_prepare_layoutreturn(lo); | 1116 | &stateid, NULL); |
1088 | 1117 | ||
1089 | list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) | 1118 | list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) |
1090 | /* If we are sending layoutreturn, invalidate all valid lsegs */ | 1119 | /* If we are sending layoutreturn, invalidate all valid lsegs */ |
@@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) | |||
1132 | 1161 | ||
1133 | spin_lock(&ino->i_lock); | 1162 | spin_lock(&ino->i_lock); |
1134 | lo = NFS_I(ino)->layout; | 1163 | lo = NFS_I(ino)->layout; |
1135 | pnfs_mark_layout_returned_if_empty(lo); | ||
1136 | if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) | 1164 | if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) |
1137 | lo->plh_barrier = barrier; | 1165 | lo->plh_barrier = barrier; |
1138 | spin_unlock(&ino->i_lock); | 1166 | spin_unlock(&ino->i_lock); |
@@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino, | |||
1505 | struct pnfs_layout_segment *lseg = NULL; | 1533 | struct pnfs_layout_segment *lseg = NULL; |
1506 | nfs4_stateid stateid; | 1534 | nfs4_stateid stateid; |
1507 | long timeout = 0; | 1535 | long timeout = 0; |
1508 | unsigned long giveup = jiffies + rpc_get_timeout(server->client); | 1536 | unsigned long giveup = jiffies + (clp->cl_lease_time << 1); |
1509 | bool first; | 1537 | bool first; |
1510 | 1538 | ||
1511 | if (!pnfs_enabled_sb(NFS_SERVER(ino))) { | 1539 | if (!pnfs_enabled_sb(NFS_SERVER(ino))) { |
@@ -1645,33 +1673,44 @@ lookup_again: | |||
1645 | lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); | 1673 | lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); |
1646 | trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, | 1674 | trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, |
1647 | PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); | 1675 | PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); |
1676 | atomic_dec(&lo->plh_outstanding); | ||
1648 | if (IS_ERR(lseg)) { | 1677 | if (IS_ERR(lseg)) { |
1649 | switch(PTR_ERR(lseg)) { | 1678 | switch(PTR_ERR(lseg)) { |
1650 | case -ERECALLCONFLICT: | 1679 | case -EBUSY: |
1651 | if (time_after(jiffies, giveup)) | 1680 | if (time_after(jiffies, giveup)) |
1652 | lseg = NULL; | 1681 | lseg = NULL; |
1653 | /* Fallthrough */ | 1682 | break; |
1654 | case -EAGAIN: | 1683 | case -ERECALLCONFLICT: |
1655 | pnfs_put_layout_hdr(lo); | 1684 | /* Huh? We hold no layouts, how is there a recall? */ |
1656 | if (first) | 1685 | if (first) { |
1657 | pnfs_clear_first_layoutget(lo); | 1686 | lseg = NULL; |
1658 | if (lseg) { | 1687 | break; |
1659 | trace_pnfs_update_layout(ino, pos, count, | ||
1660 | iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); | ||
1661 | goto lookup_again; | ||
1662 | } | 1688 | } |
1689 | /* Destroy the existing layout and start over */ | ||
1690 | if (time_after(jiffies, giveup)) | ||
1691 | pnfs_destroy_layout(NFS_I(ino)); | ||
1663 | /* Fallthrough */ | 1692 | /* Fallthrough */ |
1693 | case -EAGAIN: | ||
1694 | break; | ||
1664 | default: | 1695 | default: |
1665 | if (!nfs_error_is_fatal(PTR_ERR(lseg))) { | 1696 | if (!nfs_error_is_fatal(PTR_ERR(lseg))) { |
1666 | pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); | 1697 | pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); |
1667 | lseg = NULL; | 1698 | lseg = NULL; |
1668 | } | 1699 | } |
1700 | goto out_put_layout_hdr; | ||
1701 | } | ||
1702 | if (lseg) { | ||
1703 | if (first) | ||
1704 | pnfs_clear_first_layoutget(lo); | ||
1705 | trace_pnfs_update_layout(ino, pos, count, | ||
1706 | iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); | ||
1707 | pnfs_put_layout_hdr(lo); | ||
1708 | goto lookup_again; | ||
1669 | } | 1709 | } |
1670 | } else { | 1710 | } else { |
1671 | pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); | 1711 | pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); |
1672 | } | 1712 | } |
1673 | 1713 | ||
1674 | atomic_dec(&lo->plh_outstanding); | ||
1675 | out_put_layout_hdr: | 1714 | out_put_layout_hdr: |
1676 | if (first) | 1715 | if (first) |
1677 | pnfs_clear_first_layoutget(lo); | 1716 | pnfs_clear_first_layoutget(lo); |
@@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) | |||
1735 | return lseg; | 1774 | return lseg; |
1736 | } | 1775 | } |
1737 | 1776 | ||
1738 | init_lseg(lo, lseg); | 1777 | pnfs_init_lseg(lo, lseg, &res->range, &res->stateid); |
1739 | lseg->pls_range = res->range; | ||
1740 | lseg->pls_seq = be32_to_cpu(res->stateid.seqid); | ||
1741 | 1778 | ||
1742 | spin_lock(&ino->i_lock); | 1779 | spin_lock(&ino->i_lock); |
1743 | if (pnfs_layoutgets_blocked(lo)) { | 1780 | if (pnfs_layoutgets_blocked(lo)) { |
@@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) | |||
1758 | * inode invalid, and don't bother validating the stateid | 1795 | * inode invalid, and don't bother validating the stateid |
1759 | * sequence number. | 1796 | * sequence number. |
1760 | */ | 1797 | */ |
1761 | pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); | 1798 | pnfs_mark_layout_stateid_invalid(lo, &free_me); |
1762 | 1799 | ||
1763 | nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); | 1800 | nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); |
1764 | lo->plh_barrier = be32_to_cpu(res->stateid.seqid); | 1801 | lo->plh_barrier = be32_to_cpu(res->stateid.seqid); |
1765 | } | 1802 | } |
1766 | 1803 | ||
1767 | clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); | ||
1768 | |||
1769 | pnfs_get_lseg(lseg); | 1804 | pnfs_get_lseg(lseg); |
1770 | pnfs_layout_insert_lseg(lo, lseg, &free_me); | 1805 | pnfs_layout_insert_lseg(lo, lseg, &free_me); |
1806 | if (!pnfs_layout_is_valid(lo)) { | ||
1807 | pnfs_clear_layoutreturn_info(lo); | ||
1808 | clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); | ||
1809 | } | ||
1810 | |||
1771 | 1811 | ||
1772 | if (res->return_on_close) | 1812 | if (res->return_on_close) |
1773 | set_bit(NFS_LSEG_ROC, &lseg->pls_flags); | 1813 | set_bit(NFS_LSEG_ROC, &lseg->pls_flags); |
@@ -1787,14 +1827,14 @@ static void | |||
1787 | pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, | 1827 | pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, |
1788 | u32 seq) | 1828 | u32 seq) |
1789 | { | 1829 | { |
1790 | if (lo->plh_return_iomode == iomode) | 1830 | if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) |
1791 | return; | ||
1792 | if (lo->plh_return_iomode != 0) | ||
1793 | iomode = IOMODE_ANY; | 1831 | iomode = IOMODE_ANY; |
1794 | lo->plh_return_iomode = iomode; | 1832 | lo->plh_return_iomode = iomode; |
1795 | set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); | 1833 | set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); |
1796 | if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) | 1834 | if (seq != 0) { |
1835 | WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); | ||
1797 | lo->plh_return_seq = seq; | 1836 | lo->plh_return_seq = seq; |
1837 | } | ||
1798 | } | 1838 | } |
1799 | 1839 | ||
1800 | /** | 1840 | /** |
@@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, | |||
1824 | assert_spin_locked(&lo->plh_inode->i_lock); | 1864 | assert_spin_locked(&lo->plh_inode->i_lock); |
1825 | 1865 | ||
1826 | list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) | 1866 | list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) |
1827 | if (should_free_lseg(&lseg->pls_range, return_range)) { | 1867 | if (pnfs_match_lseg_recall(lseg, return_range, seq)) { |
1828 | dprintk("%s: marking lseg %p iomode %d " | 1868 | dprintk("%s: marking lseg %p iomode %d " |
1829 | "offset %llu length %llu\n", __func__, | 1869 | "offset %llu length %llu\n", __func__, |
1830 | lseg, lseg->pls_range.iomode, | 1870 | lseg, lseg->pls_range.iomode, |
@@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, | |||
1855 | bool return_now = false; | 1895 | bool return_now = false; |
1856 | 1896 | ||
1857 | spin_lock(&inode->i_lock); | 1897 | spin_lock(&inode->i_lock); |
1858 | pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); | 1898 | pnfs_set_plh_return_info(lo, range.iomode, 0); |
1859 | /* | 1899 | /* |
1860 | * mark all matching lsegs so that we are sure to have no live | 1900 | * mark all matching lsegs so that we are sure to have no live |
1861 | * segments at hand when sending layoutreturn. See pnfs_put_lseg() | 1901 | * segments at hand when sending layoutreturn. See pnfs_put_lseg() |
1862 | * for how it works. | 1902 | * for how it works. |
1863 | */ | 1903 | */ |
1864 | if (!pnfs_mark_matching_lsegs_return(lo, &free_me, | 1904 | if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { |
1865 | &range, lseg->pls_seq)) { | ||
1866 | nfs4_stateid stateid; | 1905 | nfs4_stateid stateid; |
1867 | enum pnfs_iomode iomode = lo->plh_return_iomode; | 1906 | enum pnfs_iomode iomode; |
1868 | 1907 | ||
1869 | nfs4_stateid_copy(&stateid, &lo->plh_stateid); | 1908 | return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); |
1870 | return_now = pnfs_prepare_layoutreturn(lo); | ||
1871 | spin_unlock(&inode->i_lock); | 1909 | spin_unlock(&inode->i_lock); |
1872 | if (return_now) | 1910 | if (return_now) |
1873 | pnfs_send_layoutreturn(lo, &stateid, iomode, false); | 1911 | pnfs_send_layoutreturn(lo, &stateid, iomode, false); |
@@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) | |||
2382 | nfs_fattr_init(&data->fattr); | 2420 | nfs_fattr_init(&data->fattr); |
2383 | data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; | 2421 | data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; |
2384 | data->res.fattr = &data->fattr; | 2422 | data->res.fattr = &data->fattr; |
2385 | data->args.lastbytewritten = end_pos - 1; | 2423 | if (end_pos != 0) |
2424 | data->args.lastbytewritten = end_pos - 1; | ||
2425 | else | ||
2426 | data->args.lastbytewritten = U64_MAX; | ||
2386 | data->res.server = NFS_SERVER(inode); | 2427 | data->res.server = NFS_SERVER(inode); |
2387 | 2428 | ||
2388 | if (ld->prepare_layoutcommit) { | 2429 | if (ld->prepare_layoutcommit) { |
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index b21bd0bee784..31d99b2927b0 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h | |||
@@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, | |||
268 | struct list_head *tmp_list, | 268 | struct list_head *tmp_list, |
269 | const struct pnfs_layout_range *recall_range, | 269 | const struct pnfs_layout_range *recall_range, |
270 | u32 seq); | 270 | u32 seq); |
271 | int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, | ||
272 | struct list_head *lseg_list); | ||
271 | bool pnfs_roc(struct inode *ino); | 273 | bool pnfs_roc(struct inode *ino); |
272 | void pnfs_roc_release(struct inode *ino); | 274 | void pnfs_roc_release(struct inode *ino); |
273 | void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); | 275 | void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); |
@@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode) | |||
375 | return NFS_I(inode)->layout != NULL; | 377 | return NFS_I(inode)->layout != NULL; |
376 | } | 378 | } |
377 | 379 | ||
380 | static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) | ||
381 | { | ||
382 | return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0; | ||
383 | } | ||
384 | |||
378 | static inline struct nfs4_deviceid_node * | 385 | static inline struct nfs4_deviceid_node * |
379 | nfs4_get_deviceid(struct nfs4_deviceid_node *d) | 386 | nfs4_get_deviceid(struct nfs4_deviceid_node *d) |
380 | { | 387 | { |
@@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end) | |||
545 | return 1 + end - offset; | 552 | return 1 + end - offset; |
546 | } | 553 | } |
547 | 554 | ||
548 | /** | ||
549 | * pnfs_mark_layout_returned_if_empty - marks the layout as returned | ||
550 | * @lo: layout header | ||
551 | * | ||
552 | * Note: Caller must hold inode->i_lock | ||
553 | */ | ||
554 | static inline void | ||
555 | pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) | ||
556 | { | ||
557 | if (list_empty(&lo->plh_segs)) | ||
558 | set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); | ||
559 | } | ||
560 | |||
561 | static inline void | 555 | static inline void |
562 | pnfs_copy_range(struct pnfs_layout_range *dst, | 556 | pnfs_copy_range(struct pnfs_layout_range *dst, |
563 | const struct pnfs_layout_range *src) | 557 | const struct pnfs_layout_range *src) |
@@ -629,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync) | |||
629 | } | 623 | } |
630 | 624 | ||
631 | static inline bool | 625 | static inline bool |
626 | pnfs_layoutcommit_outstanding(struct inode *inode) | ||
627 | { | ||
628 | return false; | ||
629 | } | ||
630 | |||
631 | |||
632 | static inline bool | ||
632 | pnfs_roc(struct inode *ino) | 633 | pnfs_roc(struct inode *ino) |
633 | { | 634 | { |
634 | return false; | 635 | return false; |
@@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, | |||
716 | return false; | 717 | return false; |
717 | } | 718 | } |
718 | 719 | ||
719 | static inline bool | ||
720 | pnfs_layoutcommit_outstanding(struct inode *inode) | ||
721 | { | ||
722 | return false; | ||
723 | } | ||
724 | |||
725 | |||
726 | static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) | 720 | static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) |
727 | { | 721 | { |
728 | return NULL; | 722 | return NULL; |
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index b38e3c0dc790..f3468b57a32a 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c | |||
@@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) | |||
595 | } | 595 | } |
596 | 596 | ||
597 | static struct nfs_client *(*get_v3_ds_connect)( | 597 | static struct nfs_client *(*get_v3_ds_connect)( |
598 | struct nfs_client *mds_clp, | 598 | struct nfs_server *mds_srv, |
599 | const struct sockaddr *ds_addr, | 599 | const struct sockaddr *ds_addr, |
600 | int ds_addrlen, | 600 | int ds_addrlen, |
601 | int ds_proto, | 601 | int ds_proto, |
@@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, | |||
654 | rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, | 654 | rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, |
655 | rpc_clnt_test_and_add_xprt, NULL); | 655 | rpc_clnt_test_and_add_xprt, NULL); |
656 | } else | 656 | } else |
657 | clp = get_v3_ds_connect(mds_srv->nfs_client, | 657 | clp = get_v3_ds_connect(mds_srv, |
658 | (struct sockaddr *)&da->da_addr, | 658 | (struct sockaddr *)&da->da_addr, |
659 | da->da_addrlen, IPPROTO_TCP, | 659 | da->da_addrlen, IPPROTO_TCP, |
660 | timeo, retrans, au_flavor); | 660 | timeo, retrans, au_flavor); |
@@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, | |||
690 | dprintk("%s: DS %s: trying address %s\n", | 690 | dprintk("%s: DS %s: trying address %s\n", |
691 | __func__, ds->ds_remotestr, da->da_remotestr); | 691 | __func__, ds->ds_remotestr, da->da_remotestr); |
692 | 692 | ||
693 | clp = nfs4_set_ds_client(mds_srv->nfs_client, | 693 | clp = nfs4_set_ds_client(mds_srv, |
694 | (struct sockaddr *)&da->da_addr, | 694 | (struct sockaddr *)&da->da_addr, |
695 | da->da_addrlen, IPPROTO_TCP, | 695 | da->da_addrlen, IPPROTO_TCP, |
696 | timeo, retrans, minor_version, | 696 | timeo, retrans, minor_version, |
@@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); | |||
940 | int | 940 | int |
941 | pnfs_nfs_generic_sync(struct inode *inode, bool datasync) | 941 | pnfs_nfs_generic_sync(struct inode *inode, bool datasync) |
942 | { | 942 | { |
943 | int ret; | ||
944 | |||
945 | if (!pnfs_layoutcommit_outstanding(inode)) | ||
946 | return 0; | ||
947 | ret = nfs_commit_inode(inode, FLUSH_SYNC); | ||
948 | if (ret < 0) | ||
949 | return ret; | ||
943 | if (datasync) | 950 | if (datasync) |
944 | return 0; | 951 | return 0; |
945 | return pnfs_layoutcommit_inode(inode, true); | 952 | return pnfs_layoutcommit_inode(inode, true); |
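pnfs_nfs_generic_sync() above now returns immediately when no LAYOUTCOMMIT is pending, and otherwise commits dirty data with FLUSH_SYNC before issuing the LAYOUTCOMMIT, so the metadata update never runs ahead of the data it describes. The same ordering in schematic form, with hypothetical stand-ins for pnfs_layoutcommit_outstanding(), nfs_commit_inode() and pnfs_layoutcommit_inode():

#include <stdbool.h>

static bool layoutcommit_outstanding(void) { return true; }
static int commit_data(void) { return 0; }
static int layoutcommit_metadata(void) { return 0; }

static int generic_sync(bool datasync)
{
	int ret;

	if (!layoutcommit_outstanding())
		return 0;			/* nothing pending: cheap no-op */
	ret = commit_data();			/* flush data first ... */
	if (ret < 0)
		return ret;
	if (datasync)
		return 0;			/* fdatasync(): data alone is enough */
	return layoutcommit_metadata();		/* ... then commit the layout metadata */
}

int main(void)
{
	return generic_sync(false);
}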
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2137e0202f25..18d446e1a82b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
@@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, | |||
1684 | { | 1684 | { |
1685 | rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; | 1685 | rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; |
1686 | unsigned int i; | 1686 | unsigned int i; |
1687 | int use_auth_null = false; | ||
1687 | 1688 | ||
1688 | /* | 1689 | /* |
1689 | * If the sec= mount option is used, the specified flavor or AUTH_NULL | 1690 | * If the sec= mount option is used, the specified flavor or AUTH_NULL |
@@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, | |||
1691 | * | 1692 | * |
1692 | * AUTH_NULL has a special meaning when it's in the server list - it | 1693 | * AUTH_NULL has a special meaning when it's in the server list - it |
1693 | * means that the server will ignore the rpc creds, so any flavor | 1694 | * means that the server will ignore the rpc creds, so any flavor |
1694 | * can be used. | 1695 | * can be used but still use the sec= that was specified. |
1695 | */ | 1696 | */ |
1696 | for (i = 0; i < count; i++) { | 1697 | for (i = 0; i < count; i++) { |
1697 | flavor = server_authlist[i]; | 1698 | flavor = server_authlist[i]; |
1698 | 1699 | ||
1699 | if (nfs_auth_info_match(&args->auth_info, flavor) || | 1700 | if (nfs_auth_info_match(&args->auth_info, flavor)) |
1700 | flavor == RPC_AUTH_NULL) | ||
1701 | goto out; | 1701 | goto out; |
1702 | |||
1703 | if (flavor == RPC_AUTH_NULL) | ||
1704 | use_auth_null = true; | ||
1705 | } | ||
1706 | |||
1707 | if (use_auth_null) { | ||
1708 | flavor = RPC_AUTH_NULL; | ||
1709 | goto out; | ||
1702 | } | 1710 | } |
1703 | 1711 | ||
1704 | dfprintk(MOUNT, | 1712 | dfprintk(MOUNT, |
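The nfs_verify_authflavors() change above reorders the scan of the server's flavour list: a flavour matching the sec= option wins as soon as it is seen, while AUTH_NULL in the list is only remembered and used once the whole list has been checked without an exact match. A standalone sketch of that preference order; matches(), AUTH_NULL and the flavour values are stand-ins for nfs_auth_info_match() and the RPC flavour constants:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum { AUTH_NULL = 0, AUTH_SYS = 1, AUTH_KRB5 = 390003 };	/* illustrative values */

static bool matches(int wanted, int flavor) { return wanted == flavor; }

/* Returns the flavour to use, or -1 if the server offers nothing usable. */
static int pick_flavor(int wanted, const int *server_list, size_t count)
{
	bool use_auth_null = false;

	for (size_t i = 0; i < count; i++) {
		if (matches(wanted, server_list[i]))
			return server_list[i];		/* exact match wins immediately */
		if (server_list[i] == AUTH_NULL)
			use_auth_null = true;		/* remember, keep scanning */
	}
	return use_auth_null ? AUTH_NULL : -1;
}

int main(void)
{
	int server_list[] = { AUTH_NULL, AUTH_KRB5 };

	/* Previously AUTH_NULL at position 0 ended the scan; now the
	 * requested krb5 flavour is still found behind it. */
	printf("%d\n", pick_flavor(AUTH_KRB5, server_list, 2));
	return 0;
}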
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 593fa21a02c0..3a6724c6eb5f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page, | |||
625 | int err; | 625 | int err; |
626 | 626 | ||
627 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); | 627 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); |
628 | nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), | 628 | nfs_pageio_init_write(&pgio, inode, 0, |
629 | false, &nfs_async_write_completion_ops); | 629 | false, &nfs_async_write_completion_ops); |
630 | err = nfs_do_writepage(page, wbc, &pgio, launder); | 630 | err = nfs_do_writepage(page, wbc, &pgio, launder); |
631 | nfs_pageio_complete(&pgio); | 631 | nfs_pageio_complete(&pgio); |
@@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * | |||
657 | int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) | 657 | int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) |
658 | { | 658 | { |
659 | struct inode *inode = mapping->host; | 659 | struct inode *inode = mapping->host; |
660 | unsigned long *bitlock = &NFS_I(inode)->flags; | ||
661 | struct nfs_pageio_descriptor pgio; | 660 | struct nfs_pageio_descriptor pgio; |
662 | int err; | 661 | int err; |
663 | 662 | ||
664 | /* Stop dirtying of new pages while we sync */ | ||
665 | err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, | ||
666 | nfs_wait_bit_killable, TASK_KILLABLE); | ||
667 | if (err) | ||
668 | goto out_err; | ||
669 | |||
670 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); | 663 | nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); |
671 | 664 | ||
672 | nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, | 665 | nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, |
@@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
674 | err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); | 667 | err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); |
675 | nfs_pageio_complete(&pgio); | 668 | nfs_pageio_complete(&pgio); |
676 | 669 | ||
677 | clear_bit_unlock(NFS_INO_FLUSHING, bitlock); | ||
678 | smp_mb__after_atomic(); | ||
679 | wake_up_bit(bitlock, NFS_INO_FLUSHING); | ||
680 | |||
681 | if (err < 0) | 670 | if (err < 0) |
682 | goto out_err; | 671 | goto out_err; |
683 | err = pgio.pg_error; | 672 | err = pgio.pg_error; |
@@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode) | |||
1195 | /* | 1184 | /* |
1196 | * Test if the open context credential key is marked to expire soon. | 1185 | * Test if the open context credential key is marked to expire soon. |
1197 | */ | 1186 | */ |
1198 | bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) | 1187 | bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) |
1199 | { | 1188 | { |
1200 | return rpcauth_cred_key_to_expire(ctx->cred); | 1189 | struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; |
1190 | |||
1191 | return rpcauth_cred_key_to_expire(auth, ctx->cred); | ||
1201 | } | 1192 | } |
1202 | 1193 | ||
1203 | /* | 1194 | /* |
@@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
1289 | dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", | 1280 | dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", |
1290 | file, count, (long long)(page_file_offset(page) + offset)); | 1281 | file, count, (long long)(page_file_offset(page) + offset)); |
1291 | 1282 | ||
1283 | if (!count) | ||
1284 | goto out; | ||
1285 | |||
1292 | if (nfs_can_extend_write(file, page, inode)) { | 1286 | if (nfs_can_extend_write(file, page, inode)) { |
1293 | count = max(count + offset, nfs_page_length(page)); | 1287 | count = max(count + offset, nfs_page_length(page)); |
1294 | offset = 0; | 1288 | offset = 0; |
@@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
1299 | nfs_set_pageerror(page); | 1293 | nfs_set_pageerror(page); |
1300 | else | 1294 | else |
1301 | __set_page_dirty_nobuffers(page); | 1295 | __set_page_dirty_nobuffers(page); |
1302 | 1296 | out: | |
1303 | dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", | 1297 | dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", |
1304 | status, (long long)i_size_read(inode)); | 1298 | status, (long long)i_size_read(inode)); |
1305 | return status; | 1299 | return status; |
@@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) | |||
1800 | 1794 | ||
1801 | /* Okay, COMMIT succeeded, apparently. Check the verifier | 1795 | /* Okay, COMMIT succeeded, apparently. Check the verifier |
1802 | * returned by the server against all stored verfs. */ | 1796 | * returned by the server against all stored verfs. */ |
1803 | if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { | 1797 | if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { |
1804 | /* We have a match */ | 1798 | /* We have a match */ |
1805 | nfs_inode_remove_request(req); | 1799 | nfs_inode_remove_request(req); |
1806 | dprintk(" OK\n"); | 1800 | dprintk(" OK\n"); |
@@ -1924,6 +1918,24 @@ out_mark_dirty: | |||
1924 | EXPORT_SYMBOL_GPL(nfs_write_inode); | 1918 | EXPORT_SYMBOL_GPL(nfs_write_inode); |
1925 | 1919 | ||
1926 | /* | 1920 | /* |
1921 | * Wrapper for filemap_write_and_wait_range() | ||
1922 | * | ||
1923 | * Needed for pNFS in order to ensure data becomes visible to the | ||
1924 | * client. | ||
1925 | */ | ||
1926 | int nfs_filemap_write_and_wait_range(struct address_space *mapping, | ||
1927 | loff_t lstart, loff_t lend) | ||
1928 | { | ||
1929 | int ret; | ||
1930 | |||
1931 | ret = filemap_write_and_wait_range(mapping, lstart, lend); | ||
1932 | if (ret == 0) | ||
1933 | ret = pnfs_sync_inode(mapping->host, true); | ||
1934 | return ret; | ||
1935 | } | ||
1936 | EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range); | ||
1937 | |||
1938 | /* | ||
1927 | * flush the inode to disk. | 1939 | * flush the inode to disk. |
1928 | */ | 1940 | */ |
1929 | int nfs_wb_all(struct inode *inode) | 1941 | int nfs_wb_all(struct inode *inode) |
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d71278c3c5bd..810124b33327 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h | |||
@@ -205,12 +205,12 @@ struct nfs_inode { | |||
205 | #define NFS_INO_STALE (1) /* possible stale inode */ | 205 | #define NFS_INO_STALE (1) /* possible stale inode */ |
206 | #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ | 206 | #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ |
207 | #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ | 207 | #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ |
208 | #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ | ||
209 | #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ | 208 | #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ |
210 | #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ | 209 | #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ |
211 | #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ | 210 | #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ |
212 | #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ | 211 | #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ |
213 | #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ | 212 | #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ |
213 | #define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ | ||
214 | 214 | ||
215 | static inline struct nfs_inode *NFS_I(const struct inode *inode) | 215 | static inline struct nfs_inode *NFS_I(const struct inode *inode) |
216 | { | 216 | { |
@@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino | |||
351 | extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); | 351 | extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); |
352 | extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); | 352 | extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); |
353 | extern int nfs_revalidate_mapping_rcu(struct inode *inode); | 353 | extern int nfs_revalidate_mapping_rcu(struct inode *inode); |
354 | extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); | ||
355 | extern int nfs_setattr(struct dentry *, struct iattr *); | 354 | extern int nfs_setattr(struct dentry *, struct iattr *); |
356 | extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); | 355 | extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); |
357 | extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, | 356 | extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, |
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c304a11b5b1a..82b81a1c2438 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h | |||
@@ -1596,9 +1596,8 @@ struct nfs_rpc_ops { | |||
1596 | int (*have_delegation)(struct inode *, fmode_t); | 1596 | int (*have_delegation)(struct inode *, fmode_t); |
1597 | int (*return_delegation)(struct inode *); | 1597 | int (*return_delegation)(struct inode *); |
1598 | struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); | 1598 | struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); |
1599 | struct nfs_client * | 1599 | struct nfs_client *(*init_client) (struct nfs_client *, |
1600 | (*init_client) (struct nfs_client *, const struct rpc_timeout *, | 1600 | const struct nfs_client_initdata *); |
1601 | const char *); | ||
1602 | void (*free_client) (struct nfs_client *); | 1601 | void (*free_client) (struct nfs_client *); |
1603 | struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); | 1602 | struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); |
1604 | struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, | 1603 | struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, |
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 899791573a40..4ccf184e971f 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h | |||
@@ -37,7 +37,6 @@ struct rpcsec_gss_info; | |||
37 | 37 | ||
38 | /* auth_cred ac_flags bits */ | 38 | /* auth_cred ac_flags bits */ |
39 | enum { | 39 | enum { |
40 | RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */ | ||
41 | RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ | 40 | RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ |
42 | RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying | 41 | RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying |
43 | key will expire soon */ | 42 | key will expire soon */ |
@@ -82,6 +81,9 @@ struct rpc_cred { | |||
82 | 81 | ||
83 | #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 | 82 | #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 |
84 | 83 | ||
84 | /* rpc_auth au_flags */ | ||
85 | #define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */ | ||
86 | |||
85 | /* | 87 | /* |
86 | * Client authentication handle | 88 | * Client authentication handle |
87 | */ | 89 | */ |
@@ -107,6 +109,9 @@ struct rpc_auth { | |||
107 | /* per-flavor data */ | 109 | /* per-flavor data */ |
108 | }; | 110 | }; |
109 | 111 | ||
112 | /* rpc_auth au_flags */ | ||
113 | #define RPCAUTH_AUTH_DATATOUCH 0x00000002 | ||
114 | |||
110 | struct rpc_auth_create_args { | 115 | struct rpc_auth_create_args { |
111 | rpc_authflavor_t pseudoflavor; | 116 | rpc_authflavor_t pseudoflavor; |
112 | const char *target_name; | 117 | const char *target_name; |
@@ -196,7 +201,7 @@ void rpcauth_destroy_credcache(struct rpc_auth *); | |||
196 | void rpcauth_clear_credcache(struct rpc_cred_cache *); | 201 | void rpcauth_clear_credcache(struct rpc_cred_cache *); |
197 | int rpcauth_key_timeout_notify(struct rpc_auth *, | 202 | int rpcauth_key_timeout_notify(struct rpc_auth *, |
198 | struct rpc_cred *); | 203 | struct rpc_cred *); |
199 | bool rpcauth_cred_key_to_expire(struct rpc_cred *); | 204 | bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *); |
200 | char * rpcauth_stringify_acceptor(struct rpc_cred *); | 205 | char * rpcauth_stringify_acceptor(struct rpc_cred *); |
201 | 206 | ||
202 | static inline | 207 | static inline |
diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 1f911ccb2a75..68ec78c1aa48 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h | |||
@@ -73,6 +73,7 @@ u32 gss_delete_sec_context( | |||
73 | rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, | 73 | rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, |
74 | u32 service); | 74 | u32 service); |
75 | u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); | 75 | u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); |
76 | bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor); | ||
76 | char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); | 77 | char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); |
77 | 78 | ||
78 | struct pf_desc { | 79 | struct pf_desc { |
@@ -81,6 +82,7 @@ struct pf_desc { | |||
81 | u32 service; | 82 | u32 service; |
82 | char *name; | 83 | char *name; |
83 | char *auth_domain_name; | 84 | char *auth_domain_name; |
85 | bool datatouch; | ||
84 | }; | 86 | }; |
85 | 87 | ||
86 | /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and | 88 | /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and |
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 05a1809c44d9..817af0b4385e 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h | |||
@@ -230,6 +230,10 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *, | |||
230 | struct rpc_task *); | 230 | struct rpc_task *); |
231 | void rpc_wake_up(struct rpc_wait_queue *); | 231 | void rpc_wake_up(struct rpc_wait_queue *); |
232 | struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); | 232 | struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); |
233 | struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, | ||
234 | struct rpc_wait_queue *, | ||
235 | bool (*)(struct rpc_task *, void *), | ||
236 | void *); | ||
233 | struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, | 237 | struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, |
234 | bool (*)(struct rpc_task *, void *), | 238 | bool (*)(struct rpc_task *, void *), |
235 | void *); | 239 | void *); |
@@ -247,6 +251,7 @@ void rpc_show_tasks(struct net *); | |||
247 | int rpc_init_mempool(void); | 251 | int rpc_init_mempool(void); |
248 | void rpc_destroy_mempool(void); | 252 | void rpc_destroy_mempool(void); |
249 | extern struct workqueue_struct *rpciod_workqueue; | 253 | extern struct workqueue_struct *rpciod_workqueue; |
254 | extern struct workqueue_struct *xprtiod_workqueue; | ||
250 | void rpc_prepare_task(struct rpc_task *task); | 255 | void rpc_prepare_task(struct rpc_task *task); |
251 | 256 | ||
252 | static inline int rpc_wait_for_completion_task(struct rpc_task *task) | 257 | static inline int rpc_wait_for_completion_task(struct rpc_task *task) |
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 0ece4ba06f06..bef3fb0abb8f 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h | |||
@@ -80,6 +80,7 @@ struct sock_xprt { | |||
80 | #define TCP_RPC_REPLY (1UL << 6) | 80 | #define TCP_RPC_REPLY (1UL << 6) |
81 | 81 | ||
82 | #define XPRT_SOCK_CONNECTING 1U | 82 | #define XPRT_SOCK_CONNECTING 1U |
83 | #define XPRT_SOCK_DATA_READY (2) | ||
83 | 84 | ||
84 | #endif /* __KERNEL__ */ | 85 | #endif /* __KERNEL__ */ |
85 | 86 | ||
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 040ff627c18a..a7e42f9a405c 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c | |||
@@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) | |||
51 | ret = kstrtoul(val, 0, &num); | 51 | ret = kstrtoul(val, 0, &num); |
52 | if (ret == -EINVAL) | 52 | if (ret == -EINVAL) |
53 | goto out_inval; | 53 | goto out_inval; |
54 | nbits = fls(num); | 54 | nbits = fls(num - 1); |
55 | if (num > (1U << nbits)) | ||
56 | nbits++; | ||
57 | if (nbits > MAX_HASHTABLE_BITS || nbits < 2) | 55 | if (nbits > MAX_HASHTABLE_BITS || nbits < 2) |
58 | goto out_inval; | 56 | goto out_inval; |
59 | *(unsigned int *)kp->arg = nbits; | 57 | *(unsigned int *)kp->arg = nbits; |
@@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) | |||
359 | EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); | 357 | EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); |
360 | 358 | ||
361 | bool | 359 | bool |
362 | rpcauth_cred_key_to_expire(struct rpc_cred *cred) | 360 | rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) |
363 | { | 361 | { |
362 | if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) | ||
363 | return false; | ||
364 | if (!cred->cr_ops->crkey_to_expire) | 364 | if (!cred->cr_ops->crkey_to_expire) |
365 | return false; | 365 | return false; |
366 | return cred->cr_ops->crkey_to_expire(cred); | 366 | return cred->cr_ops->crkey_to_expire(cred); |
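The hash table sizing in param_set_hashtbl_sz() is also simplified: the old code used fls(num), i.e. floor(log2(num)) + 1, and its follow-up increment could never fire, so an exact power of two was given one more bit (twice the buckets) than requested; fls(num - 1) yields ceil(log2(num)) for num >= 2. A small user-space sketch, with fls_demo() as a portable stand-in for the kernel's fls():

#include <stdio.h>

/* Stand-in for the kernel's fls(): 1-based index of the highest set bit,
 * 0 for an input of 0. */
static int fls_demo(unsigned long x)
{
        int bits = 0;

        while (x) {
                bits++;
                x >>= 1;
        }
        return bits;
}

int main(void)
{
        unsigned long sizes[] = { 5, 8, 9, 16, 1000 };

        for (int i = 0; i < 5; i++) {
                unsigned long num = sizes[i];
                int old_nbits = fls_demo(num);          /* old: floor(log2(num)) + 1 */
                int new_nbits = fls_demo(num - 1);      /* new: ceil(log2(num)) */

                if (num > (1UL << old_nbits))           /* removed branch: never true */
                        old_nbits++;
                printf("num=%4lu old=%d new=%d\n", num, old_nbits, new_nbits);
        }
        return 0;
}

Only exact powers of two change: 8 entries used to get 4 bits (16 buckets) and now get 3 (8 buckets); 5, 9 and 1000 are unaffected.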
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 54dd3fdead54..168219535a34 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c | |||
@@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) | |||
224 | 224 | ||
225 | 225 | ||
226 | /* Fast track for non crkey_timeout (no key) underlying credentials */ | 226 | /* Fast track for non crkey_timeout (no key) underlying credentials */ |
227 | if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) | 227 | if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) |
228 | return 0; | 228 | return 0; |
229 | 229 | ||
230 | /* Fast track for the normal case */ | 230 | /* Fast track for the normal case */ |
@@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) | |||
236 | if (IS_ERR(tcred)) | 236 | if (IS_ERR(tcred)) |
237 | return -EACCES; | 237 | return -EACCES; |
238 | 238 | ||
239 | if (!tcred->cr_ops->crkey_timeout) { | ||
240 | set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); | ||
241 | ret = 0; | ||
242 | goto out_put; | ||
243 | } | ||
244 | |||
245 | /* Test for the almost error case */ | 239 | /* Test for the almost error case */ |
246 | ret = tcred->cr_ops->crkey_timeout(tcred); | 240 | ret = tcred->cr_ops->crkey_timeout(tcred); |
247 | if (ret != 0) { | 241 | if (ret != 0) { |
@@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) | |||
257 | set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); | 251 | set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); |
258 | } | 252 | } |
259 | 253 | ||
260 | out_put: | ||
261 | put_rpccred(tcred); | 254 | put_rpccred(tcred); |
262 | return ret; | 255 | return ret; |
263 | } | 256 | } |
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index e64ae93d5b4f..23c8e7c39656 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c | |||
@@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) | |||
1015 | auth = &gss_auth->rpc_auth; | 1015 | auth = &gss_auth->rpc_auth; |
1016 | auth->au_cslack = GSS_CRED_SLACK >> 2; | 1016 | auth->au_cslack = GSS_CRED_SLACK >> 2; |
1017 | auth->au_rslack = GSS_VERF_SLACK >> 2; | 1017 | auth->au_rslack = GSS_VERF_SLACK >> 2; |
1018 | auth->au_flags = 0; | ||
1018 | auth->au_ops = &authgss_ops; | 1019 | auth->au_ops = &authgss_ops; |
1019 | auth->au_flavor = flavor; | 1020 | auth->au_flavor = flavor; |
1021 | if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) | ||
1022 | auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; | ||
1020 | atomic_set(&auth->au_count, 1); | 1023 | atomic_set(&auth->au_count, 1); |
1021 | kref_init(&gss_auth->kref); | 1024 | kref_init(&gss_auth->kref); |
1022 | 1025 | ||
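With the au_flags initialisation above, a GSS auth is tagged RPCAUTH_AUTH_DATATOUCH whenever the chosen pseudoflavor has to read or rewrite the RPC payload itself (the krb5i and krb5p entries below set .datatouch). A hypothetical consumer sketch; the helper name and its call site are assumptions, but cr_auth and au_flags are existing fields:

#include <linux/sunrpc/auth.h>

/* Hypothetical check a caller could make before deciding how to move
 * the payload for this request's security flavour. */
static bool example_flavor_touches_data(const struct rpc_cred *cred)
{
        return cred->cr_auth->au_flags & RPCAUTH_AUTH_DATATOUCH;
}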
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 65427492b1c9..60595835317a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c | |||
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = { | |||
745 | .qop = GSS_C_QOP_DEFAULT, | 745 | .qop = GSS_C_QOP_DEFAULT, |
746 | .service = RPC_GSS_SVC_INTEGRITY, | 746 | .service = RPC_GSS_SVC_INTEGRITY, |
747 | .name = "krb5i", | 747 | .name = "krb5i", |
748 | .datatouch = true, | ||
748 | }, | 749 | }, |
749 | [2] = { | 750 | [2] = { |
750 | .pseudoflavor = RPC_AUTH_GSS_KRB5P, | 751 | .pseudoflavor = RPC_AUTH_GSS_KRB5P, |
751 | .qop = GSS_C_QOP_DEFAULT, | 752 | .qop = GSS_C_QOP_DEFAULT, |
752 | .service = RPC_GSS_SVC_PRIVACY, | 753 | .service = RPC_GSS_SVC_PRIVACY, |
753 | .name = "krb5p", | 754 | .name = "krb5p", |
755 | .datatouch = true, | ||
754 | }, | 756 | }, |
755 | }; | 757 | }; |
756 | 758 | ||
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 7063d856a598..5fec3abbe19b 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c | |||
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) | |||
361 | } | 361 | } |
362 | EXPORT_SYMBOL(gss_pseudoflavor_to_service); | 362 | EXPORT_SYMBOL(gss_pseudoflavor_to_service); |
363 | 363 | ||
364 | bool | ||
365 | gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor) | ||
366 | { | ||
367 | int i; | ||
368 | |||
369 | for (i = 0; i < gm->gm_pf_num; i++) { | ||
370 | if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) | ||
371 | return gm->gm_pfs[i].datatouch; | ||
372 | } | ||
373 | return false; | ||
374 | } | ||
375 | |||
364 | char * | 376 | char * |
365 | gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) | 377 | gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) |
366 | { | 378 | { |
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 8d9eb4d5ddd8..4d17376b2acb 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c | |||
@@ -115,6 +115,7 @@ static | |||
115 | struct rpc_auth null_auth = { | 115 | struct rpc_auth null_auth = { |
116 | .au_cslack = NUL_CALLSLACK, | 116 | .au_cslack = NUL_CALLSLACK, |
117 | .au_rslack = NUL_REPLYSLACK, | 117 | .au_rslack = NUL_REPLYSLACK, |
118 | .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, | ||
118 | .au_ops = &authnull_ops, | 119 | .au_ops = &authnull_ops, |
119 | .au_flavor = RPC_AUTH_NULL, | 120 | .au_flavor = RPC_AUTH_NULL, |
120 | .au_count = ATOMIC_INIT(0), | 121 | .au_count = ATOMIC_INIT(0), |
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 9f65452b7cbc..a99278c984e8 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c | |||
@@ -228,6 +228,7 @@ static | |||
228 | struct rpc_auth unix_auth = { | 228 | struct rpc_auth unix_auth = { |
229 | .au_cslack = UNX_CALLSLACK, | 229 | .au_cslack = UNX_CALLSLACK, |
230 | .au_rslack = NUL_REPLYSLACK, | 230 | .au_rslack = NUL_REPLYSLACK, |
231 | .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, | ||
231 | .au_ops = &authunix_ops, | 232 | .au_ops = &authunix_ops, |
232 | .au_flavor = RPC_AUTH_UNIX, | 233 | .au_flavor = RPC_AUTH_UNIX, |
233 | .au_count = ATOMIC_INIT(0), | 234 | .au_count = ATOMIC_INIT(0), |
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2808d550d273..cb49898a5a58 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c | |||
@@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata) | |||
2577 | kfree(data); | 2577 | kfree(data); |
2578 | } | 2578 | } |
2579 | 2579 | ||
2580 | const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { | 2580 | static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { |
2581 | .rpc_call_done = rpc_cb_add_xprt_done, | 2581 | .rpc_call_done = rpc_cb_add_xprt_done, |
2582 | .rpc_release = rpc_cb_add_xprt_release, | 2582 | .rpc_release = rpc_cb_add_xprt_release, |
2583 | }; | 2583 | }; |
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fcfd48d263f6..9ae588511aaf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c | |||
@@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue; | |||
54 | /* | 54 | /* |
55 | * rpciod-related stuff | 55 | * rpciod-related stuff |
56 | */ | 56 | */ |
57 | struct workqueue_struct *rpciod_workqueue; | 57 | struct workqueue_struct *rpciod_workqueue __read_mostly; |
58 | struct workqueue_struct *xprtiod_workqueue __read_mostly; | ||
58 | 59 | ||
59 | /* | 60 | /* |
60 | * Disable the timer for a given RPC task. Should be called with | 61 | * Disable the timer for a given RPC task. Should be called with |
@@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); | |||
329 | * lockless RPC_IS_QUEUED() test) before we've had a chance to test | 330 | * lockless RPC_IS_QUEUED() test) before we've had a chance to test |
330 | * the RPC_TASK_RUNNING flag. | 331 | * the RPC_TASK_RUNNING flag. |
331 | */ | 332 | */ |
332 | static void rpc_make_runnable(struct rpc_task *task) | 333 | static void rpc_make_runnable(struct workqueue_struct *wq, |
334 | struct rpc_task *task) | ||
333 | { | 335 | { |
334 | bool need_wakeup = !rpc_test_and_set_running(task); | 336 | bool need_wakeup = !rpc_test_and_set_running(task); |
335 | 337 | ||
@@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task) | |||
338 | return; | 340 | return; |
339 | if (RPC_IS_ASYNC(task)) { | 341 | if (RPC_IS_ASYNC(task)) { |
340 | INIT_WORK(&task->u.tk_work, rpc_async_schedule); | 342 | INIT_WORK(&task->u.tk_work, rpc_async_schedule); |
341 | queue_work(rpciod_workqueue, &task->u.tk_work); | 343 | queue_work(wq, &task->u.tk_work); |
342 | } else | 344 | } else |
343 | wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); | 345 | wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); |
344 | } | 346 | } |
@@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, | |||
407 | EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); | 409 | EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); |
408 | 410 | ||
409 | /** | 411 | /** |
410 | * __rpc_do_wake_up_task - wake up a single rpc_task | 412 | * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task |
413 | * @wq: workqueue on which to run task | ||
411 | * @queue: wait queue | 414 | * @queue: wait queue |
412 | * @task: task to be woken up | 415 | * @task: task to be woken up |
413 | * | 416 | * |
414 | * Caller must hold queue->lock, and have cleared the task queued flag. | 417 | * Caller must hold queue->lock, and have cleared the task queued flag. |
415 | */ | 418 | */ |
416 | static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) | 419 | static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, |
420 | struct rpc_wait_queue *queue, | ||
421 | struct rpc_task *task) | ||
417 | { | 422 | { |
418 | dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", | 423 | dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", |
419 | task->tk_pid, jiffies); | 424 | task->tk_pid, jiffies); |
@@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task | |||
428 | 433 | ||
429 | __rpc_remove_wait_queue(queue, task); | 434 | __rpc_remove_wait_queue(queue, task); |
430 | 435 | ||
431 | rpc_make_runnable(task); | 436 | rpc_make_runnable(wq, task); |
432 | 437 | ||
433 | dprintk("RPC: __rpc_wake_up_task done\n"); | 438 | dprintk("RPC: __rpc_wake_up_task done\n"); |
434 | } | 439 | } |
@@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task | |||
436 | /* | 441 | /* |
437 | * Wake up a queued task while the queue lock is being held | 442 | * Wake up a queued task while the queue lock is being held |
438 | */ | 443 | */ |
439 | static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) | 444 | static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, |
445 | struct rpc_wait_queue *queue, struct rpc_task *task) | ||
440 | { | 446 | { |
441 | if (RPC_IS_QUEUED(task)) { | 447 | if (RPC_IS_QUEUED(task)) { |
442 | smp_rmb(); | 448 | smp_rmb(); |
443 | if (task->tk_waitqueue == queue) | 449 | if (task->tk_waitqueue == queue) |
444 | __rpc_do_wake_up_task(queue, task); | 450 | __rpc_do_wake_up_task_on_wq(wq, queue, task); |
445 | } | 451 | } |
446 | } | 452 | } |
447 | 453 | ||
448 | /* | 454 | /* |
455 | * Wake up a queued task while the queue lock is being held | ||
456 | */ | ||
457 | static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) | ||
458 | { | ||
459 | rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); | ||
460 | } | ||
461 | |||
462 | /* | ||
449 | * Wake up a task on a specific queue | 463 | * Wake up a task on a specific queue |
450 | */ | 464 | */ |
451 | void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) | 465 | void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) |
@@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue) | |||
518 | /* | 532 | /* |
519 | * Wake up the first task on the wait queue. | 533 | * Wake up the first task on the wait queue. |
520 | */ | 534 | */ |
521 | struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | 535 | struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, |
536 | struct rpc_wait_queue *queue, | ||
522 | bool (*func)(struct rpc_task *, void *), void *data) | 537 | bool (*func)(struct rpc_task *, void *), void *data) |
523 | { | 538 | { |
524 | struct rpc_task *task = NULL; | 539 | struct rpc_task *task = NULL; |
@@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | |||
529 | task = __rpc_find_next_queued(queue); | 544 | task = __rpc_find_next_queued(queue); |
530 | if (task != NULL) { | 545 | if (task != NULL) { |
531 | if (func(task, data)) | 546 | if (func(task, data)) |
532 | rpc_wake_up_task_queue_locked(queue, task); | 547 | rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); |
533 | else | 548 | else |
534 | task = NULL; | 549 | task = NULL; |
535 | } | 550 | } |
@@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | |||
537 | 552 | ||
538 | return task; | 553 | return task; |
539 | } | 554 | } |
555 | |||
556 | /* | ||
557 | * Wake up the first task on the wait queue. | ||
558 | */ | ||
559 | struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, | ||
560 | bool (*func)(struct rpc_task *, void *), void *data) | ||
561 | { | ||
562 | return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); | ||
563 | } | ||
540 | EXPORT_SYMBOL_GPL(rpc_wake_up_first); | 564 | EXPORT_SYMBOL_GPL(rpc_wake_up_first); |
541 | 565 | ||
542 | static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) | 566 | static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) |
@@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task) | |||
814 | bool is_async = RPC_IS_ASYNC(task); | 838 | bool is_async = RPC_IS_ASYNC(task); |
815 | 839 | ||
816 | rpc_set_active(task); | 840 | rpc_set_active(task); |
817 | rpc_make_runnable(task); | 841 | rpc_make_runnable(rpciod_workqueue, task); |
818 | if (!is_async) | 842 | if (!is_async) |
819 | __rpc_execute(task); | 843 | __rpc_execute(task); |
820 | } | 844 | } |
@@ -1071,10 +1095,22 @@ static int rpciod_start(void) | |||
1071 | * Create the rpciod thread and wait for it to start. | 1095 | * Create the rpciod thread and wait for it to start. |
1072 | */ | 1096 | */ |
1073 | dprintk("RPC: creating workqueue rpciod\n"); | 1097 | dprintk("RPC: creating workqueue rpciod\n"); |
1074 | /* Note: highpri because network receive is latency sensitive */ | 1098 | wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); |
1075 | wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); | 1099 | if (!wq) |
1100 | goto out_failed; | ||
1076 | rpciod_workqueue = wq; | 1101 | rpciod_workqueue = wq; |
1077 | return rpciod_workqueue != NULL; | 1102 | /* Note: highpri because network receive is latency sensitive */ |
1103 | wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); | ||
1104 | if (!wq) | ||
1105 | goto free_rpciod; | ||
1106 | xprtiod_workqueue = wq; | ||
1107 | return 1; | ||
1108 | free_rpciod: | ||
1109 | wq = rpciod_workqueue; | ||
1110 | rpciod_workqueue = NULL; | ||
1111 | destroy_workqueue(wq); | ||
1112 | out_failed: | ||
1113 | return 0; | ||
1078 | } | 1114 | } |
1079 | 1115 | ||
1080 | static void rpciod_stop(void) | 1116 | static void rpciod_stop(void) |
@@ -1088,6 +1124,9 @@ static void rpciod_stop(void) | |||
1088 | wq = rpciod_workqueue; | 1124 | wq = rpciod_workqueue; |
1089 | rpciod_workqueue = NULL; | 1125 | rpciod_workqueue = NULL; |
1090 | destroy_workqueue(wq); | 1126 | destroy_workqueue(wq); |
1127 | wq = xprtiod_workqueue; | ||
1128 | xprtiod_workqueue = NULL; | ||
1129 | destroy_workqueue(wq); | ||
1091 | } | 1130 | } |
1092 | 1131 | ||
1093 | void | 1132 | void |
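rpciod is split in two here: a new xprtiod workqueue (high priority, since receive processing is latency sensitive) takes transport-level work, and rpc_wake_up_first_on_wq() lets callers choose which workqueue a woken async task runs on. A minimal usage sketch; the predicate and wrapper are illustrative, while xprtiod_workqueue and rpc_wake_up_first_on_wq() are the interfaces added above and xprt->sending is an existing rpc_xprt wait queue:

#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/xprt.h>

/* Illustrative predicate: accept whichever task is queued first. */
static bool example_any_task(struct rpc_task *task, void *data)
{
        return true;
}

/* Wake the next queued sender directly onto the transport workqueue,
 * keeping it off the ordinary rpciod queue. */
static void example_wake_next_sender(struct rpc_xprt *xprt)
{
        rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
                                example_any_task, xprt);
}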
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index cc9852897395..c5b0cb4f4056 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c | |||
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) | |||
1188 | *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); | 1188 | *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); |
1189 | 1189 | ||
1190 | /* Encode reply */ | 1190 | /* Encode reply */ |
1191 | if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { | 1191 | if (*statp == rpc_drop_reply || |
1192 | test_bit(RQ_DROPME, &rqstp->rq_flags)) { | ||
1192 | if (procp->pc_release) | 1193 | if (procp->pc_release) |
1193 | procp->pc_release(rqstp, NULL, rqstp->rq_resp); | 1194 | procp->pc_release(rqstp, NULL, rqstp->rq_resp); |
1194 | goto dropit; | 1195 | goto dropit; |
1195 | } | 1196 | } |
1197 | if (*statp == rpc_autherr_badcred) { | ||
1198 | if (procp->pc_release) | ||
1199 | procp->pc_release(rqstp, NULL, rqstp->rq_resp); | ||
1200 | goto err_bad_auth; | ||
1201 | } | ||
1196 | if (*statp == rpc_success && | 1202 | if (*statp == rpc_success && |
1197 | (xdr = procp->pc_encode) && | 1203 | (xdr = procp->pc_encode) && |
1198 | !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { | 1204 | !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { |
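svc_process_common() now also drops the reply when the procedure itself returns rpc_drop_reply (calling pc_release first), and maps an rpc_autherr_badcred return to an AUTH error reply. A hedged sketch of a service procedure relying on the first behaviour; the condition is purely illustrative:

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/msg_prot.h>

/* Illustrative pc_func: dropping the reply no longer requires setting
 * RQ_DROPME on the request. */
static __be32 example_proc(struct svc_rqst *rqstp, void *argp, void *resp)
{
        if (!argp)                      /* illustrative condition only */
                return rpc_drop_reply;  /* response released, nothing sent */
        return rpc_success;
}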
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 216a1385718a..8313960cac52 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c | |||
@@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) | |||
220 | clear_bit(XPRT_LOCKED, &xprt->state); | 220 | clear_bit(XPRT_LOCKED, &xprt->state); |
221 | smp_mb__after_atomic(); | 221 | smp_mb__after_atomic(); |
222 | } else | 222 | } else |
223 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 223 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
224 | } | 224 | } |
225 | 225 | ||
226 | /* | 226 | /* |
@@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) | |||
295 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) | 295 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) |
296 | return; | 296 | return; |
297 | 297 | ||
298 | if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) | 298 | if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, |
299 | __xprt_lock_write_func, xprt)) | ||
299 | return; | 300 | return; |
300 | xprt_clear_locked(xprt); | 301 | xprt_clear_locked(xprt); |
301 | } | 302 | } |
@@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) | |||
324 | return; | 325 | return; |
325 | if (RPCXPRT_CONGESTED(xprt)) | 326 | if (RPCXPRT_CONGESTED(xprt)) |
326 | goto out_unlock; | 327 | goto out_unlock; |
327 | if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) | 328 | if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, |
329 | __xprt_lock_write_cong_func, xprt)) | ||
328 | return; | 330 | return; |
329 | out_unlock: | 331 | out_unlock: |
330 | xprt_clear_locked(xprt); | 332 | xprt_clear_locked(xprt); |
@@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) | |||
645 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); | 647 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); |
646 | /* Try to schedule an autoclose RPC call */ | 648 | /* Try to schedule an autoclose RPC call */ |
647 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) | 649 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) |
648 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 650 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
649 | xprt_wake_pending_tasks(xprt, -EAGAIN); | 651 | xprt_wake_pending_tasks(xprt, -EAGAIN); |
650 | spin_unlock_bh(&xprt->transport_lock); | 652 | spin_unlock_bh(&xprt->transport_lock); |
651 | } | 653 | } |
@@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) | |||
672 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); | 674 | set_bit(XPRT_CLOSE_WAIT, &xprt->state); |
673 | /* Try to schedule an autoclose RPC call */ | 675 | /* Try to schedule an autoclose RPC call */ |
674 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) | 676 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) |
675 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 677 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
676 | xprt_wake_pending_tasks(xprt, -EAGAIN); | 678 | xprt_wake_pending_tasks(xprt, -EAGAIN); |
677 | out: | 679 | out: |
678 | spin_unlock_bh(&xprt->transport_lock); | 680 | spin_unlock_bh(&xprt->transport_lock); |
@@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data) | |||
689 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) | 691 | if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) |
690 | goto out_abort; | 692 | goto out_abort; |
691 | spin_unlock(&xprt->transport_lock); | 693 | spin_unlock(&xprt->transport_lock); |
692 | queue_work(rpciod_workqueue, &xprt->task_cleanup); | 694 | queue_work(xprtiod_workqueue, &xprt->task_cleanup); |
693 | return; | 695 | return; |
694 | out_abort: | 696 | out_abort: |
695 | spin_unlock(&xprt->transport_lock); | 697 | spin_unlock(&xprt->transport_lock); |
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index e7fd76975d86..66c9d63f4797 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c | |||
@@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi, | |||
271 | xprt_switch_find_xprt_t find_next) | 271 | xprt_switch_find_xprt_t find_next) |
272 | { | 272 | { |
273 | struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); | 273 | struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); |
274 | struct list_head *head; | ||
275 | 274 | ||
276 | if (xps == NULL) | 275 | if (xps == NULL) |
277 | return NULL; | 276 | return NULL; |
278 | head = &xps->xps_xprt_list; | 277 | return xprt_switch_set_next_cursor(&xps->xps_xprt_list, |
279 | if (xps->xps_nxprts < 2) | 278 | &xpi->xpi_cursor, |
280 | return xprt_switch_find_first_entry(head); | 279 | find_next); |
281 | return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next); | ||
282 | } | 280 | } |
283 | 281 | ||
284 | static | 282 | static |
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index dc9f3b513a05..ef19fa42c50f 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile | |||
@@ -1,7 +1,7 @@ | |||
1 | obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o | 1 | obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o |
2 | 2 | ||
3 | rpcrdma-y := transport.o rpc_rdma.o verbs.o \ | 3 | rpcrdma-y := transport.o rpc_rdma.o verbs.o \ |
4 | fmr_ops.o frwr_ops.o physical_ops.o \ | 4 | fmr_ops.o frwr_ops.o \ |
5 | svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ | 5 | svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ |
6 | svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ | 6 | svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ |
7 | module.o | 7 | module.o |
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 6326ebe8b595..21cb3b150b37 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c | |||
@@ -19,13 +19,6 @@ | |||
19 | * verb (fmr_op_unmap). | 19 | * verb (fmr_op_unmap). |
20 | */ | 20 | */ |
21 | 21 | ||
22 | /* Transport recovery | ||
23 | * | ||
24 | * After a transport reconnect, fmr_op_map re-uses the MR already | ||
25 | * allocated for the RPC, but generates a fresh rkey then maps the | ||
26 | * MR again. This process is synchronous. | ||
27 | */ | ||
28 | |||
29 | #include "xprt_rdma.h" | 22 | #include "xprt_rdma.h" |
30 | 23 | ||
31 | #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) | 24 | #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) |
@@ -35,62 +28,132 @@ | |||
35 | /* Maximum scatter/gather per FMR */ | 28 | /* Maximum scatter/gather per FMR */ |
36 | #define RPCRDMA_MAX_FMR_SGES (64) | 29 | #define RPCRDMA_MAX_FMR_SGES (64) |
37 | 30 | ||
38 | static struct workqueue_struct *fmr_recovery_wq; | 31 | /* Access mode of externally registered pages */ |
39 | 32 | enum { | |
40 | #define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) | 33 | RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | |
34 | IB_ACCESS_REMOTE_READ, | ||
35 | }; | ||
41 | 36 | ||
42 | int | 37 | bool |
43 | fmr_alloc_recovery_wq(void) | 38 | fmr_is_supported(struct rpcrdma_ia *ia) |
44 | { | 39 | { |
45 | fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); | 40 | if (!ia->ri_device->alloc_fmr) { |
46 | return !fmr_recovery_wq ? -ENOMEM : 0; | 41 | pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", |
42 | ia->ri_device->name); | ||
43 | return false; | ||
44 | } | ||
45 | return true; | ||
47 | } | 46 | } |
48 | 47 | ||
49 | void | 48 | static int |
50 | fmr_destroy_recovery_wq(void) | 49 | fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) |
51 | { | 50 | { |
52 | struct workqueue_struct *wq; | 51 | static struct ib_fmr_attr fmr_attr = { |
52 | .max_pages = RPCRDMA_MAX_FMR_SGES, | ||
53 | .max_maps = 1, | ||
54 | .page_shift = PAGE_SHIFT | ||
55 | }; | ||
53 | 56 | ||
54 | if (!fmr_recovery_wq) | 57 | mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, |
55 | return; | 58 | sizeof(u64), GFP_KERNEL); |
59 | if (!mw->fmr.fm_physaddrs) | ||
60 | goto out_free; | ||
56 | 61 | ||
57 | wq = fmr_recovery_wq; | 62 | mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, |
58 | fmr_recovery_wq = NULL; | 63 | sizeof(*mw->mw_sg), GFP_KERNEL); |
59 | destroy_workqueue(wq); | 64 | if (!mw->mw_sg) |
65 | goto out_free; | ||
66 | |||
67 | sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); | ||
68 | |||
69 | mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, | ||
70 | &fmr_attr); | ||
71 | if (IS_ERR(mw->fmr.fm_mr)) | ||
72 | goto out_fmr_err; | ||
73 | |||
74 | return 0; | ||
75 | |||
76 | out_fmr_err: | ||
77 | dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, | ||
78 | PTR_ERR(mw->fmr.fm_mr)); | ||
79 | |||
80 | out_free: | ||
81 | kfree(mw->mw_sg); | ||
82 | kfree(mw->fmr.fm_physaddrs); | ||
83 | return -ENOMEM; | ||
60 | } | 84 | } |
61 | 85 | ||
62 | static int | 86 | static int |
63 | __fmr_unmap(struct rpcrdma_mw *mw) | 87 | __fmr_unmap(struct rpcrdma_mw *mw) |
64 | { | 88 | { |
65 | LIST_HEAD(l); | 89 | LIST_HEAD(l); |
90 | int rc; | ||
66 | 91 | ||
67 | list_add(&mw->fmr.fmr->list, &l); | 92 | list_add(&mw->fmr.fm_mr->list, &l); |
68 | return ib_unmap_fmr(&l); | 93 | rc = ib_unmap_fmr(&l); |
94 | list_del_init(&mw->fmr.fm_mr->list); | ||
95 | return rc; | ||
69 | } | 96 | } |
70 | 97 | ||
71 | /* Deferred reset of a single FMR. Generate a fresh rkey by | ||
72 | * replacing the MR. There's no recovery if this fails. | ||
73 | */ | ||
74 | static void | 98 | static void |
75 | __fmr_recovery_worker(struct work_struct *work) | 99 | fmr_op_release_mr(struct rpcrdma_mw *r) |
76 | { | 100 | { |
77 | struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, | 101 | LIST_HEAD(unmap_list); |
78 | mw_work); | 102 | int rc; |
79 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; | ||
80 | 103 | ||
81 | __fmr_unmap(mw); | 104 | /* Ensure MW is not on any rl_registered list */ |
82 | rpcrdma_put_mw(r_xprt, mw); | 105 | if (!list_empty(&r->mw_list)) |
83 | return; | 106 | list_del(&r->mw_list); |
107 | |||
108 | kfree(r->fmr.fm_physaddrs); | ||
109 | kfree(r->mw_sg); | ||
110 | |||
111 | /* In case this one was left mapped, try to unmap it | ||
112 | * to prevent dealloc_fmr from failing with EBUSY | ||
113 | */ | ||
114 | rc = __fmr_unmap(r); | ||
115 | if (rc) | ||
116 | pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", | ||
117 | r, rc); | ||
118 | |||
119 | rc = ib_dealloc_fmr(r->fmr.fm_mr); | ||
120 | if (rc) | ||
121 | pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", | ||
122 | r, rc); | ||
123 | |||
124 | kfree(r); | ||
84 | } | 125 | } |
85 | 126 | ||
86 | /* A broken MR was discovered in a context that can't sleep. | 127 | /* Reset of a single FMR. |
87 | * Defer recovery to the recovery worker. | ||
88 | */ | 128 | */ |
89 | static void | 129 | static void |
90 | __fmr_queue_recovery(struct rpcrdma_mw *mw) | 130 | fmr_op_recover_mr(struct rpcrdma_mw *mw) |
91 | { | 131 | { |
92 | INIT_WORK(&mw->mw_work, __fmr_recovery_worker); | 132 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; |
93 | queue_work(fmr_recovery_wq, &mw->mw_work); | 133 | int rc; |
134 | |||
135 | /* ORDER: invalidate first */ | ||
136 | rc = __fmr_unmap(mw); | ||
137 | |||
138 | /* ORDER: then DMA unmap */ | ||
139 | ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, | ||
140 | mw->mw_sg, mw->mw_nents, mw->mw_dir); | ||
141 | if (rc) | ||
142 | goto out_release; | ||
143 | |||
144 | rpcrdma_put_mw(r_xprt, mw); | ||
145 | r_xprt->rx_stats.mrs_recovered++; | ||
146 | return; | ||
147 | |||
148 | out_release: | ||
149 | pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); | ||
150 | r_xprt->rx_stats.mrs_orphaned++; | ||
151 | |||
152 | spin_lock(&r_xprt->rx_buf.rb_mwlock); | ||
153 | list_del(&mw->mw_all); | ||
154 | spin_unlock(&r_xprt->rx_buf.rb_mwlock); | ||
155 | |||
156 | fmr_op_release_mr(mw); | ||
94 | } | 157 | } |
95 | 158 | ||
96 | static int | 159 | static int |
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) | |||
112 | RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); | 175 | RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); |
113 | } | 176 | } |
114 | 177 | ||
115 | static int | ||
116 | fmr_op_init(struct rpcrdma_xprt *r_xprt) | ||
117 | { | ||
118 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
119 | int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; | ||
120 | struct ib_fmr_attr fmr_attr = { | ||
121 | .max_pages = RPCRDMA_MAX_FMR_SGES, | ||
122 | .max_maps = 1, | ||
123 | .page_shift = PAGE_SHIFT | ||
124 | }; | ||
125 | struct ib_pd *pd = r_xprt->rx_ia.ri_pd; | ||
126 | struct rpcrdma_mw *r; | ||
127 | int i, rc; | ||
128 | |||
129 | spin_lock_init(&buf->rb_mwlock); | ||
130 | INIT_LIST_HEAD(&buf->rb_mws); | ||
131 | INIT_LIST_HEAD(&buf->rb_all); | ||
132 | |||
133 | i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); | ||
134 | i += 2; /* head + tail */ | ||
135 | i *= buf->rb_max_requests; /* one set for each RPC slot */ | ||
136 | dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); | ||
137 | |||
138 | rc = -ENOMEM; | ||
139 | while (i--) { | ||
140 | r = kzalloc(sizeof(*r), GFP_KERNEL); | ||
141 | if (!r) | ||
142 | goto out; | ||
143 | |||
144 | r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * | ||
145 | sizeof(u64), GFP_KERNEL); | ||
146 | if (!r->fmr.physaddrs) | ||
147 | goto out_free; | ||
148 | |||
149 | r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); | ||
150 | if (IS_ERR(r->fmr.fmr)) | ||
151 | goto out_fmr_err; | ||
152 | |||
153 | r->mw_xprt = r_xprt; | ||
154 | list_add(&r->mw_list, &buf->rb_mws); | ||
155 | list_add(&r->mw_all, &buf->rb_all); | ||
156 | } | ||
157 | return 0; | ||
158 | |||
159 | out_fmr_err: | ||
160 | rc = PTR_ERR(r->fmr.fmr); | ||
161 | dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); | ||
162 | kfree(r->fmr.physaddrs); | ||
163 | out_free: | ||
164 | kfree(r); | ||
165 | out: | ||
166 | return rc; | ||
167 | } | ||
168 | |||
169 | /* Use the ib_map_phys_fmr() verb to register a memory region | 178 | /* Use the ib_map_phys_fmr() verb to register a memory region |
170 | * for remote access via RDMA READ or RDMA WRITE. | 179 | * for remote access via RDMA READ or RDMA WRITE. |
171 | */ | 180 | */ |
172 | static int | 181 | static int |
173 | fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | 182 | fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, |
174 | int nsegs, bool writing) | 183 | int nsegs, bool writing, struct rpcrdma_mw **out) |
175 | { | 184 | { |
176 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
177 | struct ib_device *device = ia->ri_device; | ||
178 | enum dma_data_direction direction = rpcrdma_data_dir(writing); | ||
179 | struct rpcrdma_mr_seg *seg1 = seg; | 185 | struct rpcrdma_mr_seg *seg1 = seg; |
180 | int len, pageoff, i, rc; | 186 | int len, pageoff, i, rc; |
181 | struct rpcrdma_mw *mw; | 187 | struct rpcrdma_mw *mw; |
188 | u64 *dma_pages; | ||
182 | 189 | ||
183 | mw = seg1->rl_mw; | 190 | mw = rpcrdma_get_mw(r_xprt); |
184 | seg1->rl_mw = NULL; | 191 | if (!mw) |
185 | if (!mw) { | 192 | return -ENOBUFS; |
186 | mw = rpcrdma_get_mw(r_xprt); | ||
187 | if (!mw) | ||
188 | return -ENOMEM; | ||
189 | } else { | ||
190 | /* this is a retransmit; generate a fresh rkey */ | ||
191 | rc = __fmr_unmap(mw); | ||
192 | if (rc) | ||
193 | return rc; | ||
194 | } | ||
195 | 193 | ||
196 | pageoff = offset_in_page(seg1->mr_offset); | 194 | pageoff = offset_in_page(seg1->mr_offset); |
197 | seg1->mr_offset -= pageoff; /* start of page */ | 195 | seg1->mr_offset -= pageoff; /* start of page */ |
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
200 | if (nsegs > RPCRDMA_MAX_FMR_SGES) | 198 | if (nsegs > RPCRDMA_MAX_FMR_SGES) |
201 | nsegs = RPCRDMA_MAX_FMR_SGES; | 199 | nsegs = RPCRDMA_MAX_FMR_SGES; |
202 | for (i = 0; i < nsegs;) { | 200 | for (i = 0; i < nsegs;) { |
203 | rpcrdma_map_one(device, seg, direction); | 201 | if (seg->mr_page) |
204 | mw->fmr.physaddrs[i] = seg->mr_dma; | 202 | sg_set_page(&mw->mw_sg[i], |
203 | seg->mr_page, | ||
204 | seg->mr_len, | ||
205 | offset_in_page(seg->mr_offset)); | ||
206 | else | ||
207 | sg_set_buf(&mw->mw_sg[i], seg->mr_offset, | ||
208 | seg->mr_len); | ||
205 | len += seg->mr_len; | 209 | len += seg->mr_len; |
206 | ++seg; | 210 | ++seg; |
207 | ++i; | 211 | ++i; |
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
210 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) | 214 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) |
211 | break; | 215 | break; |
212 | } | 216 | } |
213 | 217 | mw->mw_nents = i; | |
214 | rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, | 218 | mw->mw_dir = rpcrdma_data_dir(writing); |
215 | i, seg1->mr_dma); | 219 | if (i == 0) |
220 | goto out_dmamap_err; | ||
221 | |||
222 | if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, | ||
223 | mw->mw_sg, mw->mw_nents, mw->mw_dir)) | ||
224 | goto out_dmamap_err; | ||
225 | |||
226 | for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) | ||
227 | dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); | ||
228 | rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, | ||
229 | dma_pages[0]); | ||
216 | if (rc) | 230 | if (rc) |
217 | goto out_maperr; | 231 | goto out_maperr; |
218 | 232 | ||
219 | seg1->rl_mw = mw; | 233 | mw->mw_handle = mw->fmr.fm_mr->rkey; |
220 | seg1->mr_rkey = mw->fmr.fmr->rkey; | 234 | mw->mw_length = len; |
221 | seg1->mr_base = seg1->mr_dma + pageoff; | 235 | mw->mw_offset = dma_pages[0] + pageoff; |
222 | seg1->mr_nsegs = i; | ||
223 | seg1->mr_len = len; | ||
224 | return i; | ||
225 | 236 | ||
226 | out_maperr: | 237 | *out = mw; |
227 | dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", | 238 | return mw->mw_nents; |
228 | __func__, len, (unsigned long long)seg1->mr_dma, | ||
229 | pageoff, i, rc); | ||
230 | while (i--) | ||
231 | rpcrdma_unmap_one(device, --seg); | ||
232 | return rc; | ||
233 | } | ||
234 | 239 | ||
235 | static void | 240 | out_dmamap_err: |
236 | __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) | 241 | pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", |
237 | { | 242 | mw->mw_sg, mw->mw_nents); |
238 | struct ib_device *device = r_xprt->rx_ia.ri_device; | 243 | rpcrdma_defer_mr_recovery(mw); |
239 | int nsegs = seg->mr_nsegs; | 244 | return -EIO; |
240 | 245 | ||
241 | while (nsegs--) | 246 | out_maperr: |
242 | rpcrdma_unmap_one(device, seg++); | 247 | pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", |
248 | len, (unsigned long long)dma_pages[0], | ||
249 | pageoff, mw->mw_nents, rc); | ||
250 | rpcrdma_defer_mr_recovery(mw); | ||
251 | return -EIO; | ||
243 | } | 252 | } |
244 | 253 | ||
245 | /* Invalidate all memory regions that were registered for "req". | 254 | /* Invalidate all memory regions that were registered for "req". |
246 | * | 255 | * |
247 | * Sleeps until it is safe for the host CPU to access the | 256 | * Sleeps until it is safe for the host CPU to access the |
248 | * previously mapped memory regions. | 257 | * previously mapped memory regions. |
258 | * | ||
259 | * Caller ensures that req->rl_registered is not empty. | ||
249 | */ | 260 | */ |
250 | static void | 261 | static void |
251 | fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | 262 | fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) |
252 | { | 263 | { |
253 | struct rpcrdma_mr_seg *seg; | 264 | struct rpcrdma_mw *mw, *tmp; |
254 | unsigned int i, nchunks; | ||
255 | struct rpcrdma_mw *mw; | ||
256 | LIST_HEAD(unmap_list); | 265 | LIST_HEAD(unmap_list); |
257 | int rc; | 266 | int rc; |
258 | 267 | ||
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | |||
261 | /* ORDER: Invalidate all of the req's MRs first | 270 | /* ORDER: Invalidate all of the req's MRs first |
262 | * | 271 | * |
263 | * ib_unmap_fmr() is slow, so use a single call instead | 272 | * ib_unmap_fmr() is slow, so use a single call instead |
264 | * of one call per mapped MR. | 273 | * of one call per mapped FMR. |
265 | */ | 274 | */ |
266 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 275 | list_for_each_entry(mw, &req->rl_registered, mw_list) |
267 | seg = &req->rl_segments[i]; | 276 | list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); |
268 | mw = seg->rl_mw; | ||
269 | |||
270 | list_add(&mw->fmr.fmr->list, &unmap_list); | ||
271 | |||
272 | i += seg->mr_nsegs; | ||
273 | } | ||
274 | rc = ib_unmap_fmr(&unmap_list); | 277 | rc = ib_unmap_fmr(&unmap_list); |
275 | if (rc) | 278 | if (rc) |
276 | pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); | 279 | goto out_reset; |
277 | 280 | ||
278 | /* ORDER: Now DMA unmap all of the req's MRs, and return | 281 | /* ORDER: Now DMA unmap all of the req's MRs, and return |
279 | * them to the free MW list. | 282 | * them to the free MW list. |
280 | */ | 283 | */ |
281 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 284 | list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { |
282 | seg = &req->rl_segments[i]; | 285 | list_del_init(&mw->mw_list); |
286 | list_del_init(&mw->fmr.fm_mr->list); | ||
287 | ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, | ||
288 | mw->mw_sg, mw->mw_nents, mw->mw_dir); | ||
289 | rpcrdma_put_mw(r_xprt, mw); | ||
290 | } | ||
283 | 291 | ||
284 | __fmr_dma_unmap(r_xprt, seg); | 292 | return; |
285 | rpcrdma_put_mw(r_xprt, seg->rl_mw); | ||
286 | 293 | ||
287 | i += seg->mr_nsegs; | 294 | out_reset: |
288 | seg->mr_nsegs = 0; | 295 | pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); |
289 | seg->rl_mw = NULL; | ||
290 | } | ||
291 | 296 | ||
292 | req->rl_nchunks = 0; | 297 | list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { |
298 | list_del_init(&mw->fmr.fm_mr->list); | ||
299 | fmr_op_recover_mr(mw); | ||
300 | } | ||
293 | } | 301 | } |
294 | 302 | ||
295 | /* Use a slow, safe mechanism to invalidate all memory regions | 303 | /* Use a slow, safe mechanism to invalidate all memory regions |
296 | * that were registered for "req". | 304 | * that were registered for "req". |
297 | * | ||
298 | * In the asynchronous case, DMA unmapping occurs first here | ||
299 | * because the rpcrdma_mr_seg is released immediately after this | ||
300 | * call. It's contents won't be available in __fmr_dma_unmap later. | ||
301 | * FIXME. | ||
302 | */ | 305 | */ |
303 | static void | 306 | static void |
304 | fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | 307 | fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, |
305 | bool sync) | 308 | bool sync) |
306 | { | 309 | { |
307 | struct rpcrdma_mr_seg *seg; | ||
308 | struct rpcrdma_mw *mw; | 310 | struct rpcrdma_mw *mw; |
309 | unsigned int i; | ||
310 | |||
311 | for (i = 0; req->rl_nchunks; req->rl_nchunks--) { | ||
312 | seg = &req->rl_segments[i]; | ||
313 | mw = seg->rl_mw; | ||
314 | |||
315 | if (sync) { | ||
316 | /* ORDER */ | ||
317 | __fmr_unmap(mw); | ||
318 | __fmr_dma_unmap(r_xprt, seg); | ||
319 | rpcrdma_put_mw(r_xprt, mw); | ||
320 | } else { | ||
321 | __fmr_dma_unmap(r_xprt, seg); | ||
322 | __fmr_queue_recovery(mw); | ||
323 | } | ||
324 | |||
325 | i += seg->mr_nsegs; | ||
326 | seg->mr_nsegs = 0; | ||
327 | seg->rl_mw = NULL; | ||
328 | } | ||
329 | } | ||
330 | |||
331 | static void | ||
332 | fmr_op_destroy(struct rpcrdma_buffer *buf) | ||
333 | { | ||
334 | struct rpcrdma_mw *r; | ||
335 | int rc; | ||
336 | |||
337 | while (!list_empty(&buf->rb_all)) { | ||
338 | r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); | ||
339 | list_del(&r->mw_all); | ||
340 | kfree(r->fmr.physaddrs); | ||
341 | 311 | ||
342 | rc = ib_dealloc_fmr(r->fmr.fmr); | 312 | while (!list_empty(&req->rl_registered)) { |
343 | if (rc) | 313 | mw = list_first_entry(&req->rl_registered, |
344 | dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", | 314 | struct rpcrdma_mw, mw_list); |
345 | __func__, rc); | 315 | list_del_init(&mw->mw_list); |
346 | 316 | ||
347 | kfree(r); | 317 | if (sync) |
318 | fmr_op_recover_mr(mw); | ||
319 | else | ||
320 | rpcrdma_defer_mr_recovery(mw); | ||
348 | } | 321 | } |
349 | } | 322 | } |
350 | 323 | ||
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { | |||
352 | .ro_map = fmr_op_map, | 325 | .ro_map = fmr_op_map, |
353 | .ro_unmap_sync = fmr_op_unmap_sync, | 326 | .ro_unmap_sync = fmr_op_unmap_sync, |
354 | .ro_unmap_safe = fmr_op_unmap_safe, | 327 | .ro_unmap_safe = fmr_op_unmap_safe, |
328 | .ro_recover_mr = fmr_op_recover_mr, | ||
355 | .ro_open = fmr_op_open, | 329 | .ro_open = fmr_op_open, |
356 | .ro_maxpages = fmr_op_maxpages, | 330 | .ro_maxpages = fmr_op_maxpages, |
357 | .ro_init = fmr_op_init, | 331 | .ro_init_mr = fmr_op_init_mr, |
358 | .ro_destroy = fmr_op_destroy, | 332 | .ro_release_mr = fmr_op_release_mr, |
359 | .ro_displayname = "fmr", | 333 | .ro_displayname = "fmr", |
360 | }; | 334 | }; |
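The FMR path drops its private recovery workqueue and the one-shot fmr_op_init()/fmr_op_destroy() pair: MRs are now created and torn down one at a time through ro_init_mr/ro_release_mr, mapped through a per-MW scatterlist, and broken MRs go through the common rpcrdma_defer_mr_recovery()/fmr_op_recover_mr() path. A hypothetical sketch of a one-at-a-time MW constructor built on the new method; the function is illustrative and assumes the existing ri_ops and mw_xprt fields:

#include <linux/slab.h>
#include "xprt_rdma.h"

/* Hypothetical per-MW constructor using the new ro_init_mr method. */
static struct rpcrdma_mw *
example_create_mw(struct rpcrdma_xprt *r_xprt)
{
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_mw *mw;

        mw = kzalloc(sizeof(*mw), GFP_KERNEL);
        if (!mw)
                return NULL;
        if (ia->ri_ops->ro_init_mr(ia, mw)) {
                kfree(mw);
                return NULL;
        }
        mw->mw_xprt = r_xprt;
        return mw;
}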
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c0947544babe..892b5e1d9b09 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c | |||
@@ -73,29 +73,71 @@ | |||
73 | # define RPCDBG_FACILITY RPCDBG_TRANS | 73 | # define RPCDBG_FACILITY RPCDBG_TRANS |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | static struct workqueue_struct *frwr_recovery_wq; | 76 | bool |
77 | 77 | frwr_is_supported(struct rpcrdma_ia *ia) | |
78 | #define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) | 78 | { |
79 | struct ib_device_attr *attrs = &ia->ri_device->attrs; | ||
80 | |||
81 | if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) | ||
82 | goto out_not_supported; | ||
83 | if (attrs->max_fast_reg_page_list_len == 0) | ||
84 | goto out_not_supported; | ||
85 | return true; | ||
86 | |||
87 | out_not_supported: | ||
88 | pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", | ||
89 | ia->ri_device->name); | ||
90 | return false; | ||
91 | } | ||
79 | 92 | ||
80 | int | 93 | static int |
81 | frwr_alloc_recovery_wq(void) | 94 | frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) |
82 | { | 95 | { |
83 | frwr_recovery_wq = alloc_workqueue("frwr_recovery", | 96 | unsigned int depth = ia->ri_max_frmr_depth; |
84 | FRWR_RECOVERY_WQ_FLAGS, 0); | 97 | struct rpcrdma_frmr *f = &r->frmr; |
85 | return !frwr_recovery_wq ? -ENOMEM : 0; | 98 | int rc; |
99 | |||
100 | f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); | ||
101 | if (IS_ERR(f->fr_mr)) | ||
102 | goto out_mr_err; | ||
103 | |||
104 | r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); | ||
105 | if (!r->mw_sg) | ||
106 | goto out_list_err; | ||
107 | |||
108 | sg_init_table(r->mw_sg, depth); | ||
109 | init_completion(&f->fr_linv_done); | ||
110 | return 0; | ||
111 | |||
112 | out_mr_err: | ||
113 | rc = PTR_ERR(f->fr_mr); | ||
114 | dprintk("RPC: %s: ib_alloc_mr status %i\n", | ||
115 | __func__, rc); | ||
116 | return rc; | ||
117 | |||
118 | out_list_err: | ||
119 | rc = -ENOMEM; | ||
120 | dprintk("RPC: %s: sg allocation failure\n", | ||
121 | __func__); | ||
122 | ib_dereg_mr(f->fr_mr); | ||
123 | return rc; | ||
86 | } | 124 | } |
87 | 125 | ||
88 | void | 126 | static void |
89 | frwr_destroy_recovery_wq(void) | 127 | frwr_op_release_mr(struct rpcrdma_mw *r) |
90 | { | 128 | { |
91 | struct workqueue_struct *wq; | 129 | int rc; |
92 | 130 | ||
93 | if (!frwr_recovery_wq) | 131 | /* Ensure MW is not on any rl_registered list */ |
94 | return; | 132 | if (!list_empty(&r->mw_list)) |
133 | list_del(&r->mw_list); | ||
95 | 134 | ||
96 | wq = frwr_recovery_wq; | 135 | rc = ib_dereg_mr(r->frmr.fr_mr); |
97 | frwr_recovery_wq = NULL; | 136 | if (rc) |
98 | destroy_workqueue(wq); | 137 | pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", |
138 | r, rc); | ||
139 | kfree(r->mw_sg); | ||
140 | kfree(r); | ||
99 | } | 141 | } |
100 | 142 | ||
101 | static int | 143 | static int |
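frwr_is_supported(), like fmr_is_supported() earlier, lets the transport probe device capabilities up front instead of discovering an unusable registration mode at first use. A hypothetical selection sketch that prefers FRWR and falls back to FMR; the function itself is illustrative, rpcrdma_fmr_memreg_ops appears above, and the FRWR ops table name plus the assumption that both _is_supported() helpers are declared in xprt_rdma.h are by analogy:

#include "xprt_rdma.h"

/* Hypothetical strategy selection: try FRWR first, then FMR. */
static int example_pick_memreg(struct rpcrdma_ia *ia)
{
        if (frwr_is_supported(ia)) {
                ia->ri_ops = &rpcrdma_frwr_memreg_ops;
                return 0;
        }
        if (fmr_is_supported(ia)) {
                ia->ri_ops = &rpcrdma_fmr_memreg_ops;
                return 0;
        }
        return -EINVAL;
}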
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) | |||
124 | return 0; | 166 | return 0; |
125 | } | 167 | } |
126 | 168 | ||
127 | static void | 169 | /* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. |
128 | __frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) | ||
129 | { | ||
130 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
131 | struct rpcrdma_frmr *f = &mw->frmr; | ||
132 | int rc; | ||
133 | |||
134 | rc = __frwr_reset_mr(ia, mw); | ||
135 | ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir); | ||
136 | if (rc) | ||
137 | return; | ||
138 | |||
139 | rpcrdma_put_mw(r_xprt, mw); | ||
140 | } | ||
141 | |||
142 | /* Deferred reset of a single FRMR. Generate a fresh rkey by | ||
143 | * replacing the MR. | ||
144 | * | 170 | * |
145 | * There's no recovery if this fails. The FRMR is abandoned, but | 171 | * There's no recovery if this fails. The FRMR is abandoned, but |
146 | * remains in rb_all. It will be cleaned up when the transport is | 172 | * remains in rb_all. It will be cleaned up when the transport is |
147 | * destroyed. | 173 | * destroyed. |
148 | */ | 174 | */ |
149 | static void | 175 | static void |
150 | __frwr_recovery_worker(struct work_struct *work) | 176 | frwr_op_recover_mr(struct rpcrdma_mw *mw) |
151 | { | ||
152 | struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, | ||
153 | mw_work); | ||
154 | |||
155 | __frwr_reset_and_unmap(r->mw_xprt, r); | ||
156 | return; | ||
157 | } | ||
158 | |||
159 | /* A broken MR was discovered in a context that can't sleep. | ||
160 | * Defer recovery to the recovery worker. | ||
161 | */ | ||
162 | static void | ||
163 | __frwr_queue_recovery(struct rpcrdma_mw *r) | ||
164 | { | ||
165 | INIT_WORK(&r->mw_work, __frwr_recovery_worker); | ||
166 | queue_work(frwr_recovery_wq, &r->mw_work); | ||
167 | } | ||
168 | |||
169 | static int | ||
170 | __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, | ||
171 | unsigned int depth) | ||
172 | { | 177 | { |
173 | struct rpcrdma_frmr *f = &r->frmr; | 178 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; |
179 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
174 | int rc; | 180 | int rc; |
175 | 181 | ||
176 | f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); | 182 | rc = __frwr_reset_mr(ia, mw); |
177 | if (IS_ERR(f->fr_mr)) | 183 | ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); |
178 | goto out_mr_err; | 184 | if (rc) |
179 | 185 | goto out_release; | |
180 | f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL); | ||
181 | if (!f->fr_sg) | ||
182 | goto out_list_err; | ||
183 | |||
184 | sg_init_table(f->fr_sg, depth); | ||
185 | |||
186 | init_completion(&f->fr_linv_done); | ||
187 | |||
188 | return 0; | ||
189 | 186 | ||
190 | out_mr_err: | 187 | rpcrdma_put_mw(r_xprt, mw); |
191 | rc = PTR_ERR(f->fr_mr); | 188 | r_xprt->rx_stats.mrs_recovered++; |
192 | dprintk("RPC: %s: ib_alloc_mr status %i\n", | 189 | return; |
193 | __func__, rc); | ||
194 | return rc; | ||
195 | 190 | ||
196 | out_list_err: | 191 | out_release: |
197 | rc = -ENOMEM; | 192 | pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); |
198 | dprintk("RPC: %s: sg allocation failure\n", | 193 | r_xprt->rx_stats.mrs_orphaned++; |
199 | __func__); | ||
200 | ib_dereg_mr(f->fr_mr); | ||
201 | return rc; | ||
202 | } | ||
203 | 194 | ||
204 | static void | 195 | spin_lock(&r_xprt->rx_buf.rb_mwlock); |
205 | __frwr_release(struct rpcrdma_mw *r) | 196 | list_del(&mw->mw_all); |
206 | { | 197 | spin_unlock(&r_xprt->rx_buf.rb_mwlock); |
207 | int rc; | ||
208 | 198 | ||
209 | rc = ib_dereg_mr(r->frmr.fr_mr); | 199 | frwr_op_release_mr(mw); |
210 | if (rc) | ||
211 | dprintk("RPC: %s: ib_dereg_mr status %i\n", | ||
212 | __func__, rc); | ||
213 | kfree(r->frmr.fr_sg); | ||
214 | } | 200 | } |
215 | 201 | ||
216 | static int | 202 | static int |
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) | |||
346 | complete_all(&frmr->fr_linv_done); | 332 | complete_all(&frmr->fr_linv_done); |
347 | } | 333 | } |
348 | 334 | ||
349 | static int | 335 | /* Post a REG_MR Work Request to register a memory region |
350 | frwr_op_init(struct rpcrdma_xprt *r_xprt) | ||
351 | { | ||
352 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
353 | struct ib_device *device = r_xprt->rx_ia.ri_device; | ||
354 | unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; | ||
355 | struct ib_pd *pd = r_xprt->rx_ia.ri_pd; | ||
356 | int i; | ||
357 | |||
358 | spin_lock_init(&buf->rb_mwlock); | ||
359 | INIT_LIST_HEAD(&buf->rb_mws); | ||
360 | INIT_LIST_HEAD(&buf->rb_all); | ||
361 | |||
362 | i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); | ||
363 | i += 2; /* head + tail */ | ||
364 | i *= buf->rb_max_requests; /* one set for each RPC slot */ | ||
365 | dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); | ||
366 | |||
367 | while (i--) { | ||
368 | struct rpcrdma_mw *r; | ||
369 | int rc; | ||
370 | |||
371 | r = kzalloc(sizeof(*r), GFP_KERNEL); | ||
372 | if (!r) | ||
373 | return -ENOMEM; | ||
374 | |||
375 | rc = __frwr_init(r, pd, device, depth); | ||
376 | if (rc) { | ||
377 | kfree(r); | ||
378 | return rc; | ||
379 | } | ||
380 | |||
381 | r->mw_xprt = r_xprt; | ||
382 | list_add(&r->mw_list, &buf->rb_mws); | ||
383 | list_add(&r->mw_all, &buf->rb_all); | ||
384 | } | ||
385 | |||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | /* Post a FAST_REG Work Request to register a memory region | ||
390 | * for remote access via RDMA READ or RDMA WRITE. | 336 | * for remote access via RDMA READ or RDMA WRITE. |
391 | */ | 337 | */ |
392 | static int | 338 | static int |
393 | frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | 339 | frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, |
394 | int nsegs, bool writing) | 340 | int nsegs, bool writing, struct rpcrdma_mw **out) |
395 | { | 341 | { |
396 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | 342 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
397 | struct ib_device *device = ia->ri_device; | ||
398 | enum dma_data_direction direction = rpcrdma_data_dir(writing); | ||
399 | struct rpcrdma_mr_seg *seg1 = seg; | ||
400 | struct rpcrdma_mw *mw; | 343 | struct rpcrdma_mw *mw; |
401 | struct rpcrdma_frmr *frmr; | 344 | struct rpcrdma_frmr *frmr; |
402 | struct ib_mr *mr; | 345 | struct ib_mr *mr; |
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
405 | int rc, i, n, dma_nents; | 348 | int rc, i, n, dma_nents; |
406 | u8 key; | 349 | u8 key; |
407 | 350 | ||
408 | mw = seg1->rl_mw; | 351 | mw = NULL; |
409 | seg1->rl_mw = NULL; | ||
410 | do { | 352 | do { |
411 | if (mw) | 353 | if (mw) |
412 | __frwr_queue_recovery(mw); | 354 | rpcrdma_defer_mr_recovery(mw); |
413 | mw = rpcrdma_get_mw(r_xprt); | 355 | mw = rpcrdma_get_mw(r_xprt); |
414 | if (!mw) | 356 | if (!mw) |
415 | return -ENOMEM; | 357 | return -ENOBUFS; |
416 | } while (mw->frmr.fr_state != FRMR_IS_INVALID); | 358 | } while (mw->frmr.fr_state != FRMR_IS_INVALID); |
417 | frmr = &mw->frmr; | 359 | frmr = &mw->frmr; |
418 | frmr->fr_state = FRMR_IS_VALID; | 360 | frmr->fr_state = FRMR_IS_VALID; |
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
421 | 363 | ||
422 | if (nsegs > ia->ri_max_frmr_depth) | 364 | if (nsegs > ia->ri_max_frmr_depth) |
423 | nsegs = ia->ri_max_frmr_depth; | 365 | nsegs = ia->ri_max_frmr_depth; |
424 | |||
425 | for (i = 0; i < nsegs;) { | 366 | for (i = 0; i < nsegs;) { |
426 | if (seg->mr_page) | 367 | if (seg->mr_page) |
427 | sg_set_page(&frmr->fr_sg[i], | 368 | sg_set_page(&mw->mw_sg[i], |
428 | seg->mr_page, | 369 | seg->mr_page, |
429 | seg->mr_len, | 370 | seg->mr_len, |
430 | offset_in_page(seg->mr_offset)); | 371 | offset_in_page(seg->mr_offset)); |
431 | else | 372 | else |
432 | sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, | 373 | sg_set_buf(&mw->mw_sg[i], seg->mr_offset, |
433 | seg->mr_len); | 374 | seg->mr_len); |
434 | 375 | ||
435 | ++seg; | 376 | ++seg; |
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
440 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) | 381 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) |
441 | break; | 382 | break; |
442 | } | 383 | } |
443 | frmr->fr_nents = i; | 384 | mw->mw_nents = i; |
444 | frmr->fr_dir = direction; | 385 | mw->mw_dir = rpcrdma_data_dir(writing); |
445 | 386 | if (i == 0) | |
446 | dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); | 387 | goto out_dmamap_err; |
447 | if (!dma_nents) { | ||
448 | pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", | ||
449 | __func__, frmr->fr_sg, frmr->fr_nents); | ||
450 | return -ENOMEM; | ||
451 | } | ||
452 | 388 | ||
453 | n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); | 389 | dma_nents = ib_dma_map_sg(ia->ri_device, |
454 | if (unlikely(n != frmr->fr_nents)) { | 390 | mw->mw_sg, mw->mw_nents, mw->mw_dir); |
455 | pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", | 391 | if (!dma_nents) |
456 | __func__, frmr->fr_mr, n, frmr->fr_nents); | 392 | goto out_dmamap_err; |
457 | rc = n < 0 ? n : -EINVAL; | 393 | |
458 | goto out_senderr; | 394 | n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); |
459 | } | 395 | if (unlikely(n != mw->mw_nents)) |
396 | goto out_mapmr_err; | ||
460 | 397 | ||
461 | dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", | 398 | dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", |
462 | __func__, mw, frmr->fr_nents, mr->length); | 399 | __func__, mw, mw->mw_nents, mr->length); |
463 | 400 | ||
464 | key = (u8)(mr->rkey & 0x000000FF); | 401 | key = (u8)(mr->rkey & 0x000000FF); |
465 | ib_update_fast_reg_key(mr, ++key); | 402 | ib_update_fast_reg_key(mr, ++key); |
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | |||
481 | if (rc) | 418 | if (rc) |
482 | goto out_senderr; | 419 | goto out_senderr; |
483 | 420 | ||
484 | seg1->rl_mw = mw; | 421 | mw->mw_handle = mr->rkey; |
485 | seg1->mr_rkey = mr->rkey; | 422 | mw->mw_length = mr->length; |
486 | seg1->mr_base = mr->iova; | 423 | mw->mw_offset = mr->iova; |
487 | seg1->mr_nsegs = frmr->fr_nents; | 424 | |
488 | seg1->mr_len = mr->length; | 425 | *out = mw; |
426 | return mw->mw_nents; | ||
489 | 427 | ||
490 | return frmr->fr_nents; | 428 | out_dmamap_err: |
429 | pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", | ||
430 | mw->mw_sg, mw->mw_nents); | ||
431 | rpcrdma_defer_mr_recovery(mw); | ||
432 | return -EIO; | ||
433 | |||
434 | out_mapmr_err: | ||
435 | pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", | ||
436 | frmr->fr_mr, n, mw->mw_nents); | ||
437 | rpcrdma_defer_mr_recovery(mw); | ||
438 | return -EIO; | ||
491 | 439 | ||
492 | out_senderr: | 440 | out_senderr: |
493 | dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); | 441 | pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); |
494 | __frwr_queue_recovery(mw); | 442 | rpcrdma_defer_mr_recovery(mw); |
495 | return rc; | 443 | return -ENOTCONN; |
496 | } | 444 | } |
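Editor's note: the registration path above bumps only the low-order byte of the MR's rkey before each FAST_REG, so an rkey previously advertised to the server stops matching once the MR is reused. A minimal userspace sketch of that arithmetic, assuming (as the code does) that the low 8 bits are the software-owned key portion:

	#include <stdint.h>
	#include <stdio.h>

	/* Sketch of the FRMR rkey refresh: cycle the low 8 bits so a stale
	 * rkey held by the peer no longer matches this MR.
	 */
	static uint32_t bump_rkey(uint32_t rkey)
	{
		uint8_t key = (uint8_t)(rkey & 0x000000FF);

		return (rkey & ~0x000000FFu) | (uint8_t)(key + 1);
	}

	int main(void)
	{
		uint32_t rkey = 0x1234ab07;

		printf("old rkey 0x%08x -> new rkey 0x%08x\n", rkey, bump_rkey(rkey));
		return 0;
	}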
497 | 445 | ||
498 | static struct ib_send_wr * | 446 | static struct ib_send_wr * |
499 | __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) | 447 | __frwr_prepare_linv_wr(struct rpcrdma_mw *mw) |
500 | { | 448 | { |
501 | struct rpcrdma_mw *mw = seg->rl_mw; | ||
502 | struct rpcrdma_frmr *f = &mw->frmr; | 449 | struct rpcrdma_frmr *f = &mw->frmr; |
503 | struct ib_send_wr *invalidate_wr; | 450 | struct ib_send_wr *invalidate_wr; |
504 | 451 | ||
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) | |||
518 | * | 465 | * |
519 | * Sleeps until it is safe for the host CPU to access the | 466 | * Sleeps until it is safe for the host CPU to access the |
520 | * previously mapped memory regions. | 467 | * previously mapped memory regions. |
468 | * | ||
469 | * Caller ensures that req->rl_registered is not empty. | ||
521 | */ | 470 | */ |
522 | static void | 471 | static void |
523 | frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | 472 | frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) |
524 | { | 473 | { |
525 | struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; | 474 | struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; |
526 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | 475 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
527 | struct rpcrdma_mr_seg *seg; | 476 | struct rpcrdma_mw *mw, *tmp; |
528 | unsigned int i, nchunks; | ||
529 | struct rpcrdma_frmr *f; | 477 | struct rpcrdma_frmr *f; |
530 | struct rpcrdma_mw *mw; | ||
531 | int rc; | 478 | int rc; |
532 | 479 | ||
533 | dprintk("RPC: %s: req %p\n", __func__, req); | 480 | dprintk("RPC: %s: req %p\n", __func__, req); |
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | |||
537 | * Chain the LOCAL_INV Work Requests and post them with | 484 | * Chain the LOCAL_INV Work Requests and post them with |
538 | * a single ib_post_send() call. | 485 | * a single ib_post_send() call. |
539 | */ | 486 | */ |
487 | f = NULL; | ||
540 | invalidate_wrs = pos = prev = NULL; | 488 | invalidate_wrs = pos = prev = NULL; |
541 | seg = NULL; | 489 | list_for_each_entry(mw, &req->rl_registered, mw_list) { |
542 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 490 | pos = __frwr_prepare_linv_wr(mw); |
543 | seg = &req->rl_segments[i]; | ||
544 | |||
545 | pos = __frwr_prepare_linv_wr(seg); | ||
546 | 491 | ||
547 | if (!invalidate_wrs) | 492 | if (!invalidate_wrs) |
548 | invalidate_wrs = pos; | 493 | invalidate_wrs = pos; |
549 | else | 494 | else |
550 | prev->next = pos; | 495 | prev->next = pos; |
551 | prev = pos; | 496 | prev = pos; |
552 | 497 | f = &mw->frmr; | |
553 | i += seg->mr_nsegs; | ||
554 | } | 498 | } |
555 | f = &seg->rl_mw->frmr; | ||
556 | 499 | ||
557 | /* Strong send queue ordering guarantees that when the | 500 | /* Strong send queue ordering guarantees that when the |
558 | * last WR in the chain completes, all WRs in the chain | 501 | * last WR in the chain completes, all WRs in the chain |
@@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | |||
577 | * them to the free MW list. | 520 | * them to the free MW list. |
578 | */ | 521 | */ |
579 | unmap: | 522 | unmap: |
580 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 523 | list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { |
581 | seg = &req->rl_segments[i]; | 524 | list_del_init(&mw->mw_list); |
582 | mw = seg->rl_mw; | 525 | ib_dma_unmap_sg(ia->ri_device, |
583 | seg->rl_mw = NULL; | 526 | mw->mw_sg, mw->mw_nents, mw->mw_dir); |
584 | |||
585 | ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, | ||
586 | f->fr_dir); | ||
587 | rpcrdma_put_mw(r_xprt, mw); | 527 | rpcrdma_put_mw(r_xprt, mw); |
588 | |||
589 | i += seg->mr_nsegs; | ||
590 | seg->mr_nsegs = 0; | ||
591 | } | 528 | } |
592 | |||
593 | req->rl_nchunks = 0; | ||
594 | return; | 529 | return; |
595 | 530 | ||
596 | reset_mrs: | 531 | reset_mrs: |
597 | pr_warn("%s: ib_post_send failed %i\n", __func__, rc); | 532 | pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); |
533 | rdma_disconnect(ia->ri_id); | ||
598 | 534 | ||
599 | /* Find and reset the MRs in the LOCAL_INV WRs that did not | 535 | /* Find and reset the MRs in the LOCAL_INV WRs that did not |
600 | * get posted. This is synchronous, and slow. | 536 | * get posted. This is synchronous, and slow. |
601 | */ | 537 | */ |
602 | for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { | 538 | list_for_each_entry(mw, &req->rl_registered, mw_list) { |
603 | seg = &req->rl_segments[i]; | ||
604 | mw = seg->rl_mw; | ||
605 | f = &mw->frmr; | 539 | f = &mw->frmr; |
606 | |||
607 | if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { | 540 | if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { |
608 | __frwr_reset_mr(ia, mw); | 541 | __frwr_reset_mr(ia, mw); |
609 | bad_wr = bad_wr->next; | 542 | bad_wr = bad_wr->next; |
610 | } | 543 | } |
611 | |||
612 | i += seg->mr_nsegs; | ||
613 | } | 544 | } |
614 | goto unmap; | 545 | goto unmap; |
615 | } | 546 | } |
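Editor's note: the invalidation path above builds one LOCAL_INV work request per registered MR, links them through ->next, and posts the whole chain with a single call, relying on send queue ordering so that completion of the last WR implies all earlier ones completed. A simplified, self-contained sketch of the chain-then-post-once pattern; the types below are stand-ins, not the verbs API:

	#include <stdio.h>
	#include <stdlib.h>

	struct demo_wr {
		int rkey;
		struct demo_wr *next;
	};

	/* Walk the chain once, as a single "post" would. */
	static int demo_post_send(struct demo_wr *chain)
	{
		while (chain) {
			struct demo_wr *next = chain->next;

			printf("invalidating rkey 0x%x\n", chain->rkey);
			free(chain);
			chain = next;
		}
		return 0;
	}

	int main(void)
	{
		struct demo_wr *chain = NULL, *prev = NULL;

		for (int rkey = 0x10; rkey < 0x13; rkey++) {
			struct demo_wr *pos = calloc(1, sizeof(*pos));

			pos->rkey = rkey;
			if (!chain)
				chain = pos;	/* first WR heads the chain */
			else
				prev->next = pos;
			prev = pos;
		}
		return demo_post_send(chain);
	}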
@@ -621,38 +552,17 @@ static void | |||
621 | frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | 552 | frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, |
622 | bool sync) | 553 | bool sync) |
623 | { | 554 | { |
624 | struct rpcrdma_mr_seg *seg; | ||
625 | struct rpcrdma_mw *mw; | 555 | struct rpcrdma_mw *mw; |
626 | unsigned int i; | ||
627 | 556 | ||
628 | for (i = 0; req->rl_nchunks; req->rl_nchunks--) { | 557 | while (!list_empty(&req->rl_registered)) { |
629 | seg = &req->rl_segments[i]; | 558 | mw = list_first_entry(&req->rl_registered, |
630 | mw = seg->rl_mw; | 559 | struct rpcrdma_mw, mw_list); |
560 | list_del_init(&mw->mw_list); | ||
631 | 561 | ||
632 | if (sync) | 562 | if (sync) |
633 | __frwr_reset_and_unmap(r_xprt, mw); | 563 | frwr_op_recover_mr(mw); |
634 | else | 564 | else |
635 | __frwr_queue_recovery(mw); | 565 | rpcrdma_defer_mr_recovery(mw); |
636 | |||
637 | i += seg->mr_nsegs; | ||
638 | seg->mr_nsegs = 0; | ||
639 | seg->rl_mw = NULL; | ||
640 | } | ||
641 | } | ||
642 | |||
643 | static void | ||
644 | frwr_op_destroy(struct rpcrdma_buffer *buf) | ||
645 | { | ||
646 | struct rpcrdma_mw *r; | ||
647 | |||
648 | /* Ensure stale MWs for "buf" are no longer in flight */ | ||
649 | flush_workqueue(frwr_recovery_wq); | ||
650 | |||
651 | while (!list_empty(&buf->rb_all)) { | ||
652 | r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); | ||
653 | list_del(&r->mw_all); | ||
654 | __frwr_release(r); | ||
655 | kfree(r); | ||
656 | } | 566 | } |
657 | } | 567 | } |
658 | 568 | ||
@@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { | |||
660 | .ro_map = frwr_op_map, | 570 | .ro_map = frwr_op_map, |
661 | .ro_unmap_sync = frwr_op_unmap_sync, | 571 | .ro_unmap_sync = frwr_op_unmap_sync, |
662 | .ro_unmap_safe = frwr_op_unmap_safe, | 572 | .ro_unmap_safe = frwr_op_unmap_safe, |
573 | .ro_recover_mr = frwr_op_recover_mr, | ||
663 | .ro_open = frwr_op_open, | 574 | .ro_open = frwr_op_open, |
664 | .ro_maxpages = frwr_op_maxpages, | 575 | .ro_maxpages = frwr_op_maxpages, |
665 | .ro_init = frwr_op_init, | 576 | .ro_init_mr = frwr_op_init_mr, |
666 | .ro_destroy = frwr_op_destroy, | 577 | .ro_release_mr = frwr_op_release_mr, |
667 | .ro_displayname = "frwr", | 578 | .ro_displayname = "frwr", |
668 | }; | 579 | }; |
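Editor's note: the frwr method table above is one instance of the transport's memory-registration strategy interface; core code only ever calls through ia->ri_ops, so the active mode is selected once when the transport instance is created. A stripped-down sketch of that dispatch pattern, using invented names rather than the kernel structures:

	#include <stdio.h>

	struct memreg_ops {
		int (*ro_map)(int nsegs);
		const char *ro_displayname;
	};

	static int demo_frwr_map(int nsegs)
	{
		printf("mapping %d segments with FRWR\n", nsegs);
		return nsegs;
	}

	static const struct memreg_ops demo_frwr_ops = {
		.ro_map		= demo_frwr_map,
		.ro_displayname	= "frwr",
	};

	int main(void)
	{
		const struct memreg_ops *ops = &demo_frwr_ops;	/* chosen at setup time */

		printf("strategy is '%s'\n", ops->ro_displayname);
		return ops->ro_map(4) == 4 ? 0 : 1;
	}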
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c deleted file mode 100644 index 3750596cc432..000000000000 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ /dev/null | |||
@@ -1,122 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2015 Oracle. All rights reserved. | ||
3 | * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. | ||
4 | */ | ||
5 | |||
6 | /* No-op chunk preparation. All client memory is pre-registered. | ||
7 | * Sometimes referred to as ALLPHYSICAL mode. | ||
8 | * | ||
9 | * Physical registration is simple because all client memory is | ||
10 | * pre-registered and never deregistered. This mode is good for | ||
11 | * adapter bring up, but is considered not safe: the server is | ||
12 | * trusted not to abuse its access to client memory not involved | ||
13 | * in RDMA I/O. | ||
14 | */ | ||
15 | |||
16 | #include "xprt_rdma.h" | ||
17 | |||
18 | #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) | ||
19 | # define RPCDBG_FACILITY RPCDBG_TRANS | ||
20 | #endif | ||
21 | |||
22 | static int | ||
23 | physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, | ||
24 | struct rpcrdma_create_data_internal *cdata) | ||
25 | { | ||
26 | struct ib_mr *mr; | ||
27 | |||
28 | /* Obtain an rkey to use for RPC data payloads. | ||
29 | */ | ||
30 | mr = ib_get_dma_mr(ia->ri_pd, | ||
31 | IB_ACCESS_LOCAL_WRITE | | ||
32 | IB_ACCESS_REMOTE_WRITE | | ||
33 | IB_ACCESS_REMOTE_READ); | ||
34 | if (IS_ERR(mr)) { | ||
35 | pr_err("%s: ib_get_dma_mr for failed with %lX\n", | ||
36 | __func__, PTR_ERR(mr)); | ||
37 | return -ENOMEM; | ||
38 | } | ||
39 | ia->ri_dma_mr = mr; | ||
40 | |||
41 | rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int, | ||
42 | RPCRDMA_MAX_DATA_SEGS, | ||
43 | RPCRDMA_MAX_HDR_SEGS)); | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | /* PHYSICAL memory registration conveys one page per chunk segment. | ||
48 | */ | ||
49 | static size_t | ||
50 | physical_op_maxpages(struct rpcrdma_xprt *r_xprt) | ||
51 | { | ||
52 | return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, | ||
53 | RPCRDMA_MAX_HDR_SEGS); | ||
54 | } | ||
55 | |||
56 | static int | ||
57 | physical_op_init(struct rpcrdma_xprt *r_xprt) | ||
58 | { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | /* The client's physical memory is already exposed for | ||
63 | * remote access via RDMA READ or RDMA WRITE. | ||
64 | */ | ||
65 | static int | ||
66 | physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, | ||
67 | int nsegs, bool writing) | ||
68 | { | ||
69 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
70 | |||
71 | rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); | ||
72 | seg->mr_rkey = ia->ri_dma_mr->rkey; | ||
73 | seg->mr_base = seg->mr_dma; | ||
74 | return 1; | ||
75 | } | ||
76 | |||
77 | /* DMA unmap all memory regions that were mapped for "req". | ||
78 | */ | ||
79 | static void | ||
80 | physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) | ||
81 | { | ||
82 | struct ib_device *device = r_xprt->rx_ia.ri_device; | ||
83 | unsigned int i; | ||
84 | |||
85 | for (i = 0; req->rl_nchunks; --req->rl_nchunks) | ||
86 | rpcrdma_unmap_one(device, &req->rl_segments[i++]); | ||
87 | } | ||
88 | |||
89 | /* Use a slow, safe mechanism to invalidate all memory regions | ||
90 | * that were registered for "req". | ||
91 | * | ||
92 | * For physical memory registration, there is no good way to | ||
93 | * fence a single MR that has been advertised to the server. The | ||
94 | * client has already handed the server an R_key that cannot be | ||
95 | * invalidated and is shared by all MRs on this connection. | ||
96 | * Tearing down the PD might be the only safe choice, but it's | ||
97 | * not clear that a freshly acquired DMA R_key would be different | ||
98 | * than the one used by the PD that was just destroyed. | ||
99 | * FIXME. | ||
100 | */ | ||
101 | static void | ||
102 | physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | ||
103 | bool sync) | ||
104 | { | ||
105 | physical_op_unmap_sync(r_xprt, req); | ||
106 | } | ||
107 | |||
108 | static void | ||
109 | physical_op_destroy(struct rpcrdma_buffer *buf) | ||
110 | { | ||
111 | } | ||
112 | |||
113 | const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { | ||
114 | .ro_map = physical_op_map, | ||
115 | .ro_unmap_sync = physical_op_unmap_sync, | ||
116 | .ro_unmap_safe = physical_op_unmap_safe, | ||
117 | .ro_open = physical_op_open, | ||
118 | .ro_maxpages = physical_op_maxpages, | ||
119 | .ro_init = physical_op_init, | ||
120 | .ro_destroy = physical_op_destroy, | ||
121 | .ro_displayname = "physical", | ||
122 | }; | ||
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 35a81096e83d..a47f170b20ef 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c | |||
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf) | |||
196 | * MR when they can. | 196 | * MR when they can. |
197 | */ | 197 | */ |
198 | static int | 198 | static int |
199 | rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, | 199 | rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) |
200 | int n, int nsegs) | ||
201 | { | 200 | { |
202 | size_t page_offset; | 201 | size_t page_offset; |
203 | u32 remaining; | 202 | u32 remaining; |
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, | |||
206 | base = vec->iov_base; | 205 | base = vec->iov_base; |
207 | page_offset = offset_in_page(base); | 206 | page_offset = offset_in_page(base); |
208 | remaining = vec->iov_len; | 207 | remaining = vec->iov_len; |
209 | while (remaining && n < nsegs) { | 208 | while (remaining && n < RPCRDMA_MAX_SEGS) { |
210 | seg[n].mr_page = NULL; | 209 | seg[n].mr_page = NULL; |
211 | seg[n].mr_offset = base; | 210 | seg[n].mr_offset = base; |
212 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); | 211 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); |
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, | |||
230 | 229 | ||
231 | static int | 230 | static int |
232 | rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | 231 | rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, |
233 | enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) | 232 | enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) |
234 | { | 233 | { |
235 | int len, n = 0, p; | 234 | int len, n, p, page_base; |
236 | int page_base; | ||
237 | struct page **ppages; | 235 | struct page **ppages; |
238 | 236 | ||
237 | n = 0; | ||
239 | if (pos == 0) { | 238 | if (pos == 0) { |
240 | n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); | 239 | n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); |
241 | if (n == nsegs) | 240 | if (n == RPCRDMA_MAX_SEGS) |
242 | return -EIO; | 241 | goto out_overflow; |
243 | } | 242 | } |
244 | 243 | ||
245 | len = xdrbuf->page_len; | 244 | len = xdrbuf->page_len; |
246 | ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); | 245 | ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); |
247 | page_base = xdrbuf->page_base & ~PAGE_MASK; | 246 | page_base = xdrbuf->page_base & ~PAGE_MASK; |
248 | p = 0; | 247 | p = 0; |
249 | while (len && n < nsegs) { | 248 | while (len && n < RPCRDMA_MAX_SEGS) { |
250 | if (!ppages[p]) { | 249 | if (!ppages[p]) { |
251 | /* alloc the pagelist for receiving buffer */ | 250 | /* alloc the pagelist for receiving buffer */ |
252 | ppages[p] = alloc_page(GFP_ATOMIC); | 251 | ppages[p] = alloc_page(GFP_ATOMIC); |
253 | if (!ppages[p]) | 252 | if (!ppages[p]) |
254 | return -ENOMEM; | 253 | return -EAGAIN; |
255 | } | 254 | } |
256 | seg[n].mr_page = ppages[p]; | 255 | seg[n].mr_page = ppages[p]; |
257 | seg[n].mr_offset = (void *)(unsigned long) page_base; | 256 | seg[n].mr_offset = (void *)(unsigned long) page_base; |
258 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); | 257 | seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); |
259 | if (seg[n].mr_len > PAGE_SIZE) | 258 | if (seg[n].mr_len > PAGE_SIZE) |
260 | return -EIO; | 259 | goto out_overflow; |
261 | len -= seg[n].mr_len; | 260 | len -= seg[n].mr_len; |
262 | ++n; | 261 | ++n; |
263 | ++p; | 262 | ++p; |
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
265 | } | 264 | } |
266 | 265 | ||
267 | /* Message overflows the seg array */ | 266 | /* Message overflows the seg array */ |
268 | if (len && n == nsegs) | 267 | if (len && n == RPCRDMA_MAX_SEGS) |
269 | return -EIO; | 268 | goto out_overflow; |
270 | 269 | ||
271 | /* When encoding the read list, the tail is always sent inline */ | 270 | /* When encoding the read list, the tail is always sent inline */ |
272 | if (type == rpcrdma_readch) | 271 | if (type == rpcrdma_readch) |
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, | |||
277 | * xdr pad bytes, saving the server an RDMA operation. */ | 276 | * xdr pad bytes, saving the server an RDMA operation. */ |
278 | if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) | 277 | if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) |
279 | return n; | 278 | return n; |
280 | n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); | 279 | n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); |
281 | if (n == nsegs) | 280 | if (n == RPCRDMA_MAX_SEGS) |
282 | return -EIO; | 281 | goto out_overflow; |
283 | } | 282 | } |
284 | 283 | ||
285 | return n; | 284 | return n; |
285 | |||
286 | out_overflow: | ||
287 | pr_err("rpcrdma: segment array overflow\n"); | ||
288 | return -EIO; | ||
286 | } | 289 | } |
287 | 290 | ||
288 | static inline __be32 * | 291 | static inline __be32 * |
289 | xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) | 292 | xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) |
290 | { | 293 | { |
291 | *iptr++ = cpu_to_be32(seg->mr_rkey); | 294 | *iptr++ = cpu_to_be32(mw->mw_handle); |
292 | *iptr++ = cpu_to_be32(seg->mr_len); | 295 | *iptr++ = cpu_to_be32(mw->mw_length); |
293 | return xdr_encode_hyper(iptr, seg->mr_base); | 296 | return xdr_encode_hyper(iptr, mw->mw_offset); |
294 | } | 297 | } |
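Editor's note: xdr_encode_rdma_segment() now takes its handle, length and offset from the rpcrdma_mw rather than the segment, but the wire format is unchanged: a 32-bit handle, a 32-bit length, then a 64-bit offset, all big-endian, per the RFC 5666 segment layout. A small userspace sketch of that encoding, with buffer handling simplified:

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static unsigned char *encode_rdma_segment(unsigned char *p, uint32_t handle,
						  uint32_t length, uint64_t offset)
	{
		uint32_t be32;

		be32 = htonl(handle);
		memcpy(p, &be32, 4);
		be32 = htonl(length);
		memcpy(p + 4, &be32, 4);
		be32 = htonl((uint32_t)(offset >> 32));	/* hyper: high word first */
		memcpy(p + 8, &be32, 4);
		be32 = htonl((uint32_t)offset);
		memcpy(p + 12, &be32, 4);
		return p + 16;
	}

	int main(void)
	{
		unsigned char buf[16];

		encode_rdma_segment(buf, 0xdeadbeef, 4096, 0x0000100000002000ULL);
		for (int i = 0; i < 16; i++)
			printf("%02x%s", buf[i], i == 15 ? "\n" : " ");
		return 0;
	}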
295 | 298 | ||
296 | /* XDR-encode the Read list. Supports encoding a list of read | 299 | /* XDR-encode the Read list. Supports encoding a list of read |
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, | |||
310 | struct rpcrdma_req *req, struct rpc_rqst *rqst, | 313 | struct rpcrdma_req *req, struct rpc_rqst *rqst, |
311 | __be32 *iptr, enum rpcrdma_chunktype rtype) | 314 | __be32 *iptr, enum rpcrdma_chunktype rtype) |
312 | { | 315 | { |
313 | struct rpcrdma_mr_seg *seg = req->rl_nextseg; | 316 | struct rpcrdma_mr_seg *seg; |
317 | struct rpcrdma_mw *mw; | ||
314 | unsigned int pos; | 318 | unsigned int pos; |
315 | int n, nsegs; | 319 | int n, nsegs; |
316 | 320 | ||
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, | |||
322 | pos = rqst->rq_snd_buf.head[0].iov_len; | 326 | pos = rqst->rq_snd_buf.head[0].iov_len; |
323 | if (rtype == rpcrdma_areadch) | 327 | if (rtype == rpcrdma_areadch) |
324 | pos = 0; | 328 | pos = 0; |
325 | nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, | 329 | seg = req->rl_segments; |
326 | RPCRDMA_MAX_SEGS - req->rl_nchunks); | 330 | nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); |
327 | if (nsegs < 0) | 331 | if (nsegs < 0) |
328 | return ERR_PTR(nsegs); | 332 | return ERR_PTR(nsegs); |
329 | 333 | ||
330 | do { | 334 | do { |
331 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); | 335 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, |
332 | if (n <= 0) | 336 | false, &mw); |
337 | if (n < 0) | ||
333 | return ERR_PTR(n); | 338 | return ERR_PTR(n); |
339 | list_add(&mw->mw_list, &req->rl_registered); | ||
334 | 340 | ||
335 | *iptr++ = xdr_one; /* item present */ | 341 | *iptr++ = xdr_one; /* item present */ |
336 | 342 | ||
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, | |||
338 | * have the same "position". | 344 | * have the same "position". |
339 | */ | 345 | */ |
340 | *iptr++ = cpu_to_be32(pos); | 346 | *iptr++ = cpu_to_be32(pos); |
341 | iptr = xdr_encode_rdma_segment(iptr, seg); | 347 | iptr = xdr_encode_rdma_segment(iptr, mw); |
342 | 348 | ||
343 | dprintk("RPC: %5u %s: read segment pos %u " | 349 | dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", |
344 | "%d@0x%016llx:0x%08x (%s)\n", | ||
345 | rqst->rq_task->tk_pid, __func__, pos, | 350 | rqst->rq_task->tk_pid, __func__, pos, |
346 | seg->mr_len, (unsigned long long)seg->mr_base, | 351 | mw->mw_length, (unsigned long long)mw->mw_offset, |
347 | seg->mr_rkey, n < nsegs ? "more" : "last"); | 352 | mw->mw_handle, n < nsegs ? "more" : "last"); |
348 | 353 | ||
349 | r_xprt->rx_stats.read_chunk_count++; | 354 | r_xprt->rx_stats.read_chunk_count++; |
350 | req->rl_nchunks++; | ||
351 | seg += n; | 355 | seg += n; |
352 | nsegs -= n; | 356 | nsegs -= n; |
353 | } while (nsegs); | 357 | } while (nsegs); |
354 | req->rl_nextseg = seg; | ||
355 | 358 | ||
356 | /* Finish Read list */ | 359 | /* Finish Read list */ |
357 | *iptr++ = xdr_zero; /* Next item not present */ | 360 | *iptr++ = xdr_zero; /* Next item not present */ |
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | |||
375 | struct rpc_rqst *rqst, __be32 *iptr, | 378 | struct rpc_rqst *rqst, __be32 *iptr, |
376 | enum rpcrdma_chunktype wtype) | 379 | enum rpcrdma_chunktype wtype) |
377 | { | 380 | { |
378 | struct rpcrdma_mr_seg *seg = req->rl_nextseg; | 381 | struct rpcrdma_mr_seg *seg; |
382 | struct rpcrdma_mw *mw; | ||
379 | int n, nsegs, nchunks; | 383 | int n, nsegs, nchunks; |
380 | __be32 *segcount; | 384 | __be32 *segcount; |
381 | 385 | ||
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | |||
384 | return iptr; | 388 | return iptr; |
385 | } | 389 | } |
386 | 390 | ||
391 | seg = req->rl_segments; | ||
387 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, | 392 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, |
388 | rqst->rq_rcv_buf.head[0].iov_len, | 393 | rqst->rq_rcv_buf.head[0].iov_len, |
389 | wtype, seg, | 394 | wtype, seg); |
390 | RPCRDMA_MAX_SEGS - req->rl_nchunks); | ||
391 | if (nsegs < 0) | 395 | if (nsegs < 0) |
392 | return ERR_PTR(nsegs); | 396 | return ERR_PTR(nsegs); |
393 | 397 | ||
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, | |||
396 | 400 | ||
397 | nchunks = 0; | 401 | nchunks = 0; |
398 | do { | 402 | do { |
399 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); | 403 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, |
400 | if (n <= 0) | 404 | true, &mw); |
405 | if (n < 0) | ||
401 | return ERR_PTR(n); | 406 | return ERR_PTR(n); |
407 | list_add(&mw->mw_list, &req->rl_registered); | ||
402 | 408 | ||
403 | iptr = xdr_encode_rdma_segment(iptr, seg); | 409 | iptr = xdr_encode_rdma_segment(iptr, mw); |
404 | 410 | ||
405 | dprintk("RPC: %5u %s: write segment " | 411 | dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", |
406 | "%d@0x016%llx:0x%08x (%s)\n", | ||
407 | rqst->rq_task->tk_pid, __func__, | 412 | rqst->rq_task->tk_pid, __func__, |
408 | seg->mr_len, (unsigned long long)seg->mr_base, | 413 | mw->mw_length, (unsigned long long)mw->mw_offset, |
409 | seg->mr_rkey, n < nsegs ? "more" : "last"); | 414 | mw->mw_handle, n < nsegs ? "more" : "last"); |
410 | 415 | ||
411 | r_xprt->rx_stats.write_chunk_count++; | 416 | r_xprt->rx_stats.write_chunk_count++; |
412 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; | 417 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; |
413 | req->rl_nchunks++; | ||
414 | nchunks++; | 418 | nchunks++; |
415 | seg += n; | 419 | seg += n; |
416 | nsegs -= n; | 420 | nsegs -= n; |
417 | } while (nsegs); | 421 | } while (nsegs); |
418 | req->rl_nextseg = seg; | ||
419 | 422 | ||
420 | /* Update count of segments in this Write chunk */ | 423 | /* Update count of segments in this Write chunk */ |
421 | *segcount = cpu_to_be32(nchunks); | 424 | *segcount = cpu_to_be32(nchunks); |
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, | |||
442 | struct rpcrdma_req *req, struct rpc_rqst *rqst, | 445 | struct rpcrdma_req *req, struct rpc_rqst *rqst, |
443 | __be32 *iptr, enum rpcrdma_chunktype wtype) | 446 | __be32 *iptr, enum rpcrdma_chunktype wtype) |
444 | { | 447 | { |
445 | struct rpcrdma_mr_seg *seg = req->rl_nextseg; | 448 | struct rpcrdma_mr_seg *seg; |
449 | struct rpcrdma_mw *mw; | ||
446 | int n, nsegs, nchunks; | 450 | int n, nsegs, nchunks; |
447 | __be32 *segcount; | 451 | __be32 *segcount; |
448 | 452 | ||
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, | |||
451 | return iptr; | 455 | return iptr; |
452 | } | 456 | } |
453 | 457 | ||
454 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, | 458 | seg = req->rl_segments; |
455 | RPCRDMA_MAX_SEGS - req->rl_nchunks); | 459 | nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); |
456 | if (nsegs < 0) | 460 | if (nsegs < 0) |
457 | return ERR_PTR(nsegs); | 461 | return ERR_PTR(nsegs); |
458 | 462 | ||
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, | |||
461 | 465 | ||
462 | nchunks = 0; | 466 | nchunks = 0; |
463 | do { | 467 | do { |
464 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); | 468 | n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, |
465 | if (n <= 0) | 469 | true, &mw); |
470 | if (n < 0) | ||
466 | return ERR_PTR(n); | 471 | return ERR_PTR(n); |
472 | list_add(&mw->mw_list, &req->rl_registered); | ||
467 | 473 | ||
468 | iptr = xdr_encode_rdma_segment(iptr, seg); | 474 | iptr = xdr_encode_rdma_segment(iptr, mw); |
469 | 475 | ||
470 | dprintk("RPC: %5u %s: reply segment " | 476 | dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", |
471 | "%d@0x%016llx:0x%08x (%s)\n", | ||
472 | rqst->rq_task->tk_pid, __func__, | 477 | rqst->rq_task->tk_pid, __func__, |
473 | seg->mr_len, (unsigned long long)seg->mr_base, | 478 | mw->mw_length, (unsigned long long)mw->mw_offset, |
474 | seg->mr_rkey, n < nsegs ? "more" : "last"); | 479 | mw->mw_handle, n < nsegs ? "more" : "last"); |
475 | 480 | ||
476 | r_xprt->rx_stats.reply_chunk_count++; | 481 | r_xprt->rx_stats.reply_chunk_count++; |
477 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; | 482 | r_xprt->rx_stats.total_rdma_request += seg->mr_len; |
478 | req->rl_nchunks++; | ||
479 | nchunks++; | 483 | nchunks++; |
480 | seg += n; | 484 | seg += n; |
481 | nsegs -= n; | 485 | nsegs -= n; |
482 | } while (nsegs); | 486 | } while (nsegs); |
483 | req->rl_nextseg = seg; | ||
484 | 487 | ||
485 | /* Update count of segments in the Reply chunk */ | 488 | /* Update count of segments in the Reply chunk */ |
486 | *segcount = cpu_to_be32(nchunks); | 489 | *segcount = cpu_to_be32(nchunks); |
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
567 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); | 570 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); |
568 | enum rpcrdma_chunktype rtype, wtype; | 571 | enum rpcrdma_chunktype rtype, wtype; |
569 | struct rpcrdma_msg *headerp; | 572 | struct rpcrdma_msg *headerp; |
573 | bool ddp_allowed; | ||
570 | ssize_t hdrlen; | 574 | ssize_t hdrlen; |
571 | size_t rpclen; | 575 | size_t rpclen; |
572 | __be32 *iptr; | 576 | __be32 *iptr; |
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
583 | headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); | 587 | headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); |
584 | headerp->rm_type = rdma_msg; | 588 | headerp->rm_type = rdma_msg; |
585 | 589 | ||
590 | /* When the ULP employs a GSS flavor that guarantees integrity | ||
591 | * or privacy, direct data placement of individual data items | ||
592 | * is not allowed. | ||
593 | */ | ||
594 | ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & | ||
595 | RPCAUTH_AUTH_DATATOUCH); | ||
596 | |||
586 | /* | 597 | /* |
587 | * Chunks needed for results? | 598 | * Chunks needed for results? |
588 | * | 599 | * |
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
594 | */ | 605 | */ |
595 | if (rpcrdma_results_inline(r_xprt, rqst)) | 606 | if (rpcrdma_results_inline(r_xprt, rqst)) |
596 | wtype = rpcrdma_noch; | 607 | wtype = rpcrdma_noch; |
597 | else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) | 608 | else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) |
598 | wtype = rpcrdma_writech; | 609 | wtype = rpcrdma_writech; |
599 | else | 610 | else |
600 | wtype = rpcrdma_replych; | 611 | wtype = rpcrdma_replych; |
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
617 | rtype = rpcrdma_noch; | 628 | rtype = rpcrdma_noch; |
618 | rpcrdma_inline_pullup(rqst); | 629 | rpcrdma_inline_pullup(rqst); |
619 | rpclen = rqst->rq_svec[0].iov_len; | 630 | rpclen = rqst->rq_svec[0].iov_len; |
620 | } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { | 631 | } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { |
621 | rtype = rpcrdma_readch; | 632 | rtype = rpcrdma_readch; |
622 | rpclen = rqst->rq_svec[0].iov_len; | 633 | rpclen = rqst->rq_svec[0].iov_len; |
623 | rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); | 634 | rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); |
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
650 | * send a Call message with a Position Zero Read chunk and a | 661 | * send a Call message with a Position Zero Read chunk and a |
651 | * regular Read chunk at the same time. | 662 | * regular Read chunk at the same time. |
652 | */ | 663 | */ |
653 | req->rl_nchunks = 0; | ||
654 | req->rl_nextseg = req->rl_segments; | ||
655 | iptr = headerp->rm_body.rm_chunks; | 664 | iptr = headerp->rm_body.rm_chunks; |
656 | iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); | 665 | iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); |
657 | if (IS_ERR(iptr)) | 666 | if (IS_ERR(iptr)) |
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) | |||
690 | out_overflow: | 699 | out_overflow: |
691 | pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", | 700 | pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", |
692 | hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); | 701 | hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); |
693 | /* Terminate this RPC. Chunks registered above will be | 702 | iptr = ERR_PTR(-EIO); |
694 | * released by xprt_release -> xprt_rmda_free . | ||
695 | */ | ||
696 | return -EIO; | ||
697 | 703 | ||
698 | out_unmap: | 704 | out_unmap: |
699 | r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); | 705 | r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); |
@@ -705,15 +711,13 @@ out_unmap: | |||
705 | * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) | 711 | * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) |
706 | */ | 712 | */ |
707 | static int | 713 | static int |
708 | rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) | 714 | rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) |
709 | { | 715 | { |
710 | unsigned int i, total_len; | 716 | unsigned int i, total_len; |
711 | struct rpcrdma_write_chunk *cur_wchunk; | 717 | struct rpcrdma_write_chunk *cur_wchunk; |
712 | char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); | 718 | char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); |
713 | 719 | ||
714 | i = be32_to_cpu(**iptrp); | 720 | i = be32_to_cpu(**iptrp); |
715 | if (i > max) | ||
716 | return -1; | ||
717 | cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); | 721 | cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); |
718 | total_len = 0; | 722 | total_len = 0; |
719 | while (i--) { | 723 | while (i--) { |
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b | |||
744 | return total_len; | 748 | return total_len; |
745 | } | 749 | } |
746 | 750 | ||
747 | /* | 751 | /** |
748 | * Scatter inline received data back into provided iov's. | 752 | * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs |
753 | * @rqst: controlling RPC request | ||
754 | * @srcp: points to RPC message payload in receive buffer | ||
755 | * @copy_len: remaining length of receive buffer content | ||
756 | * @pad: Write chunk pad bytes needed (zero for pure inline) | ||
757 | * | ||
758 | * The upper layer has set the maximum number of bytes it can | ||
759 | * receive in each component of rq_rcv_buf. These values are set in | ||
760 | * the head.iov_len, page_len, tail.iov_len, and buflen fields. | ||
761 | * | ||
762 | * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in | ||
763 | * many cases this function simply updates iov_base pointers in | ||
764 | * rq_rcv_buf to point directly to the received reply data, to | ||
765 | * avoid copying reply data. | ||
766 | * | ||
767 | * Returns the count of bytes which had to be memcopied. | ||
749 | */ | 768 | */ |
750 | static void | 769 | static unsigned long |
751 | rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) | 770 | rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) |
752 | { | 771 | { |
753 | int i, npages, curlen, olen; | 772 | unsigned long fixup_copy_count; |
773 | int i, npages, curlen; | ||
754 | char *destp; | 774 | char *destp; |
755 | struct page **ppages; | 775 | struct page **ppages; |
756 | int page_base; | 776 | int page_base; |
757 | 777 | ||
778 | /* The head iovec is redirected to the RPC reply message | ||
779 | * in the receive buffer, to avoid a memcopy. | ||
780 | */ | ||
781 | rqst->rq_rcv_buf.head[0].iov_base = srcp; | ||
782 | rqst->rq_private_buf.head[0].iov_base = srcp; | ||
783 | |||
784 | /* The contents of the receive buffer that follow | ||
785 | * head.iov_len bytes are copied into the page list. | ||
786 | */ | ||
758 | curlen = rqst->rq_rcv_buf.head[0].iov_len; | 787 | curlen = rqst->rq_rcv_buf.head[0].iov_len; |
759 | if (curlen > copy_len) { /* write chunk header fixup */ | 788 | if (curlen > copy_len) |
760 | curlen = copy_len; | 789 | curlen = copy_len; |
761 | rqst->rq_rcv_buf.head[0].iov_len = curlen; | ||
762 | } | ||
763 | |||
764 | dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", | 790 | dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", |
765 | __func__, srcp, copy_len, curlen); | 791 | __func__, srcp, copy_len, curlen); |
766 | |||
767 | /* Shift pointer for first receive segment only */ | ||
768 | rqst->rq_rcv_buf.head[0].iov_base = srcp; | ||
769 | srcp += curlen; | 792 | srcp += curlen; |
770 | copy_len -= curlen; | 793 | copy_len -= curlen; |
771 | 794 | ||
772 | olen = copy_len; | ||
773 | i = 0; | ||
774 | rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; | ||
775 | page_base = rqst->rq_rcv_buf.page_base; | 795 | page_base = rqst->rq_rcv_buf.page_base; |
776 | ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); | 796 | ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); |
777 | page_base &= ~PAGE_MASK; | 797 | page_base &= ~PAGE_MASK; |
778 | 798 | fixup_copy_count = 0; | |
779 | if (copy_len && rqst->rq_rcv_buf.page_len) { | 799 | if (copy_len && rqst->rq_rcv_buf.page_len) { |
780 | npages = PAGE_ALIGN(page_base + | 800 | int pagelist_len; |
781 | rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; | 801 | |
782 | for (; i < npages; i++) { | 802 | pagelist_len = rqst->rq_rcv_buf.page_len; |
803 | if (pagelist_len > copy_len) | ||
804 | pagelist_len = copy_len; | ||
805 | npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; | ||
806 | for (i = 0; i < npages; i++) { | ||
783 | curlen = PAGE_SIZE - page_base; | 807 | curlen = PAGE_SIZE - page_base; |
784 | if (curlen > copy_len) | 808 | if (curlen > pagelist_len) |
785 | curlen = copy_len; | 809 | curlen = pagelist_len; |
810 | |||
786 | dprintk("RPC: %s: page %d" | 811 | dprintk("RPC: %s: page %d" |
787 | " srcp 0x%p len %d curlen %d\n", | 812 | " srcp 0x%p len %d curlen %d\n", |
788 | __func__, i, srcp, copy_len, curlen); | 813 | __func__, i, srcp, copy_len, curlen); |
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) | |||
792 | kunmap_atomic(destp); | 817 | kunmap_atomic(destp); |
793 | srcp += curlen; | 818 | srcp += curlen; |
794 | copy_len -= curlen; | 819 | copy_len -= curlen; |
795 | if (copy_len == 0) | 820 | fixup_copy_count += curlen; |
821 | pagelist_len -= curlen; | ||
822 | if (!pagelist_len) | ||
796 | break; | 823 | break; |
797 | page_base = 0; | 824 | page_base = 0; |
798 | } | 825 | } |
799 | } | ||
800 | 826 | ||
801 | if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { | 827 | /* Implicit padding for the last segment in a Write |
802 | curlen = copy_len; | 828 | * chunk is inserted inline at the front of the tail |
803 | if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) | 829 | * iovec. The upper layer ignores the content of |
804 | curlen = rqst->rq_rcv_buf.tail[0].iov_len; | 830 | * the pad. Simply ensure inline content in the tail |
805 | if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) | 831 | * that follows the Write chunk is properly aligned. |
806 | memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); | 832 | */ |
807 | dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", | 833 | if (pad) |
808 | __func__, srcp, copy_len, curlen); | 834 | srcp -= pad; |
809 | rqst->rq_rcv_buf.tail[0].iov_len = curlen; | ||
810 | copy_len -= curlen; ++i; | ||
811 | } else | ||
812 | rqst->rq_rcv_buf.tail[0].iov_len = 0; | ||
813 | |||
814 | if (pad) { | ||
815 | /* implicit padding on terminal chunk */ | ||
816 | unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; | ||
817 | while (pad--) | ||
818 | p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; | ||
819 | } | 835 | } |
820 | 836 | ||
821 | if (copy_len) | 837 | /* The tail iovec is redirected to the remaining data |
822 | dprintk("RPC: %s: %d bytes in" | 838 | * in the receive buffer, to avoid a memcopy. |
823 | " %d extra segments (%d lost)\n", | 839 | */ |
824 | __func__, olen, i, copy_len); | 840 | if (copy_len || pad) { |
841 | rqst->rq_rcv_buf.tail[0].iov_base = srcp; | ||
842 | rqst->rq_private_buf.tail[0].iov_base = srcp; | ||
843 | } | ||
825 | 844 | ||
826 | /* TBD avoid a warning from call_decode() */ | 845 | return fixup_copy_count; |
827 | rqst->rq_private_buf = rqst->rq_rcv_buf; | ||
828 | } | 846 | } |
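Editor's note: a rough userspace model of the fixup strategy documented above. The head and tail iovecs are simply pointed into the receive buffer, and only the page-list portion is copied, which is what the returned byte count measures. Everything below is a made-up miniature of rq_rcv_buf, not the kernel structures:

	#include <stdio.h>
	#include <string.h>

	struct demo_rcv_buf {
		const char *head_base; size_t head_len;
		char pages[2][8];      size_t page_len;
		const char *tail_base; size_t tail_len;
	};

	/* Returns the number of bytes that had to be memcopied. */
	static size_t demo_inline_fixup(struct demo_rcv_buf *b, const char *src, size_t len)
	{
		size_t copied, cur;

		b->head_base = src;			/* head: redirect, no copy */
		cur = b->head_len < len ? b->head_len : len;
		src += cur; len -= cur;

		cur = b->page_len < len ? b->page_len : len;	/* pages: must copy */
		memcpy(b->pages[0], src, cur > 8 ? 8 : cur);
		if (cur > 8)
			memcpy(b->pages[1], src + 8, cur - 8);
		src += cur; len -= cur; copied = cur;

		b->tail_base = len ? src : NULL;	/* tail: redirect remainder */
		b->tail_len = len;
		return copied;
	}

	int main(void)
	{
		struct demo_rcv_buf b = { .head_len = 4, .page_len = 10 };
		const char msg[] = "HEADpagelistdataTAIL";

		printf("copied %zu bytes\n", demo_inline_fixup(&b, msg, sizeof(msg) - 1));
		return 0;
	}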
829 | 847 | ||
830 | void | 848 | void |
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
960 | (headerp->rm_body.rm_chunks[1] == xdr_zero && | 978 | (headerp->rm_body.rm_chunks[1] == xdr_zero && |
961 | headerp->rm_body.rm_chunks[2] != xdr_zero) || | 979 | headerp->rm_body.rm_chunks[2] != xdr_zero) || |
962 | (headerp->rm_body.rm_chunks[1] != xdr_zero && | 980 | (headerp->rm_body.rm_chunks[1] != xdr_zero && |
963 | req->rl_nchunks == 0)) | 981 | list_empty(&req->rl_registered))) |
964 | goto badheader; | 982 | goto badheader; |
965 | if (headerp->rm_body.rm_chunks[1] != xdr_zero) { | 983 | if (headerp->rm_body.rm_chunks[1] != xdr_zero) { |
966 | /* count any expected write chunks in read reply */ | 984 | /* count any expected write chunks in read reply */ |
967 | /* start at write chunk array count */ | 985 | /* start at write chunk array count */ |
968 | iptr = &headerp->rm_body.rm_chunks[2]; | 986 | iptr = &headerp->rm_body.rm_chunks[2]; |
969 | rdmalen = rpcrdma_count_chunks(rep, | 987 | rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); |
970 | req->rl_nchunks, 1, &iptr); | ||
971 | /* check for validity, and no reply chunk after */ | 988 | /* check for validity, and no reply chunk after */ |
972 | if (rdmalen < 0 || *iptr++ != xdr_zero) | 989 | if (rdmalen < 0 || *iptr++ != xdr_zero) |
973 | goto badheader; | 990 | goto badheader; |
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
988 | rep->rr_len -= RPCRDMA_HDRLEN_MIN; | 1005 | rep->rr_len -= RPCRDMA_HDRLEN_MIN; |
989 | status = rep->rr_len; | 1006 | status = rep->rr_len; |
990 | } | 1007 | } |
991 | /* Fix up the rpc results for upper layer */ | 1008 | |
992 | rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); | 1009 | r_xprt->rx_stats.fixup_copy_count += |
1010 | rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, | ||
1011 | rdmalen); | ||
993 | break; | 1012 | break; |
994 | 1013 | ||
995 | case rdma_nomsg: | 1014 | case rdma_nomsg: |
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
997 | if (headerp->rm_body.rm_chunks[0] != xdr_zero || | 1016 | if (headerp->rm_body.rm_chunks[0] != xdr_zero || |
998 | headerp->rm_body.rm_chunks[1] != xdr_zero || | 1017 | headerp->rm_body.rm_chunks[1] != xdr_zero || |
999 | headerp->rm_body.rm_chunks[2] != xdr_one || | 1018 | headerp->rm_body.rm_chunks[2] != xdr_one || |
1000 | req->rl_nchunks == 0) | 1019 | list_empty(&req->rl_registered)) |
1001 | goto badheader; | 1020 | goto badheader; |
1002 | iptr = (__be32 *)((unsigned char *)headerp + | 1021 | iptr = (__be32 *)((unsigned char *)headerp + |
1003 | RPCRDMA_HDRLEN_MIN); | 1022 | RPCRDMA_HDRLEN_MIN); |
1004 | rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); | 1023 | rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); |
1005 | if (rdmalen < 0) | 1024 | if (rdmalen < 0) |
1006 | goto badheader; | 1025 | goto badheader; |
1007 | r_xprt->rx_stats.total_rdma_reply += rdmalen; | 1026 | r_xprt->rx_stats.total_rdma_reply += rdmalen; |
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) | |||
1014 | 1033 | ||
1015 | badheader: | 1034 | badheader: |
1016 | default: | 1035 | default: |
1017 | dprintk("%s: invalid rpcrdma reply header (type %d):" | 1036 | dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", |
1018 | " chunks[012] == %d %d %d" | 1037 | rqst->rq_task->tk_pid, __func__, |
1019 | " expected chunks <= %d\n", | 1038 | be32_to_cpu(headerp->rm_type)); |
1020 | __func__, be32_to_cpu(headerp->rm_type), | ||
1021 | headerp->rm_body.rm_chunks[0], | ||
1022 | headerp->rm_body.rm_chunks[1], | ||
1023 | headerp->rm_body.rm_chunks[2], | ||
1024 | req->rl_nchunks); | ||
1025 | status = -EIO; | 1039 | status = -EIO; |
1026 | r_xprt->rx_stats.bad_reply_count++; | 1040 | r_xprt->rx_stats.bad_reply_count++; |
1027 | break; | 1041 | break; |
@@ -1035,7 +1049,7 @@ out: | |||
1035 | * control: waking the next RPC waits until this RPC has | 1049 | * control: waking the next RPC waits until this RPC has |
1036 | * relinquished all its Send Queue entries. | 1050 | * relinquished all its Send Queue entries. |
1037 | */ | 1051 | */ |
1038 | if (req->rl_nchunks) | 1052 | if (!list_empty(&req->rl_registered)) |
1039 | r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); | 1053 | r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); |
1040 | 1054 | ||
1041 | spin_lock_bh(&xprt->transport_lock); | 1055 | spin_lock_bh(&xprt->transport_lock); |
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 99d2e5b72726..81f0e879f019 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c | |||
@@ -558,7 +558,6 @@ out_sendbuf: | |||
558 | 558 | ||
559 | out_fail: | 559 | out_fail: |
560 | rpcrdma_buffer_put(req); | 560 | rpcrdma_buffer_put(req); |
561 | r_xprt->rx_stats.failed_marshal_count++; | ||
562 | return NULL; | 561 | return NULL; |
563 | } | 562 | } |
564 | 563 | ||
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer) | |||
590 | rpcrdma_buffer_put(req); | 589 | rpcrdma_buffer_put(req); |
591 | } | 590 | } |
592 | 591 | ||
593 | /* | 592 | /** |
593 | * xprt_rdma_send_request - marshal and send an RPC request | ||
594 | * @task: RPC task with an RPC message in rq_snd_buf | ||
595 | * | ||
596 | * Return values: | ||
597 | * 0: The request has been sent | ||
598 | * ENOTCONN: Caller needs to invoke connect logic then call again | ||
599 | * ENOBUFS: Call again later to send the request | ||
600 | * EIO: A permanent error occurred. The request was not sent, | ||
601 | * and don't try it again | ||
602 | * | ||
594 | * send_request invokes the meat of RPC RDMA. It must do the following: | 603 | * send_request invokes the meat of RPC RDMA. It must do the following: |
604 | * | ||
595 | * 1. Marshal the RPC request into an RPC RDMA request, which means | 605 | * 1. Marshal the RPC request into an RPC RDMA request, which means |
596 | * putting a header in front of data, and creating IOVs for RDMA | 606 | * putting a header in front of data, and creating IOVs for RDMA |
597 | * from those in the request. | 607 | * from those in the request. |
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer) | |||
600 | * the request (rpcrdma_ep_post). | 610 | * the request (rpcrdma_ep_post). |
601 | * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). | 611 | * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). |
602 | */ | 612 | */ |
603 | |||
604 | static int | 613 | static int |
605 | xprt_rdma_send_request(struct rpc_task *task) | 614 | xprt_rdma_send_request(struct rpc_task *task) |
606 | { | 615 | { |
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task) | |||
610 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); | 619 | struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); |
611 | int rc = 0; | 620 | int rc = 0; |
612 | 621 | ||
622 | /* On retransmit, remove any previously registered chunks */ | ||
623 | r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); | ||
624 | |||
613 | rc = rpcrdma_marshal_req(rqst); | 625 | rc = rpcrdma_marshal_req(rqst); |
614 | if (rc < 0) | 626 | if (rc < 0) |
615 | goto failed_marshal; | 627 | goto failed_marshal; |
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task) | |||
630 | return 0; | 642 | return 0; |
631 | 643 | ||
632 | failed_marshal: | 644 | failed_marshal: |
633 | r_xprt->rx_stats.failed_marshal_count++; | ||
634 | dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", | 645 | dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", |
635 | __func__, rc); | 646 | __func__, rc); |
636 | if (rc == -EIO) | 647 | if (rc == -EIO) |
637 | return -EIO; | 648 | r_xprt->rx_stats.failed_marshal_count++; |
649 | if (rc != -ENOTCONN) | ||
650 | return rc; | ||
638 | drop_connection: | 651 | drop_connection: |
639 | xprt_disconnect_done(xprt); | 652 | xprt_disconnect_done(xprt); |
640 | return -ENOTCONN; /* implies disconnect */ | 653 | return -ENOTCONN; /* implies disconnect */ |
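Editor's note: the return values documented for xprt_rdma_send_request() above split into retryable and permanent cases. A hypothetical caller might act on them as sketched below; demo_send_request() merely stands in for the transport call:

	#include <errno.h>
	#include <stdio.h>

	static int demo_send_request(void)
	{
		return -ENOBUFS;	/* pretend resources were temporarily exhausted */
	}

	int main(void)
	{
		switch (demo_send_request()) {
		case 0:
			printf("request sent\n");
			break;
		case -ENOTCONN:
			printf("reconnect, then retry\n");
			break;
		case -ENOBUFS:
			printf("back off and retry later\n");
			break;
		case -EIO:
			printf("permanent error, fail the RPC\n");
			break;
		}
		return 0;
	}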
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) | |||
660 | xprt->stat.bad_xids, | 673 | xprt->stat.bad_xids, |
661 | xprt->stat.req_u, | 674 | xprt->stat.req_u, |
662 | xprt->stat.bklog_u); | 675 | xprt->stat.bklog_u); |
663 | seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", | 676 | seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", |
664 | r_xprt->rx_stats.read_chunk_count, | 677 | r_xprt->rx_stats.read_chunk_count, |
665 | r_xprt->rx_stats.write_chunk_count, | 678 | r_xprt->rx_stats.write_chunk_count, |
666 | r_xprt->rx_stats.reply_chunk_count, | 679 | r_xprt->rx_stats.reply_chunk_count, |
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) | |||
672 | r_xprt->rx_stats.failed_marshal_count, | 685 | r_xprt->rx_stats.failed_marshal_count, |
673 | r_xprt->rx_stats.bad_reply_count, | 686 | r_xprt->rx_stats.bad_reply_count, |
674 | r_xprt->rx_stats.nomsg_call_count); | 687 | r_xprt->rx_stats.nomsg_call_count); |
688 | seq_printf(seq, "%lu %lu %lu\n", | ||
689 | r_xprt->rx_stats.mrs_recovered, | ||
690 | r_xprt->rx_stats.mrs_orphaned, | ||
691 | r_xprt->rx_stats.mrs_allocated); | ||
675 | } | 692 | } |
676 | 693 | ||
677 | static int | 694 | static int |
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void) | |||
741 | __func__, rc); | 758 | __func__, rc); |
742 | 759 | ||
743 | rpcrdma_destroy_wq(); | 760 | rpcrdma_destroy_wq(); |
744 | frwr_destroy_recovery_wq(); | ||
745 | 761 | ||
746 | rc = xprt_unregister_transport(&xprt_rdma_bc); | 762 | rc = xprt_unregister_transport(&xprt_rdma_bc); |
747 | if (rc) | 763 | if (rc) |
@@ -753,20 +769,13 @@ int xprt_rdma_init(void) | |||
753 | { | 769 | { |
754 | int rc; | 770 | int rc; |
755 | 771 | ||
756 | rc = frwr_alloc_recovery_wq(); | ||
757 | if (rc) | ||
758 | return rc; | ||
759 | |||
760 | rc = rpcrdma_alloc_wq(); | 772 | rc = rpcrdma_alloc_wq(); |
761 | if (rc) { | 773 | if (rc) |
762 | frwr_destroy_recovery_wq(); | ||
763 | return rc; | 774 | return rc; |
764 | } | ||
765 | 775 | ||
766 | rc = xprt_register_transport(&xprt_rdma); | 776 | rc = xprt_register_transport(&xprt_rdma); |
767 | if (rc) { | 777 | if (rc) { |
768 | rpcrdma_destroy_wq(); | 778 | rpcrdma_destroy_wq(); |
769 | frwr_destroy_recovery_wq(); | ||
770 | return rc; | 779 | return rc; |
771 | } | 780 | } |
772 | 781 | ||
@@ -774,7 +783,6 @@ int xprt_rdma_init(void) | |||
774 | if (rc) { | 783 | if (rc) { |
775 | xprt_unregister_transport(&xprt_rdma); | 784 | xprt_unregister_transport(&xprt_rdma); |
776 | rpcrdma_destroy_wq(); | 785 | rpcrdma_destroy_wq(); |
777 | frwr_destroy_recovery_wq(); | ||
778 | return rc; | 786 | return rc; |
779 | } | 787 | } |
780 | 788 | ||
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b044d98a1370..536d0be3f61b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c | |||
@@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) | |||
379 | struct rpcrdma_ia *ia = &xprt->rx_ia; | 379 | struct rpcrdma_ia *ia = &xprt->rx_ia; |
380 | int rc; | 380 | int rc; |
381 | 381 | ||
382 | ia->ri_dma_mr = NULL; | ||
383 | |||
384 | ia->ri_id = rpcrdma_create_id(xprt, ia, addr); | 382 | ia->ri_id = rpcrdma_create_id(xprt, ia, addr); |
385 | if (IS_ERR(ia->ri_id)) { | 383 | if (IS_ERR(ia->ri_id)) { |
386 | rc = PTR_ERR(ia->ri_id); | 384 | rc = PTR_ERR(ia->ri_id); |
@@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) | |||
391 | ia->ri_pd = ib_alloc_pd(ia->ri_device); | 389 | ia->ri_pd = ib_alloc_pd(ia->ri_device); |
392 | if (IS_ERR(ia->ri_pd)) { | 390 | if (IS_ERR(ia->ri_pd)) { |
393 | rc = PTR_ERR(ia->ri_pd); | 391 | rc = PTR_ERR(ia->ri_pd); |
394 | dprintk("RPC: %s: ib_alloc_pd() failed %i\n", | 392 | pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); |
395 | __func__, rc); | ||
396 | goto out2; | 393 | goto out2; |
397 | } | 394 | } |
398 | 395 | ||
399 | if (memreg == RPCRDMA_FRMR) { | ||
400 | if (!(ia->ri_device->attrs.device_cap_flags & | ||
401 | IB_DEVICE_MEM_MGT_EXTENSIONS) || | ||
402 | (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { | ||
403 | dprintk("RPC: %s: FRMR registration " | ||
404 | "not supported by HCA\n", __func__); | ||
405 | memreg = RPCRDMA_MTHCAFMR; | ||
406 | } | ||
407 | } | ||
408 | if (memreg == RPCRDMA_MTHCAFMR) { | ||
409 | if (!ia->ri_device->alloc_fmr) { | ||
410 | dprintk("RPC: %s: MTHCAFMR registration " | ||
411 | "not supported by HCA\n", __func__); | ||
412 | rc = -EINVAL; | ||
413 | goto out3; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | switch (memreg) { | 396 | switch (memreg) { |
418 | case RPCRDMA_FRMR: | 397 | case RPCRDMA_FRMR: |
419 | ia->ri_ops = &rpcrdma_frwr_memreg_ops; | 398 | if (frwr_is_supported(ia)) { |
420 | break; | 399 | ia->ri_ops = &rpcrdma_frwr_memreg_ops; |
421 | case RPCRDMA_ALLPHYSICAL: | 400 | break; |
422 | ia->ri_ops = &rpcrdma_physical_memreg_ops; | 401 | } |
423 | break; | 402 | /*FALLTHROUGH*/ |
424 | case RPCRDMA_MTHCAFMR: | 403 | case RPCRDMA_MTHCAFMR: |
425 | ia->ri_ops = &rpcrdma_fmr_memreg_ops; | 404 | if (fmr_is_supported(ia)) { |
426 | break; | 405 | ia->ri_ops = &rpcrdma_fmr_memreg_ops; |
406 | break; | ||
407 | } | ||
408 | /*FALLTHROUGH*/ | ||
427 | default: | 409 | default: |
428 | printk(KERN_ERR "RPC: Unsupported memory " | 410 | pr_err("rpcrdma: Unsupported memory registration mode: %d\n", |
429 | "registration mode: %d\n", memreg); | 411 | memreg); |
430 | rc = -ENOMEM; | 412 | rc = -EINVAL; |
431 | goto out3; | 413 | goto out3; |
432 | } | 414 | } |
433 | dprintk("RPC: %s: memory registration strategy is '%s'\n", | ||
434 | __func__, ia->ri_ops->ro_displayname); | ||
435 | 415 | ||
436 | return 0; | 416 | return 0; |
437 | 417 | ||
@@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, | |||
585 | out2: | 565 | out2: |
586 | ib_free_cq(sendcq); | 566 | ib_free_cq(sendcq); |
587 | out1: | 567 | out1: |
588 | if (ia->ri_dma_mr) | ||
589 | ib_dereg_mr(ia->ri_dma_mr); | ||
590 | return rc; | 568 | return rc; |
591 | } | 569 | } |
592 | 570 | ||
@@ -600,8 +578,6 @@ out1: | |||
600 | void | 578 | void |
601 | rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | 579 | rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) |
602 | { | 580 | { |
603 | int rc; | ||
604 | |||
605 | dprintk("RPC: %s: entering, connected is %d\n", | 581 | dprintk("RPC: %s: entering, connected is %d\n", |
606 | __func__, ep->rep_connected); | 582 | __func__, ep->rep_connected); |
607 | 583 | ||
@@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
615 | 591 | ||
616 | ib_free_cq(ep->rep_attr.recv_cq); | 592 | ib_free_cq(ep->rep_attr.recv_cq); |
617 | ib_free_cq(ep->rep_attr.send_cq); | 593 | ib_free_cq(ep->rep_attr.send_cq); |
618 | |||
619 | if (ia->ri_dma_mr) { | ||
620 | rc = ib_dereg_mr(ia->ri_dma_mr); | ||
621 | dprintk("RPC: %s: ib_dereg_mr returned %i\n", | ||
622 | __func__, rc); | ||
623 | } | ||
624 | } | 594 | } |
625 | 595 | ||
626 | /* | 596 | /* |
@@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
777 | ib_drain_qp(ia->ri_id->qp); | 747 | ib_drain_qp(ia->ri_id->qp); |
778 | } | 748 | } |
779 | 749 | ||
750 | static void | ||
751 | rpcrdma_mr_recovery_worker(struct work_struct *work) | ||
752 | { | ||
753 | struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, | ||
754 | rb_recovery_worker.work); | ||
755 | struct rpcrdma_mw *mw; | ||
756 | |||
757 | spin_lock(&buf->rb_recovery_lock); | ||
758 | while (!list_empty(&buf->rb_stale_mrs)) { | ||
759 | mw = list_first_entry(&buf->rb_stale_mrs, | ||
760 | struct rpcrdma_mw, mw_list); | ||
761 | list_del_init(&mw->mw_list); | ||
762 | spin_unlock(&buf->rb_recovery_lock); | ||
763 | |||
764 | dprintk("RPC: %s: recovering MR %p\n", __func__, mw); | ||
765 | mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); | ||
766 | |||
767 | spin_lock(&buf->rb_recovery_lock); | ||
768 | } | ||
769 | spin_unlock(&buf->rb_recovery_lock); | ||
770 | } | ||
771 | |||
772 | void | ||
773 | rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) | ||
774 | { | ||
775 | struct rpcrdma_xprt *r_xprt = mw->mw_xprt; | ||
776 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
777 | |||
778 | spin_lock(&buf->rb_recovery_lock); | ||
779 | list_add(&mw->mw_list, &buf->rb_stale_mrs); | ||
780 | spin_unlock(&buf->rb_recovery_lock); | ||
781 | |||
782 | schedule_delayed_work(&buf->rb_recovery_worker, 0); | ||
783 | } | ||
784 | |||
785 | static void | ||
786 | rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) | ||
787 | { | ||
788 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | ||
789 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
790 | unsigned int count; | ||
791 | LIST_HEAD(free); | ||
792 | LIST_HEAD(all); | ||
793 | |||
794 | for (count = 0; count < 32; count++) { | ||
795 | struct rpcrdma_mw *mw; | ||
796 | int rc; | ||
797 | |||
798 | mw = kzalloc(sizeof(*mw), GFP_KERNEL); | ||
799 | if (!mw) | ||
800 | break; | ||
801 | |||
802 | rc = ia->ri_ops->ro_init_mr(ia, mw); | ||
803 | if (rc) { | ||
804 | kfree(mw); | ||
805 | break; | ||
806 | } | ||
807 | |||
808 | mw->mw_xprt = r_xprt; | ||
809 | |||
810 | list_add(&mw->mw_list, &free); | ||
811 | list_add(&mw->mw_all, &all); | ||
812 | } | ||
813 | |||
814 | spin_lock(&buf->rb_mwlock); | ||
815 | list_splice(&free, &buf->rb_mws); | ||
816 | list_splice(&all, &buf->rb_all); | ||
817 | r_xprt->rx_stats.mrs_allocated += count; | ||
818 | spin_unlock(&buf->rb_mwlock); | ||
819 | |||
820 | dprintk("RPC: %s: created %u MRs\n", __func__, count); | ||
821 | } | ||
822 | |||
823 | static void | ||
824 | rpcrdma_mr_refresh_worker(struct work_struct *work) | ||
825 | { | ||
826 | struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, | ||
827 | rb_refresh_worker.work); | ||
828 | struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, | ||
829 | rx_buf); | ||
830 | |||
831 | rpcrdma_create_mrs(r_xprt); | ||
832 | } | ||
833 | |||
780 | struct rpcrdma_req * | 834 | struct rpcrdma_req * |
781 | rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) | 835 | rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) |
782 | { | 836 | { |
@@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) | |||
793 | spin_unlock(&buffer->rb_reqslock); | 847 | spin_unlock(&buffer->rb_reqslock); |
794 | req->rl_cqe.done = rpcrdma_wc_send; | 848 | req->rl_cqe.done = rpcrdma_wc_send; |
795 | req->rl_buffer = &r_xprt->rx_buf; | 849 | req->rl_buffer = &r_xprt->rx_buf; |
850 | INIT_LIST_HEAD(&req->rl_registered); | ||
796 | return req; | 851 | return req; |
797 | } | 852 | } |
798 | 853 | ||
@@ -832,17 +887,23 @@ int | |||
832 | rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) | 887 | rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) |
833 | { | 888 | { |
834 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; | 889 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; |
835 | struct rpcrdma_ia *ia = &r_xprt->rx_ia; | ||
836 | int i, rc; | 890 | int i, rc; |
837 | 891 | ||
838 | buf->rb_max_requests = r_xprt->rx_data.max_requests; | 892 | buf->rb_max_requests = r_xprt->rx_data.max_requests; |
839 | buf->rb_bc_srv_max_requests = 0; | 893 | buf->rb_bc_srv_max_requests = 0; |
840 | spin_lock_init(&buf->rb_lock); | ||
841 | atomic_set(&buf->rb_credits, 1); | 894 | atomic_set(&buf->rb_credits, 1); |
895 | spin_lock_init(&buf->rb_mwlock); | ||
896 | spin_lock_init(&buf->rb_lock); | ||
897 | spin_lock_init(&buf->rb_recovery_lock); | ||
898 | INIT_LIST_HEAD(&buf->rb_mws); | ||
899 | INIT_LIST_HEAD(&buf->rb_all); | ||
900 | INIT_LIST_HEAD(&buf->rb_stale_mrs); | ||
901 | INIT_DELAYED_WORK(&buf->rb_refresh_worker, | ||
902 | rpcrdma_mr_refresh_worker); | ||
903 | INIT_DELAYED_WORK(&buf->rb_recovery_worker, | ||
904 | rpcrdma_mr_recovery_worker); | ||
842 | 905 | ||
843 | rc = ia->ri_ops->ro_init(r_xprt); | 906 | rpcrdma_create_mrs(r_xprt); |
844 | if (rc) | ||
845 | goto out; | ||
846 | 907 | ||
847 | INIT_LIST_HEAD(&buf->rb_send_bufs); | 908 | INIT_LIST_HEAD(&buf->rb_send_bufs); |
848 | INIT_LIST_HEAD(&buf->rb_allreqs); | 909 | INIT_LIST_HEAD(&buf->rb_allreqs); |
@@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) | |||
862 | } | 923 | } |
863 | 924 | ||
864 | INIT_LIST_HEAD(&buf->rb_recv_bufs); | 925 | INIT_LIST_HEAD(&buf->rb_recv_bufs); |
865 | for (i = 0; i < buf->rb_max_requests + 2; i++) { | 926 | for (i = 0; i < buf->rb_max_requests; i++) { |
866 | struct rpcrdma_rep *rep; | 927 | struct rpcrdma_rep *rep; |
867 | 928 | ||
868 | rep = rpcrdma_create_rep(r_xprt); | 929 | rep = rpcrdma_create_rep(r_xprt); |
@@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) | |||
918 | kfree(req); | 979 | kfree(req); |
919 | } | 980 | } |
920 | 981 | ||
982 | static void | ||
983 | rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) | ||
984 | { | ||
985 | struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, | ||
986 | rx_buf); | ||
987 | struct rpcrdma_ia *ia = rdmab_to_ia(buf); | ||
988 | struct rpcrdma_mw *mw; | ||
989 | unsigned int count; | ||
990 | |||
991 | count = 0; | ||
992 | spin_lock(&buf->rb_mwlock); | ||
993 | while (!list_empty(&buf->rb_all)) { | ||
994 | mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); | ||
995 | list_del(&mw->mw_all); | ||
996 | |||
997 | spin_unlock(&buf->rb_mwlock); | ||
998 | ia->ri_ops->ro_release_mr(mw); | ||
999 | count++; | ||
1000 | spin_lock(&buf->rb_mwlock); | ||
1001 | } | ||
1002 | spin_unlock(&buf->rb_mwlock); | ||
1003 | r_xprt->rx_stats.mrs_allocated = 0; | ||
1004 | |||
1005 | dprintk("RPC: %s: released %u MRs\n", __func__, count); | ||
1006 | } | ||
1007 | |||
921 | void | 1008 | void |
922 | rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) | 1009 | rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) |
923 | { | 1010 | { |
924 | struct rpcrdma_ia *ia = rdmab_to_ia(buf); | 1011 | struct rpcrdma_ia *ia = rdmab_to_ia(buf); |
925 | 1012 | ||
1013 | cancel_delayed_work_sync(&buf->rb_recovery_worker); | ||
1014 | |||
926 | while (!list_empty(&buf->rb_recv_bufs)) { | 1015 | while (!list_empty(&buf->rb_recv_bufs)) { |
927 | struct rpcrdma_rep *rep; | 1016 | struct rpcrdma_rep *rep; |
928 | 1017 | ||
@@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) | |||
944 | } | 1033 | } |
945 | spin_unlock(&buf->rb_reqslock); | 1034 | spin_unlock(&buf->rb_reqslock); |
946 | 1035 | ||
947 | ia->ri_ops->ro_destroy(buf); | 1036 | rpcrdma_destroy_mrs(buf); |
948 | } | 1037 | } |
949 | 1038 | ||
950 | struct rpcrdma_mw * | 1039 | struct rpcrdma_mw * |
@@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) | |||
962 | spin_unlock(&buf->rb_mwlock); | 1051 | spin_unlock(&buf->rb_mwlock); |
963 | 1052 | ||
964 | if (!mw) | 1053 | if (!mw) |
965 | pr_err("RPC: %s: no MWs available\n", __func__); | 1054 | goto out_nomws; |
966 | return mw; | 1055 | return mw; |
1056 | |||
1057 | out_nomws: | ||
1058 | dprintk("RPC: %s: no MWs available\n", __func__); | ||
1059 | schedule_delayed_work(&buf->rb_refresh_worker, 0); | ||
1060 | |||
1061 | /* Allow the reply handler and refresh worker to run */ | ||
1062 | cond_resched(); | ||
1063 | |||
1064 | return NULL; | ||
967 | } | 1065 | } |
968 | 1066 | ||
969 | void | 1067 | void |
@@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) | |||
978 | 1076 | ||
979 | /* | 1077 | /* |
980 | * Get a set of request/reply buffers. | 1078 | * Get a set of request/reply buffers. |
981 | * | ||
982 | * Reply buffer (if available) is attached to send buffer upon return. | ||
983 | */ | 1079 | */ |
984 | struct rpcrdma_req * | 1080 | struct rpcrdma_req * |
985 | rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) | 1081 | rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) |
@@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) | |||
998 | 1094 | ||
999 | out_reqbuf: | 1095 | out_reqbuf: |
1000 | spin_unlock(&buffers->rb_lock); | 1096 | spin_unlock(&buffers->rb_lock); |
1001 | pr_warn("RPC: %s: out of request buffers\n", __func__); | 1097 | pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); |
1002 | return NULL; | 1098 | return NULL; |
1003 | out_repbuf: | 1099 | out_repbuf: |
1100 | list_add(&req->rl_free, &buffers->rb_send_bufs); | ||
1004 | spin_unlock(&buffers->rb_lock); | 1101 | spin_unlock(&buffers->rb_lock); |
1005 | pr_warn("RPC: %s: out of reply buffers\n", __func__); | 1102 | pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); |
1006 | req->rl_reply = NULL; | 1103 | return NULL; |
1007 | return req; | ||
1008 | } | 1104 | } |
1009 | 1105 | ||
1010 | /* | 1106 | /* |
@@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) | |||
1060 | * Wrappers for internal-use kmalloc memory registration, used by buffer code. | 1156 | * Wrappers for internal-use kmalloc memory registration, used by buffer code. |
1061 | */ | 1157 | */ |
1062 | 1158 | ||
1063 | void | ||
1064 | rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) | ||
1065 | { | ||
1066 | dprintk("RPC: map_one: offset %p iova %llx len %zu\n", | ||
1067 | seg->mr_offset, | ||
1068 | (unsigned long long)seg->mr_dma, seg->mr_dmalen); | ||
1069 | } | ||
1070 | |||
1071 | /** | 1159 | /** |
1072 | * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers | 1160 | * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers |
1073 | * @ia: controlling rpcrdma_ia | 1161 | * @ia: controlling rpcrdma_ia |
@@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, | |||
1150 | if (rep) { | 1238 | if (rep) { |
1151 | rc = rpcrdma_ep_post_recv(ia, ep, rep); | 1239 | rc = rpcrdma_ep_post_recv(ia, ep, rep); |
1152 | if (rc) | 1240 | if (rc) |
1153 | goto out; | 1241 | return rc; |
1154 | req->rl_reply = NULL; | 1242 | req->rl_reply = NULL; |
1155 | } | 1243 | } |
1156 | 1244 | ||
@@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, | |||
1175 | 1263 | ||
1176 | rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); | 1264 | rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); |
1177 | if (rc) | 1265 | if (rc) |
1178 | dprintk("RPC: %s: ib_post_send returned %i\n", __func__, | 1266 | goto out_postsend_err; |
1179 | rc); | 1267 | return 0; |
1180 | out: | 1268 | |
1181 | return rc; | 1269 | out_postsend_err: |
1270 | pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); | ||
1271 | return -ENOTCONN; | ||
1182 | } | 1272 | } |
1183 | 1273 | ||
1184 | /* | 1274 | /* |
@@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, | |||
1203 | DMA_BIDIRECTIONAL); | 1293 | DMA_BIDIRECTIONAL); |
1204 | 1294 | ||
1205 | rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); | 1295 | rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); |
1206 | |||
1207 | if (rc) | 1296 | if (rc) |
1208 | dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, | 1297 | goto out_postrecv; |
1209 | rc); | 1298 | return 0; |
1210 | return rc; | 1299 | |
1300 | out_postrecv: | ||
1301 | pr_err("rpcrdma: ib_post_recv returned %i\n", rc); | ||
1302 | return -ENOTCONN; | ||
1211 | } | 1303 | } |
1212 | 1304 | ||
1213 | /** | 1305 | /** |
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 95cdc66225ee..670fad57153a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h | |||
@@ -68,7 +68,6 @@ struct rpcrdma_ia { | |||
68 | struct ib_device *ri_device; | 68 | struct ib_device *ri_device; |
69 | struct rdma_cm_id *ri_id; | 69 | struct rdma_cm_id *ri_id; |
70 | struct ib_pd *ri_pd; | 70 | struct ib_pd *ri_pd; |
71 | struct ib_mr *ri_dma_mr; | ||
72 | struct completion ri_done; | 71 | struct completion ri_done; |
73 | int ri_async_rc; | 72 | int ri_async_rc; |
74 | unsigned int ri_max_frmr_depth; | 73 | unsigned int ri_max_frmr_depth; |
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) | |||
172 | * o recv buffer (posted to provider) | 171 | * o recv buffer (posted to provider) |
173 | * o ib_sge (also donated to provider) | 172 | * o ib_sge (also donated to provider) |
174 | * o status of reply (length, success or not) | 173 | * o status of reply (length, success or not) |
175 | * o bookkeeping state to get run by tasklet (list, etc) | 174 | * o bookkeeping state to get run by reply handler (list, etc) |
176 | * | 175 | * |
177 | * These are allocated during initialization, per-transport instance; | 176 | * These are allocated during initialization, per-transport instance. |
178 | * however, the tasklet execution list itself is global, as it should | ||
179 | * always be pretty short. | ||
180 | * | 177 | * |
181 | * N of these are associated with a transport instance, and stored in | 178 | * N of these are associated with a transport instance, and stored in |
182 | * struct rpcrdma_buffer. N is the max number of outstanding requests. | 179 | * struct rpcrdma_buffer. N is the max number of outstanding requests. |
183 | */ | 180 | */ |
184 | 181 | ||
185 | #define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) | ||
186 | |||
187 | /* data segments + head/tail for Call + head/tail for Reply */ | ||
188 | #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) | ||
189 | |||
190 | struct rpcrdma_buffer; | ||
191 | |||
192 | struct rpcrdma_rep { | 182 | struct rpcrdma_rep { |
193 | struct ib_cqe rr_cqe; | 183 | struct ib_cqe rr_cqe; |
194 | unsigned int rr_len; | 184 | unsigned int rr_len; |
@@ -221,9 +211,6 @@ enum rpcrdma_frmr_state { | |||
221 | }; | 211 | }; |
222 | 212 | ||
223 | struct rpcrdma_frmr { | 213 | struct rpcrdma_frmr { |
224 | struct scatterlist *fr_sg; | ||
225 | int fr_nents; | ||
226 | enum dma_data_direction fr_dir; | ||
227 | struct ib_mr *fr_mr; | 214 | struct ib_mr *fr_mr; |
228 | struct ib_cqe fr_cqe; | 215 | struct ib_cqe fr_cqe; |
229 | enum rpcrdma_frmr_state fr_state; | 216 | enum rpcrdma_frmr_state fr_state; |
@@ -235,18 +222,23 @@ struct rpcrdma_frmr { | |||
235 | }; | 222 | }; |
236 | 223 | ||
237 | struct rpcrdma_fmr { | 224 | struct rpcrdma_fmr { |
238 | struct ib_fmr *fmr; | 225 | struct ib_fmr *fm_mr; |
239 | u64 *physaddrs; | 226 | u64 *fm_physaddrs; |
240 | }; | 227 | }; |
241 | 228 | ||
242 | struct rpcrdma_mw { | 229 | struct rpcrdma_mw { |
230 | struct list_head mw_list; | ||
231 | struct scatterlist *mw_sg; | ||
232 | int mw_nents; | ||
233 | enum dma_data_direction mw_dir; | ||
243 | union { | 234 | union { |
244 | struct rpcrdma_fmr fmr; | 235 | struct rpcrdma_fmr fmr; |
245 | struct rpcrdma_frmr frmr; | 236 | struct rpcrdma_frmr frmr; |
246 | }; | 237 | }; |
247 | struct work_struct mw_work; | ||
248 | struct rpcrdma_xprt *mw_xprt; | 238 | struct rpcrdma_xprt *mw_xprt; |
249 | struct list_head mw_list; | 239 | u32 mw_handle; |
240 | u32 mw_length; | ||
241 | u64 mw_offset; | ||
250 | struct list_head mw_all; | 242 | struct list_head mw_all; |
251 | }; | 243 | }; |
252 | 244 | ||
@@ -266,33 +258,30 @@ struct rpcrdma_mw { | |||
266 | * of iovs for send operations. The reason is that the iovs passed to | 258 | * of iovs for send operations. The reason is that the iovs passed to |
267 | * ib_post_{send,recv} must not be modified until the work request | 259 | * ib_post_{send,recv} must not be modified until the work request |
268 | * completes. | 260 | * completes. |
269 | * | ||
270 | * NOTES: | ||
271 | * o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we | ||
272 | * marshal. The number needed varies depending on the iov lists that | ||
273 | * are passed to us, the memory registration mode we are in, and if | ||
274 | * physical addressing is used, the layout. | ||
275 | */ | 261 | */ |
276 | 262 | ||
263 | /* Maximum number of page-sized "segments" per chunk list to be | ||
264 | * registered or invalidated. Must handle a Reply chunk: | ||
265 | */ | ||
266 | enum { | ||
267 | RPCRDMA_MAX_IOV_SEGS = 3, | ||
268 | RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, | ||
269 | RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + | ||
270 | RPCRDMA_MAX_IOV_SEGS, | ||
271 | }; | ||
272 | |||
277 | struct rpcrdma_mr_seg { /* chunk descriptors */ | 273 | struct rpcrdma_mr_seg { /* chunk descriptors */ |
278 | struct rpcrdma_mw *rl_mw; /* registered MR */ | ||
279 | u64 mr_base; /* registration result */ | ||
280 | u32 mr_rkey; /* registration result */ | ||
281 | u32 mr_len; /* length of chunk or segment */ | 274 | u32 mr_len; /* length of chunk or segment */ |
282 | int mr_nsegs; /* number of segments in chunk or 0 */ | ||
283 | enum dma_data_direction mr_dir; /* segment mapping direction */ | ||
284 | dma_addr_t mr_dma; /* segment mapping address */ | ||
285 | size_t mr_dmalen; /* segment mapping length */ | ||
286 | struct page *mr_page; /* owning page, if any */ | 275 | struct page *mr_page; /* owning page, if any */ |
287 | char *mr_offset; /* kva if no page, else offset */ | 276 | char *mr_offset; /* kva if no page, else offset */ |
288 | }; | 277 | }; |
289 | 278 | ||
290 | #define RPCRDMA_MAX_IOVS (2) | 279 | #define RPCRDMA_MAX_IOVS (2) |
291 | 280 | ||
281 | struct rpcrdma_buffer; | ||
292 | struct rpcrdma_req { | 282 | struct rpcrdma_req { |
293 | struct list_head rl_free; | 283 | struct list_head rl_free; |
294 | unsigned int rl_niovs; | 284 | unsigned int rl_niovs; |
295 | unsigned int rl_nchunks; | ||
296 | unsigned int rl_connect_cookie; | 285 | unsigned int rl_connect_cookie; |
297 | struct rpc_task *rl_task; | 286 | struct rpc_task *rl_task; |
298 | struct rpcrdma_buffer *rl_buffer; | 287 | struct rpcrdma_buffer *rl_buffer; |
@@ -300,12 +289,13 @@ struct rpcrdma_req { | |||
300 | struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; | 289 | struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; |
301 | struct rpcrdma_regbuf *rl_rdmabuf; | 290 | struct rpcrdma_regbuf *rl_rdmabuf; |
302 | struct rpcrdma_regbuf *rl_sendbuf; | 291 | struct rpcrdma_regbuf *rl_sendbuf; |
303 | struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; | ||
304 | struct rpcrdma_mr_seg *rl_nextseg; | ||
305 | 292 | ||
306 | struct ib_cqe rl_cqe; | 293 | struct ib_cqe rl_cqe; |
307 | struct list_head rl_all; | 294 | struct list_head rl_all; |
308 | bool rl_backchannel; | 295 | bool rl_backchannel; |
296 | |||
297 | struct list_head rl_registered; /* registered segments */ | ||
298 | struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; | ||
309 | }; | 299 | }; |
310 | 300 | ||
311 | static inline struct rpcrdma_req * | 301 | static inline struct rpcrdma_req * |
@@ -341,6 +331,11 @@ struct rpcrdma_buffer { | |||
341 | struct list_head rb_allreqs; | 331 | struct list_head rb_allreqs; |
342 | 332 | ||
343 | u32 rb_bc_max_requests; | 333 | u32 rb_bc_max_requests; |
334 | |||
335 | spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ | ||
336 | struct list_head rb_stale_mrs; | ||
337 | struct delayed_work rb_recovery_worker; | ||
338 | struct delayed_work rb_refresh_worker; | ||
344 | }; | 339 | }; |
345 | #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) | 340 | #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) |
346 | 341 | ||
@@ -387,6 +382,9 @@ struct rpcrdma_stats { | |||
387 | unsigned long bad_reply_count; | 382 | unsigned long bad_reply_count; |
388 | unsigned long nomsg_call_count; | 383 | unsigned long nomsg_call_count; |
389 | unsigned long bcall_count; | 384 | unsigned long bcall_count; |
385 | unsigned long mrs_recovered; | ||
386 | unsigned long mrs_orphaned; | ||
387 | unsigned long mrs_allocated; | ||
390 | }; | 388 | }; |
391 | 389 | ||
392 | /* | 390 | /* |
@@ -395,23 +393,25 @@ struct rpcrdma_stats { | |||
395 | struct rpcrdma_xprt; | 393 | struct rpcrdma_xprt; |
396 | struct rpcrdma_memreg_ops { | 394 | struct rpcrdma_memreg_ops { |
397 | int (*ro_map)(struct rpcrdma_xprt *, | 395 | int (*ro_map)(struct rpcrdma_xprt *, |
398 | struct rpcrdma_mr_seg *, int, bool); | 396 | struct rpcrdma_mr_seg *, int, bool, |
397 | struct rpcrdma_mw **); | ||
399 | void (*ro_unmap_sync)(struct rpcrdma_xprt *, | 398 | void (*ro_unmap_sync)(struct rpcrdma_xprt *, |
400 | struct rpcrdma_req *); | 399 | struct rpcrdma_req *); |
401 | void (*ro_unmap_safe)(struct rpcrdma_xprt *, | 400 | void (*ro_unmap_safe)(struct rpcrdma_xprt *, |
402 | struct rpcrdma_req *, bool); | 401 | struct rpcrdma_req *, bool); |
402 | void (*ro_recover_mr)(struct rpcrdma_mw *); | ||
403 | int (*ro_open)(struct rpcrdma_ia *, | 403 | int (*ro_open)(struct rpcrdma_ia *, |
404 | struct rpcrdma_ep *, | 404 | struct rpcrdma_ep *, |
405 | struct rpcrdma_create_data_internal *); | 405 | struct rpcrdma_create_data_internal *); |
406 | size_t (*ro_maxpages)(struct rpcrdma_xprt *); | 406 | size_t (*ro_maxpages)(struct rpcrdma_xprt *); |
407 | int (*ro_init)(struct rpcrdma_xprt *); | 407 | int (*ro_init_mr)(struct rpcrdma_ia *, |
408 | void (*ro_destroy)(struct rpcrdma_buffer *); | 408 | struct rpcrdma_mw *); |
409 | void (*ro_release_mr)(struct rpcrdma_mw *); | ||
409 | const char *ro_displayname; | 410 | const char *ro_displayname; |
410 | }; | 411 | }; |
411 | 412 | ||
412 | extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; | 413 | extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; |
413 | extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; | 414 | extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; |
414 | extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; | ||
415 | 415 | ||
416 | /* | 416 | /* |
417 | * RPCRDMA transport -- encapsulates the structures above for | 417 | * RPCRDMA transport -- encapsulates the structures above for |
@@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize; | |||
446 | */ | 446 | */ |
447 | int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); | 447 | int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); |
448 | void rpcrdma_ia_close(struct rpcrdma_ia *); | 448 | void rpcrdma_ia_close(struct rpcrdma_ia *); |
449 | bool frwr_is_supported(struct rpcrdma_ia *); | ||
450 | bool fmr_is_supported(struct rpcrdma_ia *); | ||
449 | 451 | ||
450 | /* | 452 | /* |
451 | * Endpoint calls - xprtrdma/verbs.c | 453 | * Endpoint calls - xprtrdma/verbs.c |
@@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); | |||
477 | void rpcrdma_recv_buffer_get(struct rpcrdma_req *); | 479 | void rpcrdma_recv_buffer_get(struct rpcrdma_req *); |
478 | void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); | 480 | void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); |
479 | 481 | ||
482 | void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); | ||
483 | |||
480 | struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, | 484 | struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, |
481 | size_t, gfp_t); | 485 | size_t, gfp_t); |
482 | void rpcrdma_free_regbuf(struct rpcrdma_ia *, | 486 | void rpcrdma_free_regbuf(struct rpcrdma_ia *, |
@@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, | |||
484 | 488 | ||
485 | int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); | 489 | int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); |
486 | 490 | ||
487 | int frwr_alloc_recovery_wq(void); | ||
488 | void frwr_destroy_recovery_wq(void); | ||
489 | |||
490 | int rpcrdma_alloc_wq(void); | 491 | int rpcrdma_alloc_wq(void); |
491 | void rpcrdma_destroy_wq(void); | 492 | void rpcrdma_destroy_wq(void); |
492 | 493 | ||
@@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void); | |||
494 | * Wrappers for chunk registration, shared by read/write chunk code. | 495 | * Wrappers for chunk registration, shared by read/write chunk code. |
495 | */ | 496 | */ |
496 | 497 | ||
497 | void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); | ||
498 | |||
499 | static inline enum dma_data_direction | 498 | static inline enum dma_data_direction |
500 | rpcrdma_data_dir(bool writing) | 499 | rpcrdma_data_dir(bool writing) |
501 | { | 500 | { |
502 | return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; | 501 | return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; |
503 | } | 502 | } |
504 | 503 | ||
505 | static inline void | ||
506 | rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, | ||
507 | enum dma_data_direction direction) | ||
508 | { | ||
509 | seg->mr_dir = direction; | ||
510 | seg->mr_dmalen = seg->mr_len; | ||
511 | |||
512 | if (seg->mr_page) | ||
513 | seg->mr_dma = ib_dma_map_page(device, | ||
514 | seg->mr_page, offset_in_page(seg->mr_offset), | ||
515 | seg->mr_dmalen, seg->mr_dir); | ||
516 | else | ||
517 | seg->mr_dma = ib_dma_map_single(device, | ||
518 | seg->mr_offset, | ||
519 | seg->mr_dmalen, seg->mr_dir); | ||
520 | |||
521 | if (ib_dma_mapping_error(device, seg->mr_dma)) | ||
522 | rpcrdma_mapping_error(seg); | ||
523 | } | ||
524 | |||
525 | static inline void | ||
526 | rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) | ||
527 | { | ||
528 | if (seg->mr_page) | ||
529 | ib_dma_unmap_page(device, | ||
530 | seg->mr_dma, seg->mr_dmalen, seg->mr_dir); | ||
531 | else | ||
532 | ib_dma_unmap_single(device, | ||
533 | seg->mr_dma, seg->mr_dmalen, seg->mr_dir); | ||
534 | } | ||
535 | |||
536 | /* | 504 | /* |
537 | * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c | 505 | * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c |
538 | */ | 506 | */ |
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7e2b2fa189c3..111767ab124a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c | |||
@@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = { | |||
124 | .mode = 0644, | 124 | .mode = 0644, |
125 | .proc_handler = proc_dointvec_minmax, | 125 | .proc_handler = proc_dointvec_minmax, |
126 | .extra1 = &xprt_min_resvport_limit, | 126 | .extra1 = &xprt_min_resvport_limit, |
127 | .extra2 = &xprt_max_resvport_limit | 127 | .extra2 = &xprt_max_resvport |
128 | }, | 128 | }, |
129 | { | 129 | { |
130 | .procname = "max_resvport", | 130 | .procname = "max_resvport", |
@@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = { | |||
132 | .maxlen = sizeof(unsigned int), | 132 | .maxlen = sizeof(unsigned int), |
133 | .mode = 0644, | 133 | .mode = 0644, |
134 | .proc_handler = proc_dointvec_minmax, | 134 | .proc_handler = proc_dointvec_minmax, |
135 | .extra1 = &xprt_min_resvport_limit, | 135 | .extra1 = &xprt_min_resvport, |
136 | .extra2 = &xprt_max_resvport_limit | 136 | .extra2 = &xprt_max_resvport_limit |
137 | }, | 137 | }, |
138 | { | 138 | { |
@@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task) | |||
642 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | 642 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); |
643 | struct xdr_buf *xdr = &req->rq_snd_buf; | 643 | struct xdr_buf *xdr = &req->rq_snd_buf; |
644 | bool zerocopy = true; | 644 | bool zerocopy = true; |
645 | bool vm_wait = false; | ||
645 | int status; | 646 | int status; |
646 | int sent; | 647 | int sent; |
647 | 648 | ||
@@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task) | |||
677 | return 0; | 678 | return 0; |
678 | } | 679 | } |
679 | 680 | ||
681 | WARN_ON_ONCE(sent == 0 && status == 0); | ||
682 | |||
683 | if (status == -EAGAIN ) { | ||
684 | /* | ||
685 | * Return EAGAIN if we're sure we're hitting the | ||
686 | * socket send buffer limits. | ||
687 | */ | ||
688 | if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) | ||
689 | break; | ||
690 | /* | ||
691 | * Did we hit a memory allocation failure? | ||
692 | */ | ||
693 | if (sent == 0) { | ||
694 | status = -ENOBUFS; | ||
695 | if (vm_wait) | ||
696 | break; | ||
697 | /* Retry, knowing now that we're below the | ||
698 | * socket send buffer limit | ||
699 | */ | ||
700 | vm_wait = true; | ||
701 | } | ||
702 | continue; | ||
703 | } | ||
680 | if (status < 0) | 704 | if (status < 0) |
681 | break; | 705 | break; |
682 | if (sent == 0) { | 706 | vm_wait = false; |
683 | status = -EAGAIN; | ||
684 | break; | ||
685 | } | ||
686 | } | 707 | } |
687 | if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) | ||
688 | status = -ENOBUFS; | ||
689 | 708 | ||
690 | switch (status) { | 709 | switch (status) { |
691 | case -ENOTSOCK: | 710 | case -ENOTSOCK: |
@@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s | |||
755 | sk->sk_error_report = transport->old_error_report; | 774 | sk->sk_error_report = transport->old_error_report; |
756 | } | 775 | } |
757 | 776 | ||
777 | static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) | ||
778 | { | ||
779 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); | ||
780 | |||
781 | clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); | ||
782 | } | ||
783 | |||
758 | static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) | 784 | static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) |
759 | { | 785 | { |
760 | smp_mb__before_atomic(); | 786 | smp_mb__before_atomic(); |
761 | clear_bit(XPRT_CLOSE_WAIT, &xprt->state); | 787 | clear_bit(XPRT_CLOSE_WAIT, &xprt->state); |
762 | clear_bit(XPRT_CLOSING, &xprt->state); | 788 | clear_bit(XPRT_CLOSING, &xprt->state); |
789 | xs_sock_reset_state_flags(xprt); | ||
763 | smp_mb__after_atomic(); | 790 | smp_mb__after_atomic(); |
764 | } | 791 | } |
765 | 792 | ||
@@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport) | |||
962 | goto out; | 989 | goto out; |
963 | for (;;) { | 990 | for (;;) { |
964 | skb = skb_recv_datagram(sk, 0, 1, &err); | 991 | skb = skb_recv_datagram(sk, 0, 1, &err); |
965 | if (skb == NULL) | 992 | if (skb != NULL) { |
993 | xs_local_data_read_skb(&transport->xprt, sk, skb); | ||
994 | skb_free_datagram(sk, skb); | ||
995 | continue; | ||
996 | } | ||
997 | if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) | ||
966 | break; | 998 | break; |
967 | xs_local_data_read_skb(&transport->xprt, sk, skb); | ||
968 | skb_free_datagram(sk, skb); | ||
969 | } | 999 | } |
970 | out: | 1000 | out: |
971 | mutex_unlock(&transport->recv_mutex); | 1001 | mutex_unlock(&transport->recv_mutex); |
@@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport) | |||
1043 | goto out; | 1073 | goto out; |
1044 | for (;;) { | 1074 | for (;;) { |
1045 | skb = skb_recv_datagram(sk, 0, 1, &err); | 1075 | skb = skb_recv_datagram(sk, 0, 1, &err); |
1046 | if (skb == NULL) | 1076 | if (skb != NULL) { |
1077 | xs_udp_data_read_skb(&transport->xprt, sk, skb); | ||
1078 | skb_free_datagram(sk, skb); | ||
1079 | continue; | ||
1080 | } | ||
1081 | if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) | ||
1047 | break; | 1082 | break; |
1048 | xs_udp_data_read_skb(&transport->xprt, sk, skb); | ||
1049 | skb_free_datagram(sk, skb); | ||
1050 | } | 1083 | } |
1051 | out: | 1084 | out: |
1052 | mutex_unlock(&transport->recv_mutex); | 1085 | mutex_unlock(&transport->recv_mutex); |
@@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk) | |||
1074 | if (xprt != NULL) { | 1107 | if (xprt != NULL) { |
1075 | struct sock_xprt *transport = container_of(xprt, | 1108 | struct sock_xprt *transport = container_of(xprt, |
1076 | struct sock_xprt, xprt); | 1109 | struct sock_xprt, xprt); |
1077 | queue_work(rpciod_workqueue, &transport->recv_worker); | 1110 | transport->old_data_ready(sk); |
1111 | /* Any data means we had a useful conversation, so | ||
1112 | * then we don't need to delay the next reconnect | ||
1113 | */ | ||
1114 | if (xprt->reestablish_timeout) | ||
1115 | xprt->reestablish_timeout = 0; | ||
1116 | if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) | ||
1117 | queue_work(xprtiod_workqueue, &transport->recv_worker); | ||
1078 | } | 1118 | } |
1079 | read_unlock_bh(&sk->sk_callback_lock); | 1119 | read_unlock_bh(&sk->sk_callback_lock); |
1080 | } | 1120 | } |
@@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) | |||
1474 | for (;;) { | 1514 | for (;;) { |
1475 | lock_sock(sk); | 1515 | lock_sock(sk); |
1476 | read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); | 1516 | read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); |
1477 | release_sock(sk); | 1517 | if (read <= 0) { |
1478 | if (read <= 0) | 1518 | clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); |
1479 | break; | 1519 | release_sock(sk); |
1480 | total += read; | 1520 | if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) |
1521 | break; | ||
1522 | } else { | ||
1523 | release_sock(sk); | ||
1524 | total += read; | ||
1525 | } | ||
1481 | rd_desc.count = 65536; | 1526 | rd_desc.count = 65536; |
1482 | } | 1527 | } |
1483 | out: | 1528 | out: |
@@ -1493,34 +1538,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work) | |||
1493 | } | 1538 | } |
1494 | 1539 | ||
1495 | /** | 1540 | /** |
1496 | * xs_tcp_data_ready - "data ready" callback for TCP sockets | ||
1497 | * @sk: socket with data to read | ||
1498 | * | ||
1499 | */ | ||
1500 | static void xs_tcp_data_ready(struct sock *sk) | ||
1501 | { | ||
1502 | struct sock_xprt *transport; | ||
1503 | struct rpc_xprt *xprt; | ||
1504 | |||
1505 | dprintk("RPC: xs_tcp_data_ready...\n"); | ||
1506 | |||
1507 | read_lock_bh(&sk->sk_callback_lock); | ||
1508 | if (!(xprt = xprt_from_sock(sk))) | ||
1509 | goto out; | ||
1510 | transport = container_of(xprt, struct sock_xprt, xprt); | ||
1511 | |||
1512 | /* Any data means we had a useful conversation, so | ||
1513 | * then we don't need to delay the next reconnect | ||
1514 | */ | ||
1515 | if (xprt->reestablish_timeout) | ||
1516 | xprt->reestablish_timeout = 0; | ||
1517 | queue_work(rpciod_workqueue, &transport->recv_worker); | ||
1518 | |||
1519 | out: | ||
1520 | read_unlock_bh(&sk->sk_callback_lock); | ||
1521 | } | ||
1522 | |||
1523 | /** | ||
1524 | * xs_tcp_state_change - callback to handle TCP socket state changes | 1541 | * xs_tcp_state_change - callback to handle TCP socket state changes |
1525 | * @sk: socket whose state has changed | 1542 | * @sk: socket whose state has changed |
1526 | * | 1543 | * |
@@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) | |||
1714 | 1731 | ||
1715 | static unsigned short xs_get_random_port(void) | 1732 | static unsigned short xs_get_random_port(void) |
1716 | { | 1733 | { |
1717 | unsigned short range = xprt_max_resvport - xprt_min_resvport; | 1734 | unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; |
1718 | unsigned short rand = (unsigned short) prandom_u32() % range; | 1735 | unsigned short rand = (unsigned short) prandom_u32() % range; |
1719 | return rand + xprt_min_resvport; | 1736 | return rand + xprt_min_resvport; |
1720 | } | 1737 | } |
@@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) | |||
2241 | xs_save_old_callbacks(transport, sk); | 2258 | xs_save_old_callbacks(transport, sk); |
2242 | 2259 | ||
2243 | sk->sk_user_data = xprt; | 2260 | sk->sk_user_data = xprt; |
2244 | sk->sk_data_ready = xs_tcp_data_ready; | 2261 | sk->sk_data_ready = xs_data_ready; |
2245 | sk->sk_state_change = xs_tcp_state_change; | 2262 | sk->sk_state_change = xs_tcp_state_change; |
2246 | sk->sk_write_space = xs_tcp_write_space; | 2263 | sk->sk_write_space = xs_tcp_write_space; |
2247 | sock_set_flag(sk, SOCK_FASYNC); | 2264 | sock_set_flag(sk, SOCK_FASYNC); |
@@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) | |||
2380 | /* Start by resetting any existing state */ | 2397 | /* Start by resetting any existing state */ |
2381 | xs_reset_transport(transport); | 2398 | xs_reset_transport(transport); |
2382 | 2399 | ||
2383 | queue_delayed_work(rpciod_workqueue, | 2400 | queue_delayed_work(xprtiod_workqueue, |
2384 | &transport->connect_worker, | 2401 | &transport->connect_worker, |
2385 | xprt->reestablish_timeout); | 2402 | xprt->reestablish_timeout); |
2386 | xprt->reestablish_timeout <<= 1; | 2403 | xprt->reestablish_timeout <<= 1; |
@@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) | |||
2390 | xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; | 2407 | xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; |
2391 | } else { | 2408 | } else { |
2392 | dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); | 2409 | dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); |
2393 | queue_delayed_work(rpciod_workqueue, | 2410 | queue_delayed_work(xprtiod_workqueue, |
2394 | &transport->connect_worker, 0); | 2411 | &transport->connect_worker, 0); |
2395 | } | 2412 | } |
2396 | } | 2413 | } |
@@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val, | |||
3153 | 3170 | ||
3154 | static int param_set_portnr(const char *val, const struct kernel_param *kp) | 3171 | static int param_set_portnr(const char *val, const struct kernel_param *kp) |
3155 | { | 3172 | { |
3156 | return param_set_uint_minmax(val, kp, | 3173 | if (kp->arg == &xprt_min_resvport) |
3174 | return param_set_uint_minmax(val, kp, | ||
3157 | RPC_MIN_RESVPORT, | 3175 | RPC_MIN_RESVPORT, |
3176 | xprt_max_resvport); | ||
3177 | return param_set_uint_minmax(val, kp, | ||
3178 | xprt_min_resvport, | ||
3158 | RPC_MAX_RESVPORT); | 3179 | RPC_MAX_RESVPORT); |
3159 | } | 3180 | } |
3160 | 3181 | ||