aboutsummaryrefslogtreecommitdiffstats
path: root/fs/nfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/Kconfig15
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/blocklayout/Makefile5
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1024
-rw-r--r--fs/nfs/blocklayout/blocklayout.h205
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c391
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c111
-rw-r--r--fs/nfs/blocklayout/extents.c935
-rw-r--r--fs/nfs/cache_lib.c9
-rw-r--r--fs/nfs/cache_lib.h2
-rw-r--r--fs/nfs/callback.h2
-rw-r--r--fs/nfs/callback_proc.c84
-rw-r--r--fs/nfs/callback_xdr.c24
-rw-r--r--fs/nfs/client.c18
-rw-r--r--fs/nfs/delegation.c16
-rw-r--r--fs/nfs/dir.c146
-rw-r--r--fs/nfs/direct.c6
-rw-r--r--fs/nfs/file.c50
-rw-r--r--fs/nfs/idmap.c32
-rw-r--r--fs/nfs/inode.c22
-rw-r--r--fs/nfs/internal.h13
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3acl.c12
-rw-r--r--fs/nfs/nfs3proc.c7
-rw-r--r--fs/nfs/nfs4_fs.h25
-rw-r--r--fs/nfs/nfs4filelayout.c101
-rw-r--r--fs/nfs/nfs4filelayout.h17
-rw-r--r--fs/nfs/nfs4filelayoutdev.c452
-rw-r--r--fs/nfs/nfs4proc.c464
-rw-r--r--fs/nfs/nfs4renewd.c12
-rw-r--r--fs/nfs/nfs4state.c35
-rw-r--r--fs/nfs/nfs4xdr.c511
-rw-r--r--fs/nfs/objlayout/objio_osd.c48
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c3
-rw-r--r--fs/nfs/pagelist.c73
-rw-r--r--fs/nfs/pnfs.c359
-rw-r--r--fs/nfs/pnfs.h107
-rw-r--r--fs/nfs/pnfs_dev.c64
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c186
-rw-r--r--fs/nfs/super.c82
-rw-r--r--fs/nfs/unlink.c37
-rw-r--r--fs/nfs/write.c249
43 files changed, 5047 insertions, 911 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 81515545ba7..dbcd82126ae 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -77,6 +77,7 @@ config NFS_V4
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select SUNRPC_BACKCHANNEL
80 select PNFS_FILE_LAYOUT 81 select PNFS_FILE_LAYOUT
81 help 82 help
82 This option enables support for minor version 1 of the NFSv4 protocol 83 This option enables support for minor version 1 of the NFSv4 protocol
@@ -87,15 +88,15 @@ config NFS_V4_1
87config PNFS_FILE_LAYOUT 88config PNFS_FILE_LAYOUT
88 tristate 89 tristate
89 90
91config PNFS_BLOCK
92 tristate
93 depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM
94 default m
95
90config PNFS_OBJLAYOUT 96config PNFS_OBJLAYOUT
91 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" 97 tristate
92 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD 98 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
93 help 99 default m
94 Say M here if you want your pNFS client to support the Objects Layout Driver.
95 Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
96 upper level driver (SCSI_OSD_ULD).
97
98 If unsure, say N.
99 100
100config ROOT_NFS 101config ROOT_NFS
101 bool "Root file system on NFS" 102 bool "Root file system on NFS"
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6a34f7dd0e6..b58613d0abb 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
24 24
25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
26obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 00000000000..d5815505c02
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS block layout driver kernel module
3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 00000000000..281ae95932c
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1024 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/module.h>
34#include <linux/init.h>
35#include <linux/mount.h>
36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h>
40
41#include "blocklayout.h"
42
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44
45MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
47MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
48
49struct dentry *bl_device_pipe;
50wait_queue_head_t bl_wq;
51
/* Debugging aid: dump the interesting state bits of @page to the NFS
 * debug log (NFSDBG_PNFS_LD facility).  No side effects on the page.
 */
static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);
	dprintk(" PagePrivate %d\n", PagePrivate(page));
	dprintk(" PageUptodate %d\n", PageUptodate(page));
	dprintk(" PageError %d\n", PageError(page));
	dprintk(" PageDirty %d\n", PageDirty(page));
	dprintk(" PageReferenced %d\n", PageReferenced(page));
	dprintk(" PageLocked %d\n", PageLocked(page));
	dprintk(" PageWriteback %d\n", PageWriteback(page));
	dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
	dprintk("\n");
}
65
66/* Given the be associated with isect, determine if page data needs to be
67 * initialized.
68 */
69static int is_hole(struct pnfs_block_extent *be, sector_t isect)
70{
71 if (be->be_state == PNFS_BLOCK_NONE_DATA)
72 return 1;
73 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
74 return 0;
75 else
76 return !bl_is_sector_init(be->be_inval, isect);
77}
78
79/* Given the be associated with isect, determine if page data can be
80 * written to disk.
81 */
82static int is_writable(struct pnfs_block_extent *be, sector_t isect)
83{
84 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
85 be->be_state == PNFS_BLOCK_INVALID_DATA);
86}
87
/* The data we are handed might be spread across several bios. We need
 * to track when the last one is finished.
 *
 * Lifecycle: alloc_parallel() creates the object with one reference;
 * bl_submit_bio() takes one extra reference per submitted bio, and each
 * bio completion handler drops one.  When the count reaches zero,
 * destroy_parallel() invokes pnfs_callback(data) and frees the object.
 */
struct parallel_io {
	struct kref refcnt;			/* 1 initial + 1 per in-flight bio */
	struct rpc_call_ops call_ops;		/* copy of mds_ops with rpc_call_done stubbed */
	void (*pnfs_callback) (void *data);	/* run once when the last bio finishes */
	void *data;				/* the nfs_read_data / nfs_write_data served */
};

/* Allocate a parallel_io tracking @data, with its refcount at 1.
 * Caller still has to fill in call_ops and pnfs_callback.
 * Returns NULL on allocation failure.
 */
static inline struct parallel_io *alloc_parallel(void *data)
{
	struct parallel_io *rv;

	rv = kmalloc(sizeof(*rv), GFP_NOFS);
	if (rv) {
		rv->data = data;
		kref_init(&rv->refcnt);
	}
	return rv;
}

/* Take an additional reference (one per bio being submitted). */
static inline void get_parallel(struct parallel_io *p)
{
	kref_get(&p->refcnt);
}

/* kref release: all bios are done, fire the completion callback and free. */
static void destroy_parallel(struct kref *kref)
{
	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);

	dprintk("%s enter\n", __func__);
	p->pnfs_callback(p->data);
	kfree(p);
}

/* Drop a reference; the last put triggers destroy_parallel(). */
static inline void put_parallel(struct parallel_io *p)
{
	kref_put(&p->refcnt, destroy_parallel);
}
128
/* Submit @bio (if non-NULL) for @rw I/O and always return NULL so the
 * caller can reset its bio pointer in one step.  An extra reference is
 * taken on the owning parallel_io before submission; it is dropped by
 * the bio's end_io handler.
 */
static struct bio *
bl_submit_bio(int rw, struct bio *bio)
{
	if (bio) {
		get_parallel(bio->bi_private);
		dprintk("%s submitting %s bio %u@%llu\n", __func__,
			rw == READ ? "read" : "write",
			bio->bi_size, (unsigned long long)bio->bi_sector);
		submit_bio(rw, bio);
	}
	return NULL;
}
141
142static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
143 struct pnfs_block_extent *be,
144 void (*end_io)(struct bio *, int err),
145 struct parallel_io *par)
146{
147 struct bio *bio;
148
149 bio = bio_alloc(GFP_NOIO, npg);
150 if (!bio)
151 return NULL;
152
153 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
154 bio->bi_bdev = be->be_mdev;
155 bio->bi_end_io = end_io;
156 bio->bi_private = par;
157 return bio;
158}
159
160static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
161 sector_t isect, struct page *page,
162 struct pnfs_block_extent *be,
163 void (*end_io)(struct bio *, int err),
164 struct parallel_io *par)
165{
166retry:
167 if (!bio) {
168 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
169 if (!bio)
170 return ERR_PTR(-ENOMEM);
171 }
172 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
173 bio = bl_submit_bio(rw, bio);
174 goto retry;
175 }
176 return bio;
177}
178
/* Read-bio completion handler; the page walk is basically copied from
 * mpage_end_io_read.  Marks every page in the bio uptodate on success;
 * on failure records -EIO in the nfs_read_data (first error wins) and
 * fails the layout segment so further I/O goes through the MDS.
 * Runs in bio completion context, so real work is deferred via the
 * parallel_io callback.
 */
static void bl_end_io_read(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;

	/* Walk the bvec array back to front, prefetching the next entry */
	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (uptodate)
			SetPageUptodate(page);
	} while (bvec >= bio->bi_io_vec);
	if (!uptodate) {
		if (!rdata->pnfs_error)
			rdata->pnfs_error = -EIO;
		pnfs_set_lo_fail(rdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Workqueue body: complete the pNFS read once all bios have finished.
 * Recovers the nfs_read_data from the rpc_task embedded work_struct.
 */
static void bl_read_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_read_data *rdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	rdata = container_of(task, struct nfs_read_data, task);
	pnfs_ld_read_done(rdata);
}

/* parallel_io callback for reads: defer pnfs_ld_read_done() to a
 * workqueue rather than running it from bio completion context.
 */
static void
bl_end_par_io_read(void *data)
{
	struct nfs_read_data *rdata = data;

	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
	schedule_work(&rdata->task.u.tk_work);
}

/* We don't want normal .rpc_call_done callback used, so we replace it
 * with this stub.
 */
static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
{
	return;
}
230
/* Service a pNFS read by mapping each page of the request onto the
 * extents of the layout segment and issuing bios directly to the block
 * device(s).  Holes (see is_hole()) are zero-filled without touching
 * the device; a copy-on-write read extent, when present, supplies data
 * for holes in the writable extent.
 *
 * Returns PNFS_NOT_ATTEMPTED (fall back to the MDS) only if the
 * parallel_io tracker cannot be allocated; after that point any error
 * is reported through rdata->pnfs_error and PNFS_ATTEMPTED is returned.
 */
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
	int i, hole;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, extent_length = 0;
	struct parallel_io *par;
	loff_t f_offset = rdata->args.offset;
	size_t count = rdata->args.count;
	struct page **pages = rdata->args.pages;
	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;

	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
	       rdata->npages, f_offset, count);

	par = alloc_parallel(rdata);
	if (!par)
		goto use_mds;
	/* Keep the MDS callbacks but neuter rpc_call_done */
	par->call_ops = *rdata->mds_ops;
	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
	par->pnfs_callback = bl_end_par_io_read;
	/* At this point, we can no longer jump to use_mds */

	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
	/* Code assumes extents are page-aligned */
	for (i = pg_index; i < rdata->npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bl_put_extent(cow_read);
			bio = bl_submit_bio(READ, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
					     isect, &cow_read);
			if (!be) {
				rdata->pnfs_error = -EIO;
				goto out;
			}
			extent_length = be->be_length -
				(isect - be->be_f_offset);
			if (cow_read) {
				/* Run only as far as both extents reach */
				sector_t cow_length = cow_read->be_length -
					(isect - cow_read->be_f_offset);
				extent_length = min(extent_length, cow_length);
			}
		}
		hole = is_hole(be, isect);
		if (hole && !cow_read) {
			bio = bl_submit_bio(READ, bio);
			/* Fill hole w/ zeroes w/o accessing device */
			dprintk("%s Zeroing page for hole\n", __func__);
			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
			print_page(pages[i]);
			SetPageUptodate(pages[i]);
		} else {
			struct pnfs_block_extent *be_read;

			/* Holes backed by a COW extent read from there */
			be_read = (hole && cow_read) ? cow_read : be;
			bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
						 isect, pages[i], be_read,
						 bl_end_io_read, par);
			if (IS_ERR(bio)) {
				rdata->pnfs_error = PTR_ERR(bio);
				bio = NULL;
				goto out;
			}
		}
		isect += PAGE_CACHE_SECTORS;
		extent_length -= PAGE_CACHE_SECTORS;
	}
	/* Report EOF / short-read accounting back to the generic layer */
	if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
		rdata->res.eof = 1;
		rdata->res.count = rdata->inode->i_size - f_offset;
	} else {
		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
	}
out:
	bl_put_extent(be);
	bl_put_extent(cow_read);
	bl_submit_bio(READ, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;

 use_mds:
	dprintk("Giving up and using normal NFS\n");
	return PNFS_NOT_ATTEMPTED;
}
319
/* Record [offset, offset+count) as written, for the benefit of a later
 * LAYOUTCOMMIT.  The range is widened to page boundaries; every
 * covering extent that is still INVALID is marked for commit.
 */
static void mark_extents_written(struct pnfs_block_layout *bl,
				 __u64 offset, __u32 count)
{
	sector_t isect, end;
	struct pnfs_block_extent *be;

	dprintk("%s(%llu, %u)\n", __func__, offset, count);
	if (count == 0)
		return;
	/* Round start down and end up to page boundaries, in sectors */
	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
	end >>= SECTOR_SHIFT;
	while (isect < end) {
		sector_t len;
		be = bl_find_get_extent(bl, isect, NULL);
		BUG_ON(!be); /* FIXME */
		/* Advance at most to the end of this extent */
		len = min(end, be->be_f_offset + be->be_length) - isect;
		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
			bl_mark_for_commit(be, isect, len); /* What if fails? */
		isect += len;
		bl_put_extent(be);
	}
}

/* Completion handler for the extra zeroing pages bl_write_pagelist adds
 * around an unaligned write into an INVALID extent.  Ends writeback and
 * drops the page reference taken by find_or_create_page(); on error the
 * first failure is recorded and the layout segment is failed.
 */
static void bl_end_io_write_zero(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		/* This is the zeroing page we added */
		end_page_writeback(page);
		page_cache_release(page);
	} while (bvec >= bio->bi_io_vec);
	if (!uptodate) {
		if (!wdata->pnfs_error)
			wdata->pnfs_error = -EIO;
		pnfs_set_lo_fail(wdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Completion handler for ordinary data-page write bios (pages owned by
 * the nfs_write_data, so no per-page release here).  Records the first
 * error and fails the layout segment on failure.
 */
static void bl_end_io_write(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;

	if (!uptodate) {
		if (!wdata->pnfs_error)
			wdata->pnfs_error = -EIO;
		pnfs_set_lo_fail(wdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Function scheduled for call during bl_end_par_io_write,
 * it marks sectors as written and extends the commitlist.
 */
static void bl_write_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_write_data *wdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	wdata = container_of(task, struct nfs_write_data, task);
	if (!wdata->pnfs_error) {
		/* Marks for LAYOUTCOMMIT */
		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
				     wdata->args.offset, wdata->args.count);
	}
	pnfs_ld_write_done(wdata);
}

/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data)
{
	struct nfs_write_data *wdata = data;

	/* Data reached stable storage directly: report FILE_SYNC */
	wdata->task.tk_status = 0;
	wdata->verf.committed = NFS_FILE_SYNC;
	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
	schedule_work(&wdata->task.u.tk_work);
}

/* FIXME STUB - mark intersection of layout and page as bad, so is not
 * used again.
 */
static void mark_bad_read(void)
{
	return;
}
421
/*
 * map_block: map a requested I/0 block (isect) into an offset in the LVM
 * block_device
 */
static void
map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
{
	dprintk("%s enter be=%p\n", __func__, be);

	set_buffer_mapped(bh);
	bh->b_bdev = be->be_mdev;
	/* Convert the volume-relative sector into the device's block size */
	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);

	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
				__func__, (unsigned long long)isect, (long)bh->b_blocknr,
				bh->b_size);
	return;
}

/* Given an unmapped page, zero it or read in page for COW, page is locked
 * by caller.
 *
 * If @cow_read is NULL the page is simply zero-filled; otherwise its
 * contents are read synchronously from the COW source extent via a
 * temporary buffer_head.  The reference on @cow_read is consumed here.
 * Returns 0 on success or a negative errno; on error the layout should
 * stop being used for reads (mark_bad_read() is still a stub).
 */
static int
init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
{
	struct buffer_head *bh = NULL;
	int ret = 0;
	sector_t isect;

	dprintk("%s enter, %p\n", __func__, page);
	BUG_ON(PageUptodate(page));
	if (!cow_read) {
		zero_user_segment(page, 0, PAGE_SIZE);
		SetPageUptodate(page);
		goto cleanup;
	}

	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
	if (!bh) {
		ret = -ENOMEM;
		goto cleanup;
	}

	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
	map_block(bh, isect, cow_read);
	/* Synchronous read of the COW source block */
	if (!bh_uptodate_or_lock(bh))
		ret = bh_submit_read(bh);
	if (ret)
		goto cleanup;
	SetPageUptodate(page);

cleanup:
	bl_put_extent(cow_read);
	if (bh)
		free_buffer_head(bh);
	if (ret) {
		/* Need to mark layout with bad read...should now
		 * just use nfs4 for reads and writes.
		 */
		mark_bad_read();
	}
	return ret;
}
486
/* Service a pNFS write by issuing bios directly to the block device(s)
 * described by the layout.  When the target extent is INVALID
 * (uninitialized), the partial server blocks surrounding the first and
 * last page of the request are zeroed or COW-read first (the
 * fill_invalid_ext section below, entered twice via the @last flag),
 * so a later LAYOUTCOMMIT exposes whole valid blocks.
 *
 * Returns PNFS_NOT_ATTEMPTED (fall back to the MDS) only if the
 * parallel_io tracker cannot be allocated; after that point any error
 * is reported through wdata->pnfs_error and PNFS_ATTEMPTED is returned.
 */
static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
	int i, ret, npg_zero, pg_index, last = 0;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, last_isect = 0, extent_length = 0;
	struct parallel_io *par;
	loff_t offset = wdata->args.offset;
	size_t count = wdata->args.count;
	struct page **pages = wdata->args.pages;
	struct page *page;
	pgoff_t index;
	u64 temp;
	int npg_per_block =
	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;

	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
	 * We want to write each, and if there is an error set pnfs_error
	 * to have it redone using nfs.
	 */
	par = alloc_parallel(wdata);
	if (!par)
		return PNFS_NOT_ATTEMPTED;
	par->call_ops = *wdata->mds_ops;
	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
	par->pnfs_callback = bl_end_par_io_write;
	/* At this point, have to be more careful with error handling */

	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
	if (!be || !is_writable(be, isect)) {
		dprintk("%s no matching extents!\n", __func__);
		wdata->pnfs_error = -EINVAL;
		goto out;
	}

	/* First page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		/* npg_zero = pages between block start and write offset */
		temp = offset >> PAGE_CACHE_SHIFT;
		npg_zero = do_div(temp, npg_per_block);
		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
		extent_length = be->be_length - (isect - be->be_f_offset);

fill_invalid_ext:
		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
		for (;npg_zero > 0; npg_zero--) {
			if (bl_is_sector_init(be->be_inval, isect)) {
				dprintk("isect %llu already init\n",
					(unsigned long long)isect);
				goto next_page;
			}
			/* page ref released in bl_end_io_write_zero */
			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
			dprintk("%s zero %dth page: index %lu isect %llu\n",
				__func__, npg_zero, index,
				(unsigned long long)isect);
			page =
			    find_or_create_page(wdata->inode->i_mapping, index,
						GFP_NOFS);
			if (!page) {
				dprintk("%s oom\n", __func__);
				wdata->pnfs_error = -ENOMEM;
				goto out;
			}

			/* PageDirty: Other will write this out
			 * PageWriteback: Other is writing this out
			 * PageUptodate: It was read before
			 * sector_initialized: already written out
			 */
			if (PageDirty(page) || PageWriteback(page)) {
				print_page(page);
				unlock_page(page);
				page_cache_release(page);
				goto next_page;
			}
			if (!PageUptodate(page)) {
				/* New page, readin or zero it */
				init_page_for_write(page, cow_read);
			}
			set_page_writeback(page);
			unlock_page(page);

			ret = bl_mark_sectors_init(be->be_inval, isect,
						       PAGE_CACHE_SECTORS,
						       NULL);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				end_page_writeback(page);
				page_cache_release(page);
				wdata->pnfs_error = ret;
				goto out;
			}
			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
						 isect, page, be,
						 bl_end_io_write_zero, par);
			if (IS_ERR(bio)) {
				wdata->pnfs_error = PTR_ERR(bio);
				bio = NULL;
				goto out;
			}
			/* FIXME: This should be done in bi_end_io */
			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
					     page->index << PAGE_CACHE_SHIFT,
					     PAGE_CACHE_SIZE);
next_page:
			isect += PAGE_CACHE_SECTORS;
			extent_length -= PAGE_CACHE_SECTORS;
		}
		if (last)
			goto write_done;
	}
	bio = bl_submit_bio(WRITE, bio);

	/* Middle pages */
	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
	for (i = pg_index; i < wdata->npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bio = bl_submit_bio(WRITE, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
					     isect, NULL);
			if (!be || !is_writable(be, isect)) {
				wdata->pnfs_error = -EINVAL;
				goto out;
			}
			extent_length = be->be_length -
			    (isect - be->be_f_offset);
		}
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			ret = bl_mark_sectors_init(be->be_inval, isect,
						       PAGE_CACHE_SECTORS,
						       NULL);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				wdata->pnfs_error = ret;
				goto out;
			}
		}
		bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
					 isect, pages[i], be,
					 bl_end_io_write, par);
		if (IS_ERR(bio)) {
			wdata->pnfs_error = PTR_ERR(bio);
			bio = NULL;
			goto out;
		}
		isect += PAGE_CACHE_SECTORS;
		last_isect = isect;
		extent_length -= PAGE_CACHE_SECTORS;
	}

	/* Last page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		bio = bl_submit_bio(WRITE, bio);
		/* npg_zero = pages from write end to the block boundary */
		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
		npg_zero = npg_per_block - do_div(temp, npg_per_block);
		if (npg_zero < npg_per_block) {
			last = 1;
			goto fill_invalid_ext;
		}
	}

write_done:
	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
	if (count < wdata->res.count) {
		wdata->res.count = count;
	}
out:
	bl_put_extent(be);
	bl_submit_bio(WRITE, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;
}
668
/* FIXME - range ignored */
/* Drop every extent on all of the layout's extent lists; each
 * list_del + bl_put_extent releases the list's reference.
 */
static void
release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
{
	int i;
	struct pnfs_block_extent *be;

	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		while (!list_empty(&bl->bl_extents[i])) {
			be = list_first_entry(&bl->bl_extents[i],
					      struct pnfs_block_extent,
					      be_node);
			list_del(&be->be_node);
			bl_put_extent(be);
		}
	}
	spin_unlock(&bl->bl_ext_lock);
}

/* Free the invalid-sector tracking entries hanging off @marks. */
static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
	struct pnfs_inval_tracking *pos, *temp;

	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
		list_del(&pos->it_link);
		kfree(pos);
	}
	return;
}

/* Layoutdriver op: tear down a block layout header allocated by
 * bl_alloc_layout_hdr, releasing all extents and inval markings.
 */
static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	release_extents(bl, NULL);
	release_inval_marks(&bl->bl_inval);
	kfree(bl);
}

/* Layoutdriver op: allocate a block layout header for @inode; the
 * generic pnfs_layout_hdr is embedded in pnfs_block_layout.
 * Returns NULL on allocation failure.
 */
static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
						   gfp_t gfp_flags)
{
	struct pnfs_block_layout *bl;

	dprintk("%s enter\n", __func__);
	bl = kzalloc(sizeof(*bl), gfp_flags);
	if (!bl)
		return NULL;
	spin_lock_init(&bl->bl_ext_lock);
	INIT_LIST_HEAD(&bl->bl_extents[0]);
	INIT_LIST_HEAD(&bl->bl_extents[1]);
	INIT_LIST_HEAD(&bl->bl_commit);
	INIT_LIST_HEAD(&bl->bl_committing);
	bl->bl_count = 0;
	/* Server block size, converted to 512-byte sectors */
	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
	return &bl->bl_layout;
}

/* Layoutdriver op: free a layout segment (plain kfree; see
 * bl_alloc_lseg for why no extent cleanup happens here).
 */
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);
	kfree(lseg);
}

/* We pretty much ignore lseg, and store all data layout wide, so we
 * can correctly merge.
 */
static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
						 struct nfs4_layoutget_res *lgr,
						 gfp_t gfp_flags)
{
	struct pnfs_layout_segment *lseg;
	int status;

	dprintk("%s enter\n", __func__);
	lseg = kzalloc(sizeof(*lseg), gfp_flags);
	if (!lseg)
		return ERR_PTR(-ENOMEM);
	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
	if (status) {
		/* We don't want to call the full-blown bl_free_lseg,
		 * since on error extents were not touched.
		 */
		kfree(lseg);
		return ERR_PTR(status);
	}
	return lseg;
}

/* Layoutdriver op: emit the layout-specific part of LAYOUTCOMMIT. */
static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
}

/* Layoutdriver op: post-LAYOUTCOMMIT cleanup of commit bookkeeping. */
static void
bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
{
	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;

	dprintk("%s enter\n", __func__);
	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
}
778
779static void free_blk_mountid(struct block_mount_id *mid)
780{
781 if (mid) {
782 struct pnfs_block_dev *dev;
783 spin_lock(&mid->bm_lock);
784 while (!list_empty(&mid->bm_devlist)) {
785 dev = list_first_entry(&mid->bm_devlist,
786 struct pnfs_block_dev,
787 bm_node);
788 list_del(&dev->bm_node);
789 bl_free_block_dev(dev);
790 }
791 spin_unlock(&mid->bm_lock);
792 kfree(mid);
793 }
794}
795
796/* This is mostly copied from the filelayout's get_device_info function.
797 * It seems much of this should be at the generic pnfs level.
798 */
799static struct pnfs_block_dev *
800nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
801 struct nfs4_deviceid *d_id)
802{
803 struct pnfs_device *dev;
804 struct pnfs_block_dev *rv;
805 u32 max_resp_sz;
806 int max_pages;
807 struct page **pages = NULL;
808 int i, rc;
809
810 /*
811 * Use the session max response size as the basis for setting
812 * GETDEVICEINFO's maxcount
813 */
814 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
815 max_pages = max_resp_sz >> PAGE_SHIFT;
816 dprintk("%s max_resp_sz %u max_pages %d\n",
817 __func__, max_resp_sz, max_pages);
818
819 dev = kmalloc(sizeof(*dev), GFP_NOFS);
820 if (!dev) {
821 dprintk("%s kmalloc failed\n", __func__);
822 return ERR_PTR(-ENOMEM);
823 }
824
825 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
826 if (pages == NULL) {
827 kfree(dev);
828 return ERR_PTR(-ENOMEM);
829 }
830 for (i = 0; i < max_pages; i++) {
831 pages[i] = alloc_page(GFP_NOFS);
832 if (!pages[i]) {
833 rv = ERR_PTR(-ENOMEM);
834 goto out_free;
835 }
836 }
837
838 memcpy(&dev->dev_id, d_id, sizeof(*d_id));
839 dev->layout_type = LAYOUT_BLOCK_VOLUME;
840 dev->pages = pages;
841 dev->pgbase = 0;
842 dev->pglen = PAGE_SIZE * max_pages;
843 dev->mincount = 0;
844
845 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
846 rc = nfs4_proc_getdeviceinfo(server, dev);
847 dprintk("%s getdevice info returns %d\n", __func__, rc);
848 if (rc) {
849 rv = ERR_PTR(rc);
850 goto out_free;
851 }
852
853 rv = nfs4_blk_decode_device(server, dev);
854 out_free:
855 for (i = 0; i < max_pages; i++)
856 __free_page(pages[i]);
857 kfree(pages);
858 kfree(dev);
859 return rv;
860}
861
/* Layoutdriver op: called at mount time.  Requires the server to have
 * advertised a pNFS block size, then fetches the device list via
 * GETDEVICELIST and resolves each device id with GETDEVICEINFO,
 * stashing the results in a block_mount_id hung off the nfs_server.
 * Returns 0 on success or a negative errno (all partial state is freed
 * on the error path).
 */
static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
	struct block_mount_id *b_mt_id = NULL;
	struct pnfs_devicelist *dlist = NULL;
	struct pnfs_block_dev *bdev;
	LIST_HEAD(block_disklist);
	int status, i;

	dprintk("%s enter\n", __func__);

	if (server->pnfs_blksize == 0) {
		dprintk("%s Server did not return blksize\n", __func__);
		return -EINVAL;
	}
	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
	if (!b_mt_id) {
		status = -ENOMEM;
		goto out_error;
	}
	/* Initialize nfs4 block layout mount id */
	spin_lock_init(&b_mt_id->bm_lock);
	INIT_LIST_HEAD(&b_mt_id->bm_devlist);

	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
	if (!dlist) {
		status = -ENOMEM;
		goto out_error;
	}
	dlist->eof = 0;
	/* Iterate GETDEVICELIST until the server reports eof */
	while (!dlist->eof) {
		status = nfs4_proc_getdevicelist(server, fh, dlist);
		if (status)
			goto out_error;
		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
			__func__, dlist->num_devs, dlist->eof);
		for (i = 0; i < dlist->num_devs; i++) {
			bdev = nfs4_blk_get_deviceinfo(server, fh,
						       &dlist->dev_id[i]);
			if (IS_ERR(bdev)) {
				status = PTR_ERR(bdev);
				goto out_error;
			}
			spin_lock(&b_mt_id->bm_lock);
			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
			spin_unlock(&b_mt_id->bm_lock);
		}
	}
	dprintk("%s SUCCESS\n", __func__);
	server->pnfs_ld_data = b_mt_id;

 out_return:
	kfree(dlist);
	return status;

 out_error:
	free_blk_mountid(b_mt_id);
	goto out_return;
}

/* Layoutdriver op: umount-time teardown of the block_mount_id. */
static int
bl_clear_layoutdriver(struct nfs_server *server)
{
	struct block_mount_id *b_mt_id = server->pnfs_ld_data;

	dprintk("%s enter\n", __func__);
	free_blk_mountid(b_mt_id);
	dprintk("%s RETURNS\n", __func__);
	return 0;
}

/* Generic pageio ops: the block driver has no special coalescing rules */
static const struct nfs_pageio_ops bl_pg_read_ops = {
	.pg_init = pnfs_generic_pg_init_read,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_readpages,
};

static const struct nfs_pageio_ops bl_pg_write_ops = {
	.pg_init = pnfs_generic_pg_init_write,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};

/* The layoutdriver registration record for LAYOUT_BLOCK_VOLUME */
static struct pnfs_layoutdriver_type blocklayout_type = {
	.id				= LAYOUT_BLOCK_VOLUME,
	.name				= "LAYOUT_BLOCK_VOLUME",
	.read_pagelist			= bl_read_pagelist,
	.write_pagelist			= bl_write_pagelist,
	.alloc_layout_hdr		= bl_alloc_layout_hdr,
	.free_layout_hdr		= bl_free_layout_hdr,
	.alloc_lseg			= bl_alloc_lseg,
	.free_lseg			= bl_free_lseg,
	.encode_layoutcommit		= bl_encode_layoutcommit,
	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
	.set_layoutdriver		= bl_set_layoutdriver,
	.clear_layoutdriver		= bl_clear_layoutdriver,
	.pg_read_ops			= &bl_pg_read_ops,
	.pg_write_ops			= &bl_pg_write_ops,
};

/* rpc_pipefs ops for the "blocklayout" upcall pipe used for device
 * resolution (downcall handlers live in blocklayoutdev.c).
 */
static const struct rpc_pipe_ops bl_upcall_ops = {
	.upcall		= rpc_pipe_generic_upcall,
	.downcall	= bl_pipe_downcall,
	.destroy_msg	= bl_pipe_destroy_msg,
};
967
/* Module init: register the layout driver with the generic pNFS layer
 * and create the "blocklayout" upcall pipe under the rpc_pipefs NFS
 * directory.  Unwinds both steps on failure.
 */
static int __init nfs4blocklayout_init(void)
{
	struct vfsmount *mnt;
	struct path path;
	int ret;

	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);

	ret = pnfs_register_layoutdriver(&blocklayout_type);
	if (ret)
		goto out;

	init_waitqueue_head(&bl_wq);

	mnt = rpc_get_mount();
	if (IS_ERR(mnt)) {
		ret = PTR_ERR(mnt);
		goto out_remove;
	}

	/* Locate the rpc_pipefs NFS directory to host our pipe */
	ret = vfs_path_lookup(mnt->mnt_root,
			      mnt,
			      NFS_PIPE_DIRNAME, 0, &path);
	if (ret)
		goto out_putrpc;

	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
				    &bl_upcall_ops, 0);
	path_put(&path);
	if (IS_ERR(bl_device_pipe)) {
		ret = PTR_ERR(bl_device_pipe);
		goto out_putrpc;
	}
out:
	return ret;

out_putrpc:
	rpc_put_mount();
out_remove:
	pnfs_unregister_layoutdriver(&blocklayout_type);
	return ret;
}

/* Module exit: undo nfs4blocklayout_init in reverse order. */
static void __exit nfs4blocklayout_exit(void)
{
	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
	       __func__);

	pnfs_unregister_layoutdriver(&blocklayout_type);
	rpc_unlink(bl_device_pipe);
	rpc_put_mount();
}

MODULE_ALIAS("nfs-layouttype4-3");

module_init(nfs4blocklayout_init);
module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 00000000000..42acf7ef599
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,205 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.h
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
33#define FS_NFS_NFS4BLOCKLAYOUT_H
34
35#include <linux/device-mapper.h>
36#include <linux/nfs_fs.h>
37#include <linux/sunrpc/rpc_pipe_fs.h>
38
39#include "../pnfs.h"
40
41#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
42#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
43
44struct block_mount_id {
45 spinlock_t bm_lock; /* protects list */
46 struct list_head bm_devlist; /* holds pnfs_block_dev */
47};
48
49struct pnfs_block_dev {
50 struct list_head bm_node;
51 struct nfs4_deviceid bm_mdevid; /* associated devid */
52 struct block_device *bm_mdev; /* meta device itself */
53};
54
55enum exstate4 {
56 PNFS_BLOCK_READWRITE_DATA = 0,
57 PNFS_BLOCK_READ_DATA = 1,
58 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
59 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
60};
61
62#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
63
64struct my_tree {
65 sector_t mtt_step_size; /* Internal sector alignment */
66 struct list_head mtt_stub; /* Should be a radix tree */
67};
68
69struct pnfs_inval_markings {
70 spinlock_t im_lock;
71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 sector_t im_block_size; /* Server blocksize in sectors */
73};
74
75struct pnfs_inval_tracking {
76 struct list_head it_link;
77 int it_sector;
78 int it_tags;
79};
80
81/* sector_t fields are all in 512-byte sectors */
82struct pnfs_block_extent {
83 struct kref be_refcnt;
84 struct list_head be_node; /* link into lseg list */
85 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
86 struct block_device *be_mdev;
87 sector_t be_f_offset; /* the starting offset in the file */
88 sector_t be_length; /* the size of the extent */
89 sector_t be_v_offset; /* the starting offset in the volume */
90 enum exstate4 be_state; /* the state of this extent */
91 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
92};
93
94/* Shortened extent used by LAYOUTCOMMIT */
95struct pnfs_block_short_extent {
96 struct list_head bse_node;
97 struct nfs4_deviceid bse_devid;
98 struct block_device *bse_mdev;
99 sector_t bse_f_offset; /* the starting offset in the file */
100 sector_t bse_length; /* the size of the extent */
101};
102
103static inline void
104BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
105{
106 spin_lock_init(&marks->im_lock);
107 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
108 marks->im_block_size = blocksize;
109 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 blocksize);
111}
112
113enum extentclass4 {
114 RW_EXTENT = 0, /* READWRTE and INVAL */
115 RO_EXTENT = 1, /* READ and NONE */
116 EXTENT_LISTS = 2,
117};
118
119static inline int bl_choose_list(enum exstate4 state)
120{
121 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
122 return RO_EXTENT;
123 else
124 return RW_EXTENT;
125}
126
127struct pnfs_block_layout {
128 struct pnfs_layout_hdr bl_layout;
129 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
130 spinlock_t bl_ext_lock; /* Protects list manipulation */
131 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
132 struct list_head bl_commit; /* Needs layout commit */
133 struct list_head bl_committing; /* Layout committing */
134 unsigned int bl_count; /* entries in bl_commit */
135 sector_t bl_blocksize; /* Server blocksize in sectors */
136};
137
138#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
139
140static inline struct pnfs_block_layout *
141BLK_LO2EXT(struct pnfs_layout_hdr *lo)
142{
143 return container_of(lo, struct pnfs_block_layout, bl_layout);
144}
145
146static inline struct pnfs_block_layout *
147BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
148{
149 return BLK_LO2EXT(lseg->pls_layout);
150}
151
152struct bl_dev_msg {
153 int32_t status;
154 uint32_t major, minor;
155};
156
157struct bl_msg_hdr {
158 u8 type;
159 u16 totallen; /* length of entire message, including hdr itself */
160};
161
162extern struct dentry *bl_device_pipe;
163extern wait_queue_head_t bl_wq;
164
165#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
166#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
167#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
168#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
169#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
170
171/* blocklayoutdev.c */
172ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
173void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
174struct block_device *nfs4_blkdev_get(dev_t dev);
175int nfs4_blkdev_put(struct block_device *bdev);
176struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
177 struct pnfs_device *dev);
178int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
179 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
180
181/* blocklayoutdm.c */
182void bl_free_block_dev(struct pnfs_block_dev *bdev);
183
184/* extents.c */
185struct pnfs_block_extent *
186bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
187 struct pnfs_block_extent **cow_read);
188int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
189 sector_t offset, sector_t length,
190 sector_t **pages);
191void bl_put_extent(struct pnfs_block_extent *be);
192struct pnfs_block_extent *bl_alloc_extent(void);
193int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
194int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
195 struct xdr_stream *xdr,
196 const struct nfs4_layoutcommit_args *arg);
197void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
198 const struct nfs4_layoutcommit_args *arg,
199 int status);
200int bl_add_merge_extent(struct pnfs_block_layout *bl,
201 struct pnfs_block_extent *new);
202int bl_mark_for_commit(struct pnfs_block_extent *be,
203 sector_t offset, sector_t length);
204
205#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644
index 00000000000..d08ba9107fd
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -0,0 +1,391 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 file layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "%s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/* Open a block_device by device number. */
57struct block_device *nfs4_blkdev_get(dev_t dev)
58{
59 struct block_device *bd;
60
61 dprintk("%s enter\n", __func__);
62 bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
63 if (IS_ERR(bd))
64 goto fail;
65 return bd;
66fail:
67 dprintk("%s failed to open device : %ld\n",
68 __func__, PTR_ERR(bd));
69 return NULL;
70}
71
72/*
73 * Release the block device
74 */
75int nfs4_blkdev_put(struct block_device *bdev)
76{
77 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
78 MINOR(bdev->bd_dev));
79 return blkdev_put(bdev, FMODE_READ);
80}
81
82static struct bl_dev_msg bl_mount_reply;
83
84ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
85 size_t mlen)
86{
87 if (mlen != sizeof (struct bl_dev_msg))
88 return -EINVAL;
89
90 if (copy_from_user(&bl_mount_reply, src, mlen) != 0)
91 return -EFAULT;
92
93 wake_up(&bl_wq);
94
95 return mlen;
96}
97
98void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
99{
100 if (msg->errno >= 0)
101 return;
102 wake_up(&bl_wq);
103}
104
105/*
106 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
107 */
108struct pnfs_block_dev *
109nfs4_blk_decode_device(struct nfs_server *server,
110 struct pnfs_device *dev)
111{
112 struct pnfs_block_dev *rv;
113 struct block_device *bd = NULL;
114 struct rpc_pipe_msg msg;
115 struct bl_msg_hdr bl_msg = {
116 .type = BL_DEVICE_MOUNT,
117 .totallen = dev->mincount,
118 };
119 uint8_t *dataptr;
120 DECLARE_WAITQUEUE(wq, current);
121 struct bl_dev_msg *reply = &bl_mount_reply;
122 int offset, len, i, rc;
123
124 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
125 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
126 dev->mincount);
127
128 memset(&msg, 0, sizeof(msg));
129 msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
130 if (!msg.data) {
131 rv = ERR_PTR(-ENOMEM);
132 goto out;
133 }
134
135 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
136 dataptr = (uint8_t *) msg.data;
137 len = dev->mincount;
138 offset = sizeof(bl_msg);
139 for (i = 0; len > 0; i++) {
140 memcpy(&dataptr[offset], page_address(dev->pages[i]),
141 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
142 len -= PAGE_CACHE_SIZE;
143 offset += PAGE_CACHE_SIZE;
144 }
145 msg.len = sizeof(bl_msg) + dev->mincount;
146
147 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
148 add_wait_queue(&bl_wq, &wq);
149 rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
150 if (rc < 0) {
151 remove_wait_queue(&bl_wq, &wq);
152 rv = ERR_PTR(rc);
153 goto out;
154 }
155
156 set_current_state(TASK_UNINTERRUPTIBLE);
157 schedule();
158 __set_current_state(TASK_RUNNING);
159 remove_wait_queue(&bl_wq, &wq);
160
161 if (reply->status != BL_DEVICE_REQUEST_PROC) {
162 dprintk("%s failed to open device: %d\n",
163 __func__, reply->status);
164 rv = ERR_PTR(-EINVAL);
165 goto out;
166 }
167
168 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
169 if (IS_ERR(bd)) {
170 rc = PTR_ERR(bd);
171 dprintk("%s failed to open device : %d\n", __func__, rc);
172 rv = ERR_PTR(rc);
173 goto out;
174 }
175
176 rv = kzalloc(sizeof(*rv), GFP_NOFS);
177 if (!rv) {
178 rv = ERR_PTR(-ENOMEM);
179 goto out;
180 }
181
182 rv->bm_mdev = bd;
183 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
184 dprintk("%s Created device %s with bd_block_size %u\n",
185 __func__,
186 bd->bd_disk->disk_name,
187 bd->bd_block_size);
188
189out:
190 kfree(msg.data);
191 return rv;
192}
193
194/* Map deviceid returned by the server to constructed block_device */
195static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
196 struct nfs4_deviceid *id)
197{
198 struct block_device *rv = NULL;
199 struct block_mount_id *mid;
200 struct pnfs_block_dev *dev;
201
202 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
203 mid = BLK_ID(lo);
204 spin_lock(&mid->bm_lock);
205 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
206 if (memcmp(id->data, dev->bm_mdevid.data,
207 NFS4_DEVICEID4_SIZE) == 0) {
208 rv = dev->bm_mdev;
209 goto out;
210 }
211 }
212 out:
213 spin_unlock(&mid->bm_lock);
214 dprintk("%s returning %p\n", __func__, rv);
215 return rv;
216}
217
218/* Tracks info needed to ensure extents in layout obey constraints of spec */
219struct layout_verification {
220 u32 mode; /* R or RW */
221 u64 start; /* Expected start of next non-COW extent */
222 u64 inval; /* Start of INVAL coverage */
223 u64 cowread; /* End of COW read coverage */
224};
225
226/* Verify the extent meets the layout requirements of the pnfs-block draft,
227 * section 2.3.1.
228 */
229static int verify_extent(struct pnfs_block_extent *be,
230 struct layout_verification *lv)
231{
232 if (lv->mode == IOMODE_READ) {
233 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
234 be->be_state == PNFS_BLOCK_INVALID_DATA)
235 return -EIO;
236 if (be->be_f_offset != lv->start)
237 return -EIO;
238 lv->start += be->be_length;
239 return 0;
240 }
241 /* lv->mode == IOMODE_RW */
242 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
243 if (be->be_f_offset != lv->start)
244 return -EIO;
245 if (lv->cowread > lv->start)
246 return -EIO;
247 lv->start += be->be_length;
248 lv->inval = lv->start;
249 return 0;
250 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
251 if (be->be_f_offset != lv->start)
252 return -EIO;
253 lv->start += be->be_length;
254 return 0;
255 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
256 if (be->be_f_offset > lv->start)
257 return -EIO;
258 if (be->be_f_offset < lv->inval)
259 return -EIO;
260 if (be->be_f_offset < lv->cowread)
261 return -EIO;
262 /* It looks like you might want to min this with lv->start,
263 * but you really don't.
264 */
265 lv->inval = lv->inval + be->be_length;
266 lv->cowread = be->be_f_offset + be->be_length;
267 return 0;
268 } else
269 return -EIO;
270}
271
272/* XDR decode pnfs_block_layout4 structure */
273int
274nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
275 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
276{
277 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
278 int i, status = -EIO;
279 uint32_t count;
280 struct pnfs_block_extent *be = NULL, *save;
281 struct xdr_stream stream;
282 struct xdr_buf buf;
283 struct page *scratch;
284 __be32 *p;
285 struct layout_verification lv = {
286 .mode = lgr->range.iomode,
287 .start = lgr->range.offset >> SECTOR_SHIFT,
288 .inval = lgr->range.offset >> SECTOR_SHIFT,
289 .cowread = lgr->range.offset >> SECTOR_SHIFT,
290 };
291 LIST_HEAD(extents);
292
293 dprintk("---> %s\n", __func__);
294
295 scratch = alloc_page(gfp_flags);
296 if (!scratch)
297 return -ENOMEM;
298
299 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
300 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
301
302 p = xdr_inline_decode(&stream, 4);
303 if (unlikely(!p))
304 goto out_err;
305
306 count = be32_to_cpup(p++);
307
308 dprintk("%s enter, number of extents %i\n", __func__, count);
309 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
310 if (unlikely(!p))
311 goto out_err;
312
313 /* Decode individual extents, putting them in temporary
314 * staging area until whole layout is decoded to make error
315 * recovery easier.
316 */
317 for (i = 0; i < count; i++) {
318 be = bl_alloc_extent();
319 if (!be) {
320 status = -ENOMEM;
321 goto out_err;
322 }
323 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
324 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
325 be->be_mdev = translate_devid(lo, &be->be_devid);
326 if (!be->be_mdev)
327 goto out_err;
328
329 /* The next three values are read in as bytes,
330 * but stored as 512-byte sector lengths
331 */
332 if (decode_sector_number(&p, &be->be_f_offset) < 0)
333 goto out_err;
334 if (decode_sector_number(&p, &be->be_length) < 0)
335 goto out_err;
336 if (decode_sector_number(&p, &be->be_v_offset) < 0)
337 goto out_err;
338 be->be_state = be32_to_cpup(p++);
339 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
340 be->be_inval = &bl->bl_inval;
341 if (verify_extent(be, &lv)) {
342 dprintk("%s verify failed\n", __func__);
343 goto out_err;
344 }
345 list_add_tail(&be->be_node, &extents);
346 }
347 if (lgr->range.offset + lgr->range.length !=
348 lv.start << SECTOR_SHIFT) {
349 dprintk("%s Final length mismatch\n", __func__);
350 be = NULL;
351 goto out_err;
352 }
353 if (lv.start < lv.cowread) {
354 dprintk("%s Final uncovered COW extent\n", __func__);
355 be = NULL;
356 goto out_err;
357 }
358 /* Extents decoded properly, now try to merge them in to
359 * existing layout extents.
360 */
361 spin_lock(&bl->bl_ext_lock);
362 list_for_each_entry_safe(be, save, &extents, be_node) {
363 list_del(&be->be_node);
364 status = bl_add_merge_extent(bl, be);
365 if (status) {
366 spin_unlock(&bl->bl_ext_lock);
367 /* This is a fairly catastrophic error, as the
368 * entire layout extent lists are now corrupted.
369 * We should have some way to distinguish this.
370 */
371 be = NULL;
372 goto out_err;
373 }
374 }
375 spin_unlock(&bl->bl_ext_lock);
376 status = 0;
377 out:
378 __free_page(scratch);
379 dprintk("%s returns %i\n", __func__, status);
380 return status;
381
382 out_err:
383 bl_put_extent(be);
384 while (!list_empty(&extents)) {
385 be = list_first_entry(&extents, struct pnfs_block_extent,
386 be_node);
387 list_del(&be->be_node);
388 bl_put_extent(be);
389 }
390 goto out;
391}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644
index 00000000000..d055c755807
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -0,0 +1,111 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(dev_t dev)
42{
43 struct rpc_pipe_msg msg;
44 struct bl_dev_msg bl_umount_request;
45 struct bl_msg_hdr bl_msg = {
46 .type = BL_DEVICE_UMOUNT,
47 .totallen = sizeof(bl_umount_request),
48 };
49 uint8_t *dataptr;
50 DECLARE_WAITQUEUE(wq, current);
51
52 dprintk("Entering %s\n", __func__);
53
54 memset(&msg, 0, sizeof(msg));
55 msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
56 if (!msg.data)
57 goto out;
58
59 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
60 bl_umount_request.major = MAJOR(dev);
61 bl_umount_request.minor = MINOR(dev);
62
63 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
64 dataptr = (uint8_t *) msg.data;
65 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
66 msg.len = sizeof(bl_msg) + bl_msg.totallen;
67
68 add_wait_queue(&bl_wq, &wq);
69 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
70 remove_wait_queue(&bl_wq, &wq);
71 goto out;
72 }
73
74 set_current_state(TASK_UNINTERRUPTIBLE);
75 schedule();
76 __set_current_state(TASK_RUNNING);
77 remove_wait_queue(&bl_wq, &wq);
78
79out:
80 kfree(msg.data);
81}
82
83/*
84 * Release meta device
85 */
86static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
87{
88 int rv;
89
90 dprintk("%s Releasing\n", __func__);
91 rv = nfs4_blkdev_put(bdev->bm_mdev);
92 if (rv)
93 printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
94 __func__, rv);
95
96 dev_remove(bdev->bm_mdev->bd_dev);
97}
98
99void bl_free_block_dev(struct pnfs_block_dev *bdev)
100{
101 if (bdev) {
102 if (bdev->bm_mdev) {
103 dprintk("%s Removing DM device: %d:%d\n",
104 __func__,
105 MAJOR(bdev->bm_mdev->bd_dev),
106 MINOR(bdev->bm_mdev->bd_dev));
107 nfs4_blk_metadev_release(bdev);
108 }
109 kfree(bdev);
110 }
111}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 00000000000..19fa7b0b8c0
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,935 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.h
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - do_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
55/* Complete stub using list while determine API wanted */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 if (storage)
114 new = storage;
115 else {
116 new = kmalloc(sizeof(*new), GFP_NOFS);
117 if (!new)
118 return -ENOMEM;
119 }
120 new->it_sector = s;
121 new->it_tags = (1 << tag);
122 list_add(&new->it_link, &pos->it_link);
123 return 1;
124 }
125}
126
127/* XXXX Really want option to not create */
128/* Over range, unions tag with existing entries, else creates entry with tag */
129static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
130{
131 u64 i;
132
133 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
134 for (i = normalize(s, tree->mtt_step_size); i < s + length;
135 i += tree->mtt_step_size)
136 if (_add_entry(tree, i, tag, NULL))
137 return -ENOMEM;
138 return 0;
139}
140
141/* Ensure that future operations on given range of tree will not malloc */
142static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
143{
144 u64 start, end, s;
145 int count, i, used = 0, status = -ENOMEM;
146 struct pnfs_inval_tracking **storage;
147
148 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
149 start = normalize(offset, tree->mtt_step_size);
150 end = normalize_up(offset + length, tree->mtt_step_size);
151 count = (int)(end - start) / (int)tree->mtt_step_size;
152
153 /* Pre-malloc what memory we might need */
154 storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
155 if (!storage)
156 return -ENOMEM;
157 for (i = 0; i < count; i++) {
158 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
159 GFP_NOFS);
160 if (!storage[i])
161 goto out_cleanup;
162 }
163
164 /* Now need lock - HOW??? */
165
166 for (s = start; s < end; s += tree->mtt_step_size)
167 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
168
169 /* Unlock - HOW??? */
170 status = 0;
171
172 out_cleanup:
173 for (i = used; i < count; i++) {
174 if (!storage[i])
175 break;
176 kfree(storage[i]);
177 }
178 kfree(storage);
179 return status;
180}
181
182static void set_needs_init(sector_t *array, sector_t offset)
183{
184 sector_t *p = array;
185
186 dprintk("%s enter\n", __func__);
187 if (!p)
188 return;
189 while (*p < offset)
190 p++;
191 if (*p == offset)
192 return;
193 else if (*p == ~0) {
194 *p++ = offset;
195 *p = ~0;
196 return;
197 } else {
198 sector_t *save = p;
199 dprintk("%s Adding %llu\n", __func__, (u64)offset);
200 while (*p != ~0)
201 p++;
202 p++;
203 memmove(save + 1, save, (char *)p - (char *)save);
204 *save = offset;
205 return;
206 }
207}
208
209/* We are relying on page lock to serialize this */
210int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
211{
212 int rv;
213
214 spin_lock(&marks->im_lock);
215 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
216 spin_unlock(&marks->im_lock);
217 return rv;
218}
219
220/* Assume start, end already sector aligned */
221static int
222_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
223{
224 struct pnfs_inval_tracking *pos;
225 u64 expect = 0;
226
227 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
228 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
229 if (pos->it_sector >= end)
230 continue;
231 if (!expect) {
232 if ((pos->it_sector == end - tree->mtt_step_size) &&
233 (pos->it_tags & (1 << tag))) {
234 expect = pos->it_sector - tree->mtt_step_size;
235 if (pos->it_sector < tree->mtt_step_size || expect < start)
236 return 1;
237 continue;
238 } else {
239 return 0;
240 }
241 }
242 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
243 return 0;
244 expect -= tree->mtt_step_size;
245 if (expect < start)
246 return 1;
247 }
248 return 0;
249}
250
251static int is_range_written(struct pnfs_inval_markings *marks,
252 sector_t start, sector_t end)
253{
254 int rv;
255
256 spin_lock(&marks->im_lock);
257 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
258 spin_unlock(&marks->im_lock);
259 return rv;
260}
261
/* Marks sectors in [offset, offset + length) as having been initialized.
 * All lengths are step-aligned, where step is min(pagesize, blocksize).
 * Notes where partial block is initialized, and helps prepare it for
 * complete initialization later.
 *
 * If @pages is non-NULL, *pages is set to an array of page-aligned sectors
 * within the enclosing block(s) that still lack EXTENT_INITIALIZED (pages
 * the caller must fill in), or to NULL when there are none.  The array is
 * primed with ~0 as an empty marker; presumably set_needs_init() (not
 * visible here) keeps it ~0-terminated — TODO confirm.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
/* Currently assumes offset is page-aligned */
int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
			     sector_t offset, sector_t length,
			     sector_t **pages)
{
	sector_t s, start, end;
	sector_t *array = NULL; /* Pages to mark */

	dprintk("%s(offset=%llu,len=%llu) enter\n",
		__func__, (u64)offset, (u64)length);
	/* Array capacity: presumably sized for the worst case of partial
	 * pages in the head and tail blocks plus a terminator.
	 */
	s = max((sector_t) 3,
		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
	dprintk("%s set max=%llu\n", __func__, (u64)s);
	if (pages) {
		array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
		if (!array)
			goto outerr;
		array[0] = ~0;	/* marks the list as empty */
	}

	/* Widen the range to whole blocks before tagging */
	start = normalize(offset, marks->im_block_size);
	end = normalize_up(offset + length, marks->im_block_size);
	/* Preload tree nodes outside the lock so the tagging below
	 * cannot fail for lack of memory while im_lock is held.
	 */
	if (_preload_range(&marks->im_tree, start, end - start))
		goto outerr;

	spin_lock(&marks->im_lock);

	/* Head-block pages preceding the newly initialized area */
	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
	     s < offset; s += PAGE_CACHE_SECTORS) {
		dprintk("%s pre-area pages\n", __func__);
		/* Portion of used block is not initialized */
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}
	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
		goto out_unlock;
	/* Tail-block pages following the newly initialized area */
	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
	     s < end; s += PAGE_CACHE_SECTORS) {
		dprintk("%s post-area pages\n", __func__);
		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
			set_needs_init(array, s);
	}

	spin_unlock(&marks->im_lock);

	if (pages) {
		if (array[0] == ~0) {	/* no partial pages found */
			kfree(array);
			*pages = NULL;
		} else
			*pages = array;
	}
	return 0;

 out_unlock:
	spin_unlock(&marks->im_lock);
 outerr:
	if (pages) {
		kfree(array);
		*pages = NULL;
	}
	return -ENOMEM;
}
330
331/* Marks sectors in [offest, offset+length) as having been written to disk.
332 * All lengths should be block aligned.
333 */
334static int mark_written_sectors(struct pnfs_inval_markings *marks,
335 sector_t offset, sector_t length)
336{
337 int status;
338
339 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
340 (u64)offset, (u64)length);
341 spin_lock(&marks->im_lock);
342 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
343 spin_unlock(&marks->im_lock);
344 return status;
345}
346
347static void print_short_extent(struct pnfs_block_short_extent *be)
348{
349 dprintk("PRINT SHORT EXTENT extent %p\n", be);
350 if (be) {
351 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
352 dprintk(" be_length %llu\n", (u64)be->bse_length);
353 }
354}
355
356static void print_clist(struct list_head *list, unsigned int count)
357{
358 struct pnfs_block_short_extent *be;
359 unsigned int i = 0;
360
361 ifdebug(FACILITY) {
362 printk(KERN_DEBUG "****************\n");
363 printk(KERN_DEBUG "Extent list looks like:\n");
364 list_for_each_entry(be, list, bse_node) {
365 i++;
366 print_short_extent(be);
367 }
368 if (i != count)
369 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
370 printk(KERN_DEBUG "****************\n");
371 }
372}
373
/* Insert @new into bl->bl_commit keeping the list ordered by file offset,
 * merging with any same-device extents it overlaps or abuts.  Ownership of
 * @new transfers to this function: it is either linked into the list or
 * kfree'd (when the range is already fully covered).  bl->bl_count tracks
 * the list length.  Caller holds bl->bl_ext_lock.
 *
 * Note: In theory, we should do more checking that devid's match between
 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
 */
/* Note this is very similar to bl_add_merge_extent */
static void add_to_commitlist(struct pnfs_block_layout *bl,
			      struct pnfs_block_short_extent *new)
{
	struct list_head *clist = &bl->bl_commit;
	struct pnfs_block_short_extent *old, *save;
	sector_t end = new->bse_f_offset + new->bse_length;

	dprintk("%s enter\n", __func__);
	print_short_extent(new);
	print_clist(clist, bl->bl_count);
	bl->bl_count++;
	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe(old, save, clist, bse_node) {
		if (new->bse_f_offset < old->bse_f_offset)
			break;
		if (end <= old->bse_f_offset + old->bse_length) {
			/* Range is already in list */
			bl->bl_count--;
			kfree(new);
			return;
		} else if (new->bse_f_offset <=
				old->bse_f_offset + old->bse_length) {
			/* new overlaps or abuts existing be */
			if (new->bse_mdev == old->bse_mdev) {
				/* extend new to fully replace old */
				new->bse_length += new->bse_f_offset -
						old->bse_f_offset;
				new->bse_f_offset = old->bse_f_offset;
				list_del(&old->bse_node);
				bl->bl_count--;
				kfree(old);
			}
		}
	}
	/* Note that if we never hit the above break, old will not point to a
	 * valid extent.  However, in that case &old->bse_node == list, so the
	 * list_add_tail below still appends new at the end of the list.
	 */
	list_add_tail(&new->bse_node, &old->bse_node);
	/* Scan forward for overlaps.  If we find any, extend new and
	 * remove the overlapped extent.
	 */
	old = list_prepare_entry(new, clist, bse_node);
	list_for_each_entry_safe_continue(old, save, clist, bse_node) {
		if (end < old->bse_f_offset)
			break;
		/* new overlaps or abuts old */
		if (new->bse_mdev == old->bse_mdev) {
			if (end < old->bse_f_offset + old->bse_length) {
				/* extend new to fully cover old */
				end = old->bse_f_offset + old->bse_length;
				new->bse_length = end - new->bse_f_offset;
			}
			list_del(&old->bse_node);
			bl->bl_count--;
			kfree(old);
		}
	}
	dprintk("%s: after merging\n", __func__);
	print_clist(clist, bl->bl_count);
}
440
/* Note the range described by offset, length is guaranteed to be contained
 * within be.
 *
 * Mark [offset, offset + length) written and queue the block-aligned
 * portion of it on the layout's commit list for a later LAYOUTCOMMIT.
 * Each edge is first pushed outward to a block boundary; the expansion is
 * kept only if every extra sector has already been written, otherwise the
 * edge is pulled back inward to the next boundary.  Returns 0 or -ENOMEM.
 */
int bl_mark_for_commit(struct pnfs_block_extent *be,
		    sector_t offset, sector_t length)
{
	sector_t new_end, end = offset + length;
	struct pnfs_block_short_extent *new;
	struct pnfs_block_layout *bl = container_of(be->be_inval,
						    struct pnfs_block_layout,
						    bl_inval);

	new = kmalloc(sizeof(*new), GFP_NOFS);
	if (!new)
		return -ENOMEM;

	mark_written_sectors(be->be_inval, offset, length);
	/* We want to add the range to commit list, but it must be
	 * block-normalized, and verified that the normalized range has
	 * been entirely written to disk.
	 */
	new->bse_f_offset = offset;
	/* Try to extend the start down to a block boundary */
	offset = normalize(offset, bl->bl_blocksize);
	if (offset < new->bse_f_offset) {
		if (is_range_written(be->be_inval, offset, new->bse_f_offset))
			new->bse_f_offset = offset;
		else
			new->bse_f_offset = offset + bl->bl_blocksize;
	}
	/* Likewise extend the end up to a block boundary */
	new_end = normalize_up(end, bl->bl_blocksize);
	if (end < new_end) {
		if (is_range_written(be->be_inval, end, new_end))
			end = new_end;
		else
			end = new_end - bl->bl_blocksize;
	}
	/* Nothing block-aligned survived the trimming */
	if (end <= new->bse_f_offset) {
		kfree(new);
		return 0;
	}
	new->bse_length = end - new->bse_f_offset;
	new->bse_devid = be->be_devid;
	new->bse_mdev = be->be_mdev;

	spin_lock(&bl->bl_ext_lock);
	/* new will be freed, either by add_to_commitlist if it decides not
	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
	 */
	add_to_commitlist(bl, new);
	spin_unlock(&bl->bl_ext_lock);
	return 0;
}
493
494static void print_bl_extent(struct pnfs_block_extent *be)
495{
496 dprintk("PRINT EXTENT extent %p\n", be);
497 if (be) {
498 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
499 dprintk(" be_length %llu\n", (u64)be->be_length);
500 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
501 dprintk(" be_state %d\n", be->be_state);
502 }
503}
504
505static void
506destroy_extent(struct kref *kref)
507{
508 struct pnfs_block_extent *be;
509
510 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
511 dprintk("%s be=%p\n", __func__, be);
512 kfree(be);
513}
514
515void
516bl_put_extent(struct pnfs_block_extent *be)
517{
518 if (be) {
519 dprintk("%s enter %p (%i)\n", __func__, be,
520 atomic_read(&be->be_refcnt.refcount));
521 kref_put(&be->be_refcnt, destroy_extent);
522 }
523}
524
525struct pnfs_block_extent *bl_alloc_extent(void)
526{
527 struct pnfs_block_extent *be;
528
529 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
530 if (!be)
531 return NULL;
532 INIT_LIST_HEAD(&be->be_node);
533 kref_init(&be->be_refcnt);
534 be->be_inval = NULL;
535 return be;
536}
537
538static void print_elist(struct list_head *list)
539{
540 struct pnfs_block_extent *be;
541 dprintk("****************\n");
542 dprintk("Extent list looks like:\n");
543 list_for_each_entry(be, list, be_node) {
544 print_bl_extent(be);
545 }
546 dprintk("****************\n");
547}
548
549static inline int
550extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
551{
552 /* Note this assumes new->be_f_offset >= old->be_f_offset */
553 return (new->be_state == old->be_state) &&
554 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
555 ((new->be_v_offset - old->be_v_offset ==
556 new->be_f_offset - old->be_f_offset) &&
557 new->be_mdev == old->be_mdev));
558}
559
/* Adds new to appropriate list in bl, modifying new and removing existing
 * extents as appropriate to deal with overlaps.
 *
 * See bl_find_get_extent for list constraints.
 *
 * Refcount on new is already set.  If end up not using it, or error out,
 * need to put the reference.
 *
 * bl->bl_ext_lock is held by caller.
 *
 * Returns 0 on success (new linked in, or dropped as a redundant subset of
 * an existing extent); -EIO when an overlapping extent is inconsistent
 * with new (different state, device, or file-to-volume shift).
 */
int
bl_add_merge_extent(struct pnfs_block_layout *bl,
		     struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be, *tmp;
	sector_t end = new->be_f_offset + new->be_length;
	struct list_head *list;

	dprintk("%s enter with be=%p\n", __func__, new);
	print_bl_extent(new);
	list = &bl->bl_extents[bl_choose_list(new->be_state)];
	print_elist(list);

	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
		if (new->be_f_offset >= be->be_f_offset + be->be_length)
			break;
		if (new->be_f_offset >= be->be_f_offset) {
			if (end <= be->be_f_offset + be->be_length) {
				/* new is a subset of existing be */
				if (extents_consistent(be, new)) {
					dprintk("%s: new is subset, ignoring\n",
						__func__);
					bl_put_extent(new);
					return 0;
				} else {
					goto out_err;
				}
			} else {
				/* |<--   be   -->|
				 *        |<--   new   -->| */
				if (extents_consistent(be, new)) {
					/* extend new to fully replace be */
					new->be_length += new->be_f_offset -
						be->be_f_offset;
					new->be_f_offset = be->be_f_offset;
					new->be_v_offset = be->be_v_offset;
					dprintk("%s: removing %p\n", __func__, be);
					list_del(&be->be_node);
					bl_put_extent(be);
				} else {
					goto out_err;
				}
			}
		} else if (end >= be->be_f_offset + be->be_length) {
			/* new extent fully covers existing be */
			if (extents_consistent(be, new)) {
				/* extend new to fully replace be */
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		} else if (end > be->be_f_offset) {
			/*        |<--   be   -->|
			 * |<--   new   -->| */
			/* Argument order is swapped here because
			 * extents_consistent assumes its second argument
			 * starts at the later file offset, and be does.
			 */
			if (extents_consistent(new, be)) {
				/* extend new to fully replace be */
				new->be_length += be->be_f_offset + be->be_length -
					new->be_f_offset - new->be_length;
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		}
	}
	/* Note that if we never hit the above break, be will not point to a
	 * valid extent.  However, in that case &be->be_node==list.
	 */
	list_add(&new->be_node, &be->be_node);
	dprintk("%s: inserting new\n", __func__);
	print_elist(list);
	/* FIXME - The per-list consistency checks have all been done,
	 * should now check cross-list consistency.
	 */
	return 0;

 out_err:
	bl_put_extent(new);
	return -EIO;
}
656
/* Returns extent, or NULL.  If a second READ extent exists, it is returned
 * in cow_read, if given.
 *
 * The extents are kept in two separate ordered lists, one for READ and NONE,
 * one for READWRITE and INVALID.  Within each list, we assume:
 * 1. Extents are ordered by file offset.
 * 2. For any given isect, there is at most one extent that matches.
 *
 * A reference is taken on the returned extent (and on *cow_read when one
 * is set); the caller must drop them with bl_put_extent().
 */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
	    struct pnfs_block_extent **cow_read)
{
	struct pnfs_block_extent *be, *cow, *ret;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	cow = ret = NULL;
	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		/* Reverse walk; stop once extents end at or before isect */
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				if (!ret)
					ret = be;
				else if (be->be_state != PNFS_BLOCK_READ_DATA)
					bl_put_extent(be);
				else
					cow = be;
				break;
			}
		}
		/* Keep scanning the other list only when the first hit was
		 * INVALID and the caller asked for a cow-read source.
		 */
		if (ret &&
		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
			break;
	}
	spin_unlock(&bl->bl_ext_lock);
	if (cow_read)
		*cow_read = cow;
	print_bl_extent(ret);
	return ret;
}
703
704/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
705static struct pnfs_block_extent *
706bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
707{
708 struct pnfs_block_extent *be, *ret = NULL;
709 int i;
710
711 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
712 for (i = 0; i < EXTENT_LISTS; i++) {
713 if (ret)
714 break;
715 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
716 if (isect >= be->be_f_offset + be->be_length)
717 break;
718 if (isect >= be->be_f_offset) {
719 /* We have found an extent */
720 dprintk("%s Get %p (%i)\n", __func__, be,
721 atomic_read(&be->be_refcnt.refcount));
722 kref_get(&be->be_refcnt);
723 ret = be;
724 break;
725 }
726 }
727 }
728 print_bl_extent(ret);
729 return ret;
730}
731
/* XDR-encode the layout's commit list for LAYOUTCOMMIT: an opaque length
 * followed by an array of write extents.  Each encoded entry is moved from
 * bl->bl_commit to bl->bl_committing under bl_ext_lock so that
 * clean_pnfs_block_layoutupdate() can finish (or requeue) it when the
 * server replies.  Always returns 0; if xdr space runs out, the remaining
 * entries simply stay on bl_commit for a later LAYOUTCOMMIT.
 */
int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			       struct xdr_stream *xdr,
			       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_short_extent *lce, *save;
	unsigned int count = 0;
	__be32 *p, *xdr_start;

	dprintk("%s enter\n", __func__);
	/* BUG - creation of bl_commit is buggy - need to wait for
	 * entire block to be marked WRITTEN before it can be added.
	 */
	spin_lock(&bl->bl_ext_lock);
	/* Want to adjust for possible truncate */
	/* We now want to adjust argument range */

	/* XDR encode the ranges found */
	xdr_start = xdr_reserve_space(xdr, 8);	/* opaque len + count, backfilled below */
	if (!xdr_start)
		goto out;
	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
		/* deviceid + 2 hypers (offset, length) + hyper + state word */
		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
		if (!p)
			break;
		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, 0LL);
		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
		list_del(&lce->bse_node);
		list_add_tail(&lce->bse_node, &bl->bl_committing);
		bl->bl_count--;
		count++;
	}
	/* Backfill the opaque byte length (everything after the length
	 * word itself) and the number of extents encoded.
	 */
	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
	xdr_start[1] = cpu_to_be32(count);
out:
	spin_unlock(&bl->bl_ext_lock);
	dprintk("%s found %i ranges\n", __func__, count);
	return 0;
}
774
775/* Helper function to set_to_rw that initialize a new extent */
776static void
777_prep_new_extent(struct pnfs_block_extent *new,
778 struct pnfs_block_extent *orig,
779 sector_t offset, sector_t length, int state)
780{
781 kref_init(&new->be_refcnt);
782 /* don't need to INIT_LIST_HEAD(&new->be_node) */
783 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
784 new->be_mdev = orig->be_mdev;
785 new->be_f_offset = offset;
786 new->be_length = length;
787 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
788 new->be_state = state;
789 new->be_inval = orig->be_inval;
790}
791
/* Tries to merge be with extent in front of it in list.
 * Frees storage if not used.
 *
 * On a successful merge the combined extent is built in @storage, which
 * takes the predecessor's place in the list; both old extents are unlinked
 * and have a reference dropped.  Returns the extent now covering be's
 * range: @storage if merged, otherwise @be.  Caller holds bl->bl_ext_lock
 * (called from set_to_rw under that lock).
 */
static struct pnfs_block_extent *
_front_merge(struct pnfs_block_extent *be, struct list_head *head,
	     struct pnfs_block_extent *storage)
{
	struct pnfs_block_extent *prev;

	if (!storage)
		goto no_merge;
	/* be is the list head itself or has no predecessor */
	if (&be->be_node == head || be->be_node.prev == head)
		goto no_merge;
	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
	/* Must be contiguous in file offset and mutually consistent */
	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
	    !extents_consistent(prev, be))
		goto no_merge;
	_prep_new_extent(storage, prev, prev->be_f_offset,
			 prev->be_length + be->be_length, prev->be_state);
	list_replace(&prev->be_node, &storage->be_node);
	bl_put_extent(prev);
	list_del(&be->be_node);
	bl_put_extent(be);
	return storage;

 no_merge:
	kfree(storage);
	return be;
}
821
822static u64
823set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
824{
825 u64 rv = offset + length;
826 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
827 struct pnfs_block_extent *children[3];
828 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
829 int i = 0, j;
830
831 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
832 /* Create storage for up to three new extents e1, e2, e3 */
833 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
834 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
835 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
836 /* BUG - we are ignoring any failure */
837 if (!e1 || !e2 || !e3)
838 goto out_nosplit;
839
840 spin_lock(&bl->bl_ext_lock);
841 be = bl_find_get_extent_locked(bl, offset);
842 rv = be->be_f_offset + be->be_length;
843 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
844 spin_unlock(&bl->bl_ext_lock);
845 goto out_nosplit;
846 }
847 /* Add e* to children, bumping e*'s krefs */
848 if (be->be_f_offset != offset) {
849 _prep_new_extent(e1, be, be->be_f_offset,
850 offset - be->be_f_offset,
851 PNFS_BLOCK_INVALID_DATA);
852 children[i++] = e1;
853 print_bl_extent(e1);
854 } else
855 merge1 = e1;
856 _prep_new_extent(e2, be, offset,
857 min(length, be->be_f_offset + be->be_length - offset),
858 PNFS_BLOCK_READWRITE_DATA);
859 children[i++] = e2;
860 print_bl_extent(e2);
861 if (offset + length < be->be_f_offset + be->be_length) {
862 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
863 be->be_f_offset + be->be_length -
864 offset - length,
865 PNFS_BLOCK_INVALID_DATA);
866 children[i++] = e3;
867 print_bl_extent(e3);
868 } else
869 merge2 = e3;
870
871 /* Remove be from list, and insert the e* */
872 /* We don't get refs on e*, since this list is the base reference
873 * set when init'ed.
874 */
875 if (i < 3)
876 children[i] = NULL;
877 new = children[0];
878 list_replace(&be->be_node, &new->be_node);
879 bl_put_extent(be);
880 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
881 for (j = 1; j < i; j++) {
882 old = new;
883 new = children[j];
884 list_add(&new->be_node, &old->be_node);
885 }
886 if (merge2) {
887 /* This is a HACK, should just create a _back_merge function */
888 new = list_entry(new->be_node.next,
889 struct pnfs_block_extent, be_node);
890 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
891 }
892 spin_unlock(&bl->bl_ext_lock);
893
894 /* Since we removed the base reference above, be is now scheduled for
895 * destruction.
896 */
897 bl_put_extent(be);
898 dprintk("%s returns %llu after split\n", __func__, rv);
899 return rv;
900
901 out_nosplit:
902 kfree(e1);
903 kfree(e2);
904 kfree(e3);
905 dprintk("%s returns %llu without splitting\n", __func__, rv);
906 return rv;
907}
908
/* LAYOUTCOMMIT reply post-processing.  Walk bl->bl_committing (populated
 * by encode_pnfs_block_layoutupdate).  On success (@status == 0) each
 * committed range is flipped to READWRITE in the extent lists via
 * set_to_rw() and its entry freed; on failure each entry is requeued on
 * the commit list so a later LAYOUTCOMMIT can retry it.
 */
void
clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			      const struct nfs4_layoutcommit_args *arg,
			      int status)
{
	struct pnfs_block_short_extent *lce, *save;

	dprintk("%s status %d\n", __func__, status);
	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
		if (likely(!status)) {
			u64 offset = lce->bse_f_offset;
			u64 end = offset + lce->bse_length;

			/* set_to_rw converts one underlying extent at a
			 * time and returns where it stopped; loop until
			 * the whole committed range has been covered.
			 */
			do {
				offset = set_to_rw(bl, offset, end - offset);
			} while (offset < end);
			list_del(&lce->bse_node);

			kfree(lce);
		} else {
			list_del(&lce->bse_node);
			/* add_to_commitlist requires bl_ext_lock */
			spin_lock(&bl->bl_ext_lock);
			add_to_commitlist(bl, lce);
			spin_unlock(&bl->bl_ext_lock);
		}
	}
}
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 84690319e62..c98b439332f 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -113,19 +113,18 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
113 113
114int nfs_cache_register(struct cache_detail *cd) 114int nfs_cache_register(struct cache_detail *cd)
115{ 115{
116 struct nameidata nd;
117 struct vfsmount *mnt; 116 struct vfsmount *mnt;
117 struct path path;
118 int ret; 118 int ret;
119 119
120 mnt = rpc_get_mount(); 120 mnt = rpc_get_mount();
121 if (IS_ERR(mnt)) 121 if (IS_ERR(mnt))
122 return PTR_ERR(mnt); 122 return PTR_ERR(mnt);
123 ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); 123 ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path);
124 if (ret) 124 if (ret)
125 goto err; 125 goto err;
126 ret = sunrpc_cache_register_pipefs(nd.path.dentry, 126 ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd);
127 cd->name, 0600, cd); 127 path_put(&path);
128 path_put(&nd.path);
129 if (!ret) 128 if (!ret)
130 return ret; 129 return ret;
131err: 130err:
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 76f856e284e..7cf6cafcc00 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -6,7 +6,7 @@
6 6
7#include <linux/completion.h> 7#include <linux/completion.h>
8#include <linux/sunrpc/cache.h> 8#include <linux/sunrpc/cache.h>
9#include <asm/atomic.h> 9#include <linux/atomic.h>
10 10
11/* 11/*
12 * Deferred request handling 12 * Deferred request handling
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b257383bb56..07df5f1d85e 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,6 +38,7 @@ enum nfs4_callback_opnum {
38struct cb_process_state { 38struct cb_process_state {
39 __be32 drc_status; 39 __be32 drc_status;
40 struct nfs_client *clp; 40 struct nfs_client *clp;
41 int slotid;
41}; 42};
42 43
43struct cb_compound_hdr_arg { 44struct cb_compound_hdr_arg {
@@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall(
166 void *dummy, struct cb_process_state *cps); 167 void *dummy, struct cb_process_state *cps);
167 168
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); 169extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170 170
171struct cb_devicenotifyitem { 171struct cb_devicenotifyitem {
172 uint32_t cbd_notify_type; 172 uint32_t cbd_notify_type;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index d4d1954e9bb..54cea8ad5a7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -111,6 +111,7 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
111static u32 initiate_file_draining(struct nfs_client *clp, 111static u32 initiate_file_draining(struct nfs_client *clp,
112 struct cb_layoutrecallargs *args) 112 struct cb_layoutrecallargs *args)
113{ 113{
114 struct nfs_server *server;
114 struct pnfs_layout_hdr *lo; 115 struct pnfs_layout_hdr *lo;
115 struct inode *ino; 116 struct inode *ino;
116 bool found = false; 117 bool found = false;
@@ -118,21 +119,28 @@ static u32 initiate_file_draining(struct nfs_client *clp,
118 LIST_HEAD(free_me_list); 119 LIST_HEAD(free_me_list);
119 120
120 spin_lock(&clp->cl_lock); 121 spin_lock(&clp->cl_lock);
121 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { 122 rcu_read_lock();
122 if (nfs_compare_fh(&args->cbl_fh, 123 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
123 &NFS_I(lo->plh_inode)->fh)) 124 list_for_each_entry(lo, &server->layouts, plh_layouts) {
124 continue; 125 if (nfs_compare_fh(&args->cbl_fh,
125 ino = igrab(lo->plh_inode); 126 &NFS_I(lo->plh_inode)->fh))
126 if (!ino) 127 continue;
127 continue; 128 ino = igrab(lo->plh_inode);
128 found = true; 129 if (!ino)
129 /* Without this, layout can be freed as soon 130 continue;
130 * as we release cl_lock. 131 found = true;
131 */ 132 /* Without this, layout can be freed as soon
132 get_layout_hdr(lo); 133 * as we release cl_lock.
133 break; 134 */
135 get_layout_hdr(lo);
136 break;
137 }
138 if (found)
139 break;
134 } 140 }
141 rcu_read_unlock();
135 spin_unlock(&clp->cl_lock); 142 spin_unlock(&clp->cl_lock);
143
136 if (!found) 144 if (!found)
137 return NFS4ERR_NOMATCHING_LAYOUT; 145 return NFS4ERR_NOMATCHING_LAYOUT;
138 146
@@ -154,6 +162,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
154static u32 initiate_bulk_draining(struct nfs_client *clp, 162static u32 initiate_bulk_draining(struct nfs_client *clp,
155 struct cb_layoutrecallargs *args) 163 struct cb_layoutrecallargs *args)
156{ 164{
165 struct nfs_server *server;
157 struct pnfs_layout_hdr *lo; 166 struct pnfs_layout_hdr *lo;
158 struct inode *ino; 167 struct inode *ino;
159 u32 rv = NFS4ERR_NOMATCHING_LAYOUT; 168 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -167,18 +176,24 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
167 }; 176 };
168 177
169 spin_lock(&clp->cl_lock); 178 spin_lock(&clp->cl_lock);
170 list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { 179 rcu_read_lock();
180 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
171 if ((args->cbl_recall_type == RETURN_FSID) && 181 if ((args->cbl_recall_type == RETURN_FSID) &&
172 memcmp(&NFS_SERVER(lo->plh_inode)->fsid, 182 memcmp(&server->fsid, &args->cbl_fsid,
173 &args->cbl_fsid, sizeof(struct nfs_fsid))) 183 sizeof(struct nfs_fsid)))
174 continue;
175 if (!igrab(lo->plh_inode))
176 continue; 184 continue;
177 get_layout_hdr(lo); 185
178 BUG_ON(!list_empty(&lo->plh_bulk_recall)); 186 list_for_each_entry(lo, &server->layouts, plh_layouts) {
179 list_add(&lo->plh_bulk_recall, &recall_list); 187 if (!igrab(lo->plh_inode))
188 continue;
189 get_layout_hdr(lo);
190 BUG_ON(!list_empty(&lo->plh_bulk_recall));
191 list_add(&lo->plh_bulk_recall, &recall_list);
192 }
180 } 193 }
194 rcu_read_unlock();
181 spin_unlock(&clp->cl_lock); 195 spin_unlock(&clp->cl_lock);
196
182 list_for_each_entry_safe(lo, tmp, 197 list_for_each_entry_safe(lo, tmp,
183 &recall_list, plh_bulk_recall) { 198 &recall_list, plh_bulk_recall) {
184 ino = lo->plh_inode; 199 ino = lo->plh_inode;
@@ -324,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
324 dprintk("%s enter. slotid %d seqid %d\n", 339 dprintk("%s enter. slotid %d seqid %d\n",
325 __func__, args->csa_slotid, args->csa_sequenceid); 340 __func__, args->csa_slotid, args->csa_sequenceid);
326 341
327 if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) 342 if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
328 return htonl(NFS4ERR_BADSLOT); 343 return htonl(NFS4ERR_BADSLOT);
329 344
330 slot = tbl->slots + args->csa_slotid; 345 slot = tbl->slots + args->csa_slotid;
@@ -333,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
333 /* Normal */ 348 /* Normal */
334 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { 349 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
335 slot->seq_nr++; 350 slot->seq_nr++;
336 return htonl(NFS4_OK); 351 goto out_ok;
337 } 352 }
338 353
339 /* Replay */ 354 /* Replay */
@@ -352,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
352 /* Wraparound */ 367 /* Wraparound */
353 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { 368 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
354 slot->seq_nr = 1; 369 slot->seq_nr = 1;
355 return htonl(NFS4_OK); 370 goto out_ok;
356 } 371 }
357 372
358 /* Misordered request */ 373 /* Misordered request */
359 return htonl(NFS4ERR_SEQ_MISORDERED); 374 return htonl(NFS4ERR_SEQ_MISORDERED);
375out_ok:
376 tbl->highest_used_slotid = args->csa_slotid;
377 return htonl(NFS4_OK);
360} 378}
361 379
362/* 380/*
@@ -418,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
418 struct cb_sequenceres *res, 436 struct cb_sequenceres *res,
419 struct cb_process_state *cps) 437 struct cb_process_state *cps)
420{ 438{
439 struct nfs4_slot_table *tbl;
421 struct nfs_client *clp; 440 struct nfs_client *clp;
422 int i; 441 int i;
423 __be32 status = htonl(NFS4ERR_BADSESSION); 442 __be32 status = htonl(NFS4ERR_BADSESSION);
424 443
425 cps->clp = NULL;
426
427 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); 444 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
428 if (clp == NULL) 445 if (clp == NULL)
429 goto out; 446 goto out;
430 447
448 tbl = &clp->cl_session->bc_slot_table;
449
450 spin_lock(&tbl->slot_tbl_lock);
431 /* state manager is resetting the session */ 451 /* state manager is resetting the session */
432 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { 452 if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
433 status = NFS4ERR_DELAY; 453 spin_unlock(&tbl->slot_tbl_lock);
454 status = htonl(NFS4ERR_DELAY);
455 /* Return NFS4ERR_BADSESSION if we're draining the session
456 * in order to reset it.
457 */
458 if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
459 status = htonl(NFS4ERR_BADSESSION);
434 goto out; 460 goto out;
435 } 461 }
436 462
437 status = validate_seqid(&clp->cl_session->bc_slot_table, args); 463 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
464 spin_unlock(&tbl->slot_tbl_lock);
438 if (status) 465 if (status)
439 goto out; 466 goto out;
440 467
468 cps->slotid = args->csa_slotid;
469
441 /* 470 /*
442 * Check for pending referring calls. If a match is found, a 471 * Check for pending referring calls. If a match is found, a
443 * related callback was received before the response to the original 472 * related callback was received before the response to the original
@@ -454,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
454 res->csr_slotid = args->csa_slotid; 483 res->csr_slotid = args->csa_slotid;
455 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 484 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
456 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 485 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
457 nfs4_cb_take_slot(clp);
458 486
459out: 487out:
460 cps->clp = clp; /* put in nfs4_callback_compound */ 488 cps->clp = clp; /* put in nfs4_callback_compound */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c6c86a77e04..918ad647afe 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
754 * Let the state manager know callback processing done. 754 * Let the state manager know callback processing done.
755 * A single slot, so highest used slotid is either 0 or -1 755 * A single slot, so highest used slotid is either 0 or -1
756 */ 756 */
757 tbl->highest_used_slotid--; 757 tbl->highest_used_slotid = -1;
758 nfs4_check_drain_bc_complete(session); 758 nfs4_check_drain_bc_complete(session);
759 spin_unlock(&tbl->slot_tbl_lock); 759 spin_unlock(&tbl->slot_tbl_lock);
760} 760}
761 761
762static void nfs4_cb_free_slot(struct nfs_client *clp) 762static void nfs4_cb_free_slot(struct cb_process_state *cps)
763{ 763{
764 if (clp && clp->cl_session) 764 if (cps->slotid != -1)
765 nfs4_callback_free_slot(clp->cl_session); 765 nfs4_callback_free_slot(cps->clp->cl_session);
766}
767
768/* A single slot, so highest used slotid is either 0 or -1 */
769void nfs4_cb_take_slot(struct nfs_client *clp)
770{
771 struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
772
773 spin_lock(&tbl->slot_tbl_lock);
774 tbl->highest_used_slotid++;
775 BUG_ON(tbl->highest_used_slotid != 0);
776 spin_unlock(&tbl->slot_tbl_lock);
777} 766}
778 767
779#else /* CONFIG_NFS_V4_1 */ 768#else /* CONFIG_NFS_V4_1 */
@@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
784 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 773 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
785} 774}
786 775
787static void nfs4_cb_free_slot(struct nfs_client *clp) 776static void nfs4_cb_free_slot(struct cb_process_state *cps)
788{ 777{
789} 778}
790#endif /* CONFIG_NFS_V4_1 */ 779#endif /* CONFIG_NFS_V4_1 */
@@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
866 struct cb_process_state cps = { 855 struct cb_process_state cps = {
867 .drc_status = 0, 856 .drc_status = 0,
868 .clp = NULL, 857 .clp = NULL,
858 .slotid = -1,
869 }; 859 };
870 unsigned int nops = 0; 860 unsigned int nops = 0;
871 861
@@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
906 896
907 *hdr_res.status = status; 897 *hdr_res.status = status;
908 *hdr_res.nops = htonl(nops); 898 *hdr_res.nops = htonl(nops);
909 nfs4_cb_free_slot(cps.clp); 899 nfs4_cb_free_slot(&cps);
910 nfs_put_client(cps.clp); 900 nfs_put_client(cps.clp);
911 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 901 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
912 return rpc_success; 902 return rpc_success;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b3dc2b88b65..5833fbbf59b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = {
105 .nrvers = ARRAY_SIZE(nfs_version), 105 .nrvers = ARRAY_SIZE(nfs_version),
106 .version = nfs_version, 106 .version = nfs_version,
107 .stats = &nfs_rpcstat, 107 .stats = &nfs_rpcstat,
108 .pipe_dir_name = "/nfs", 108 .pipe_dir_name = NFS_PIPE_DIRNAME,
109}; 109};
110 110
111struct rpc_stat nfs_rpcstat = { 111struct rpc_stat nfs_rpcstat = {
@@ -188,9 +188,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
188 cred = rpc_lookup_machine_cred(); 188 cred = rpc_lookup_machine_cred();
189 if (!IS_ERR(cred)) 189 if (!IS_ERR(cred))
190 clp->cl_machine_cred = cred; 190 clp->cl_machine_cred = cred;
191#if defined(CONFIG_NFS_V4_1)
192 INIT_LIST_HEAD(&clp->cl_layouts);
193#endif
194 nfs_fscache_get_client_cookie(clp); 191 nfs_fscache_get_client_cookie(clp);
195 192
196 return clp; 193 return clp;
@@ -293,6 +290,7 @@ static void nfs_free_client(struct nfs_client *clp)
293 nfs4_deviceid_purge_client(clp); 290 nfs4_deviceid_purge_client(clp);
294 291
295 kfree(clp->cl_hostname); 292 kfree(clp->cl_hostname);
293 kfree(clp->server_scope);
296 kfree(clp); 294 kfree(clp);
297 295
298 dprintk("<-- nfs_free_client()\n"); 296 dprintk("<-- nfs_free_client()\n");
@@ -906,7 +904,9 @@ error:
906/* 904/*
907 * Load up the server record from information gained in an fsinfo record 905 * Load up the server record from information gained in an fsinfo record
908 */ 906 */
909static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) 907static void nfs_server_set_fsinfo(struct nfs_server *server,
908 struct nfs_fh *mntfh,
909 struct nfs_fsinfo *fsinfo)
910{ 910{
911 unsigned long max_rpc_payload; 911 unsigned long max_rpc_payload;
912 912
@@ -936,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
936 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 936 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
937 server->wsize = NFS_MAX_FILE_IO_SIZE; 937 server->wsize = NFS_MAX_FILE_IO_SIZE;
938 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 938 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
939 set_pnfs_layoutdriver(server, fsinfo->layouttype); 939 server->pnfs_blksize = fsinfo->blksize;
940 set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
940 941
941 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 942 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
942 943
@@ -982,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
982 if (error < 0) 983 if (error < 0)
983 goto out_error; 984 goto out_error;
984 985
985 nfs_server_set_fsinfo(server, &fsinfo); 986 nfs_server_set_fsinfo(server, mntfh, &fsinfo);
986 987
987 /* Get some general file system info */ 988 /* Get some general file system info */
988 if (server->namelen == 0) { 989 if (server->namelen == 0) {
@@ -1062,6 +1063,7 @@ static struct nfs_server *nfs_alloc_server(void)
1062 INIT_LIST_HEAD(&server->client_link); 1063 INIT_LIST_HEAD(&server->client_link);
1063 INIT_LIST_HEAD(&server->master_link); 1064 INIT_LIST_HEAD(&server->master_link);
1064 INIT_LIST_HEAD(&server->delegations); 1065 INIT_LIST_HEAD(&server->delegations);
1066 INIT_LIST_HEAD(&server->layouts);
1065 1067
1066 atomic_set(&server->active, 0); 1068 atomic_set(&server->active, 0);
1067 1069
@@ -1464,7 +1466,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1464 dprintk("<-- %s %p\n", __func__, clp); 1466 dprintk("<-- %s %p\n", __func__, clp);
1465 return clp; 1467 return clp;
1466} 1468}
1467EXPORT_SYMBOL(nfs4_set_ds_client); 1469EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
1468 1470
1469/* 1471/*
1470 * Session has been established, and the client marked ready. 1472 * Session has been established, and the client marked ready.
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index dd25c2aec37..321a66bc384 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -398,12 +398,11 @@ int nfs_inode_return_delegation(struct inode *inode)
398 return err; 398 return err;
399} 399}
400 400
401static void nfs_mark_return_delegation(struct nfs_delegation *delegation) 401static void nfs_mark_return_delegation(struct nfs_server *server,
402 struct nfs_delegation *delegation)
402{ 403{
403 struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
404
405 set_bit(NFS_DELEGATION_RETURN, &delegation->flags); 404 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
406 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); 405 set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
407} 406}
408 407
409/** 408/**
@@ -441,7 +440,7 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
441 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 440 if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
442 continue; 441 continue;
443 if (delegation->type & flags) 442 if (delegation->type & flags)
444 nfs_mark_return_delegation(delegation); 443 nfs_mark_return_delegation(server, delegation);
445 } 444 }
446} 445}
447 446
@@ -508,7 +507,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
508 list_for_each_entry_rcu(delegation, &server->delegations, super_list) { 507 list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
509 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) 508 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
510 continue; 509 continue;
511 nfs_mark_return_delegation(delegation); 510 nfs_mark_return_delegation(server, delegation);
512 } 511 }
513} 512}
514 513
@@ -539,7 +538,8 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
539int nfs_async_inode_return_delegation(struct inode *inode, 538int nfs_async_inode_return_delegation(struct inode *inode,
540 const nfs4_stateid *stateid) 539 const nfs4_stateid *stateid)
541{ 540{
542 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 541 struct nfs_server *server = NFS_SERVER(inode);
542 struct nfs_client *clp = server->nfs_client;
543 struct nfs_delegation *delegation; 543 struct nfs_delegation *delegation;
544 544
545 rcu_read_lock(); 545 rcu_read_lock();
@@ -549,7 +549,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,
549 rcu_read_unlock(); 549 rcu_read_unlock();
550 return -ENOENT; 550 return -ENOENT;
551 } 551 }
552 nfs_mark_return_delegation(delegation); 552 nfs_mark_return_delegation(server, delegation);
553 rcu_read_unlock(); 553 rcu_read_unlock();
554 554
555 nfs_delegation_run_state_manager(clp); 555 nfs_delegation_run_state_manager(clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ededdbd0db3..ac289909814 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -56,7 +56,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
56static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 56static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
57static int nfs_rename(struct inode *, struct dentry *, 57static int nfs_rename(struct inode *, struct dentry *,
58 struct inode *, struct dentry *); 58 struct inode *, struct dentry *);
59static int nfs_fsync_dir(struct file *, int); 59static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
60static loff_t nfs_llseek_dir(struct file *, loff_t, int); 60static loff_t nfs_llseek_dir(struct file *, loff_t, int);
61static void nfs_readdir_clear_array(struct page*); 61static void nfs_readdir_clear_array(struct page*);
62 62
@@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = {
134 134
135#endif /* CONFIG_NFS_V4 */ 135#endif /* CONFIG_NFS_V4 */
136 136
137static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) 137static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
138{ 138{
139 struct nfs_open_dir_context *ctx; 139 struct nfs_open_dir_context *ctx;
140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
141 if (ctx != NULL) { 141 if (ctx != NULL) {
142 ctx->duped = 0; 142 ctx->duped = 0;
143 ctx->attr_gencount = NFS_I(dir)->attr_gencount;
143 ctx->dir_cookie = 0; 144 ctx->dir_cookie = 0;
144 ctx->dup_cookie = 0; 145 ctx->dup_cookie = 0;
145 ctx->cred = get_rpccred(cred); 146 ctx->cred = get_rpccred(cred);
146 } else 147 return ctx;
147 ctx = ERR_PTR(-ENOMEM); 148 }
148 return ctx; 149 return ERR_PTR(-ENOMEM);
149} 150}
150 151
151static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) 152static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
@@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
173 cred = rpc_lookup_cred(); 174 cred = rpc_lookup_cred();
174 if (IS_ERR(cred)) 175 if (IS_ERR(cred))
175 return PTR_ERR(cred); 176 return PTR_ERR(cred);
176 ctx = alloc_nfs_open_dir_context(cred); 177 ctx = alloc_nfs_open_dir_context(inode, cred);
177 if (IS_ERR(ctx)) { 178 if (IS_ERR(ctx)) {
178 res = PTR_ERR(ctx); 179 res = PTR_ERR(ctx);
179 goto out; 180 goto out;
@@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
323{ 324{
324 loff_t diff = desc->file->f_pos - desc->current_index; 325 loff_t diff = desc->file->f_pos - desc->current_index;
325 unsigned int index; 326 unsigned int index;
326 struct nfs_open_dir_context *ctx = desc->file->private_data;
327 327
328 if (diff < 0) 328 if (diff < 0)
329 goto out_eof; 329 goto out_eof;
@@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
336 index = (unsigned int)diff; 336 index = (unsigned int)diff;
337 *desc->dir_cookie = array->array[index].cookie; 337 *desc->dir_cookie = array->array[index].cookie;
338 desc->cache_entry_index = index; 338 desc->cache_entry_index = index;
339 ctx->duped = 0;
340 return 0; 339 return 0;
341out_eof: 340out_eof:
342 desc->eof = 1; 341 desc->eof = 1;
@@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
349 int i; 348 int i;
350 loff_t new_pos; 349 loff_t new_pos;
351 int status = -EAGAIN; 350 int status = -EAGAIN;
352 struct nfs_open_dir_context *ctx = desc->file->private_data;
353 351
354 for (i = 0; i < array->size; i++) { 352 for (i = 0; i < array->size; i++) {
355 if (array->array[i].cookie == *desc->dir_cookie) { 353 if (array->array[i].cookie == *desc->dir_cookie) {
354 struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
355 struct nfs_open_dir_context *ctx = desc->file->private_data;
356
356 new_pos = desc->current_index + i; 357 new_pos = desc->current_index + i;
357 if (new_pos < desc->file->f_pos) { 358 if (ctx->attr_gencount != nfsi->attr_gencount
359 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
360 ctx->duped = 0;
361 ctx->attr_gencount = nfsi->attr_gencount;
362 } else if (new_pos < desc->file->f_pos) {
363 if (ctx->duped > 0
364 && ctx->dup_cookie == *desc->dir_cookie) {
365 if (printk_ratelimit()) {
366 pr_notice("NFS: directory %s/%s contains a readdir loop."
367 "Please contact your server vendor. "
368 "The file: %s has duplicate cookie %llu\n",
369 desc->file->f_dentry->d_parent->d_name.name,
370 desc->file->f_dentry->d_name.name,
371 array->array[i].string.name,
372 *desc->dir_cookie);
373 }
374 status = -ELOOP;
375 goto out;
376 }
358 ctx->dup_cookie = *desc->dir_cookie; 377 ctx->dup_cookie = *desc->dir_cookie;
359 ctx->duped = 1; 378 ctx->duped = -1;
360 } 379 }
361 desc->file->f_pos = new_pos; 380 desc->file->f_pos = new_pos;
362 desc->cache_entry_index = i; 381 desc->cache_entry_index = i;
@@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
368 if (*desc->dir_cookie == array->last_cookie) 387 if (*desc->dir_cookie == array->last_cookie)
369 desc->eof = 1; 388 desc->eof = 1;
370 } 389 }
390out:
371 return status; 391 return status;
372} 392}
373 393
@@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
740 struct nfs_cache_array *array = NULL; 760 struct nfs_cache_array *array = NULL;
741 struct nfs_open_dir_context *ctx = file->private_data; 761 struct nfs_open_dir_context *ctx = file->private_data;
742 762
743 if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
744 if (printk_ratelimit()) {
745 pr_notice("NFS: directory %s/%s contains a readdir loop. "
746 "Please contact your server vendor. "
747 "Offending cookie: %llu\n",
748 file->f_dentry->d_parent->d_name.name,
749 file->f_dentry->d_name.name,
750 *desc->dir_cookie);
751 }
752 res = -ELOOP;
753 goto out;
754 }
755
756 array = nfs_readdir_get_array(desc->page); 763 array = nfs_readdir_get_array(desc->page);
757 if (IS_ERR(array)) { 764 if (IS_ERR(array)) {
758 res = PTR_ERR(array); 765 res = PTR_ERR(array);
@@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
774 *desc->dir_cookie = array->array[i+1].cookie; 781 *desc->dir_cookie = array->array[i+1].cookie;
775 else 782 else
776 *desc->dir_cookie = array->last_cookie; 783 *desc->dir_cookie = array->last_cookie;
784 if (ctx->duped != 0)
785 ctx->duped = 1;
777 } 786 }
778 if (array->eof_index >= 0) 787 if (array->eof_index >= 0)
779 desc->eof = 1; 788 desc->eof = 1;
@@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
805 struct page *page = NULL; 814 struct page *page = NULL;
806 int status; 815 int status;
807 struct inode *inode = desc->file->f_path.dentry->d_inode; 816 struct inode *inode = desc->file->f_path.dentry->d_inode;
817 struct nfs_open_dir_context *ctx = desc->file->private_data;
808 818
809 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 819 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
810 (unsigned long long)*desc->dir_cookie); 820 (unsigned long long)*desc->dir_cookie);
@@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
818 desc->page_index = 0; 828 desc->page_index = 0;
819 desc->last_cookie = *desc->dir_cookie; 829 desc->last_cookie = *desc->dir_cookie;
820 desc->page = page; 830 desc->page = page;
831 ctx->duped = 0;
821 832
822 status = nfs_readdir_xdr_to_array(desc, page, inode); 833 status = nfs_readdir_xdr_to_array(desc, page, inode);
823 if (status < 0) 834 if (status < 0)
@@ -945,15 +956,19 @@ out:
945 * All directory operations under NFS are synchronous, so fsync() 956 * All directory operations under NFS are synchronous, so fsync()
946 * is a dummy operation. 957 * is a dummy operation.
947 */ 958 */
948static int nfs_fsync_dir(struct file *filp, int datasync) 959static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
960 int datasync)
949{ 961{
950 struct dentry *dentry = filp->f_path.dentry; 962 struct dentry *dentry = filp->f_path.dentry;
963 struct inode *inode = dentry->d_inode;
951 964
952 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 965 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
953 dentry->d_parent->d_name.name, dentry->d_name.name, 966 dentry->d_parent->d_name.name, dentry->d_name.name,
954 datasync); 967 datasync);
955 968
969 mutex_lock(&inode->i_mutex);
956 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); 970 nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
971 mutex_unlock(&inode->i_mutex);
957 return 0; 972 return 0;
958} 973}
959 974
@@ -997,14 +1012,12 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
997 * Return the intent data that applies to this particular path component 1012 * Return the intent data that applies to this particular path component
998 * 1013 *
999 * Note that the current set of intents only apply to the very last 1014 * Note that the current set of intents only apply to the very last
1000 * component of the path. 1015 * component of the path and none of them is set before that last
1001 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. 1016 * component.
1002 */ 1017 */
1003static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, 1018static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
1004 unsigned int mask) 1019 unsigned int mask)
1005{ 1020{
1006 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
1007 return 0;
1008 return nd->flags & mask; 1021 return nd->flags & mask;
1009} 1022}
1010 1023
@@ -1338,25 +1351,31 @@ static int is_atomic_open(struct nameidata *nd)
1338 return 0; 1351 return 0;
1339 /* Are we trying to write to a read only partition? */ 1352 /* Are we trying to write to a read only partition? */
1340 if (__mnt_is_readonly(nd->path.mnt) && 1353 if (__mnt_is_readonly(nd->path.mnt) &&
1341 (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) 1354 (nd->intent.open.flags & (O_CREAT|O_TRUNC|O_ACCMODE)))
1342 return 0; 1355 return 0;
1343 return 1; 1356 return 1;
1344} 1357}
1345 1358
1346static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd) 1359static fmode_t flags_to_mode(int flags)
1360{
1361 fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
1362 if ((flags & O_ACCMODE) != O_WRONLY)
1363 res |= FMODE_READ;
1364 if ((flags & O_ACCMODE) != O_RDONLY)
1365 res |= FMODE_WRITE;
1366 return res;
1367}
1368
1369static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
1347{ 1370{
1348 struct path path = {
1349 .mnt = nd->path.mnt,
1350 .dentry = dentry,
1351 };
1352 struct nfs_open_context *ctx; 1371 struct nfs_open_context *ctx;
1353 struct rpc_cred *cred; 1372 struct rpc_cred *cred;
1354 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); 1373 fmode_t fmode = flags_to_mode(open_flags);
1355 1374
1356 cred = rpc_lookup_cred(); 1375 cred = rpc_lookup_cred();
1357 if (IS_ERR(cred)) 1376 if (IS_ERR(cred))
1358 return ERR_CAST(cred); 1377 return ERR_CAST(cred);
1359 ctx = alloc_nfs_open_context(&path, cred, fmode); 1378 ctx = alloc_nfs_open_context(dentry, cred, fmode);
1360 put_rpccred(cred); 1379 put_rpccred(cred);
1361 if (ctx == NULL) 1380 if (ctx == NULL)
1362 return ERR_PTR(-ENOMEM); 1381 return ERR_PTR(-ENOMEM);
@@ -1376,13 +1395,13 @@ static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ct
1376 1395
1377 /* If the open_intent is for execute, we have an extra check to make */ 1396 /* If the open_intent is for execute, we have an extra check to make */
1378 if (ctx->mode & FMODE_EXEC) { 1397 if (ctx->mode & FMODE_EXEC) {
1379 ret = nfs_may_open(ctx->path.dentry->d_inode, 1398 ret = nfs_may_open(ctx->dentry->d_inode,
1380 ctx->cred, 1399 ctx->cred,
1381 nd->intent.open.flags); 1400 nd->intent.open.flags);
1382 if (ret < 0) 1401 if (ret < 0)
1383 goto out; 1402 goto out;
1384 } 1403 }
1385 filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open); 1404 filp = lookup_instantiate_filp(nd, ctx->dentry, do_open);
1386 if (IS_ERR(filp)) 1405 if (IS_ERR(filp))
1387 ret = PTR_ERR(filp); 1406 ret = PTR_ERR(filp);
1388 else 1407 else
@@ -1420,12 +1439,13 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1420 goto out; 1439 goto out;
1421 } 1440 }
1422 1441
1423 ctx = nameidata_to_nfs_open_context(dentry, nd); 1442 open_flags = nd->intent.open.flags;
1443
1444 ctx = create_nfs_open_context(dentry, open_flags);
1424 res = ERR_CAST(ctx); 1445 res = ERR_CAST(ctx);
1425 if (IS_ERR(ctx)) 1446 if (IS_ERR(ctx))
1426 goto out; 1447 goto out;
1427 1448
1428 open_flags = nd->intent.open.flags;
1429 if (nd->flags & LOOKUP_CREATE) { 1449 if (nd->flags & LOOKUP_CREATE) {
1430 attr.ia_mode = nd->intent.open.create_mode; 1450 attr.ia_mode = nd->intent.open.create_mode;
1431 attr.ia_valid = ATTR_MODE; 1451 attr.ia_valid = ATTR_MODE;
@@ -1448,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1448 res = NULL; 1468 res = NULL;
1449 goto out; 1469 goto out;
1450 /* This turned out not to be a regular file */ 1470 /* This turned out not to be a regular file */
1471 case -EISDIR:
1451 case -ENOTDIR: 1472 case -ENOTDIR:
1452 goto no_open; 1473 goto no_open;
1453 case -ELOOP: 1474 case -ELOOP:
1454 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1475 if (!(nd->intent.open.flags & O_NOFOLLOW))
1455 goto no_open; 1476 goto no_open;
1456 /* case -EISDIR: */
1457 /* case -EINVAL: */ 1477 /* case -EINVAL: */
1458 default: 1478 default:
1459 res = ERR_CAST(inode); 1479 res = ERR_CAST(inode);
@@ -1463,8 +1483,8 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1463 res = d_add_unique(dentry, inode); 1483 res = d_add_unique(dentry, inode);
1464 nfs_unblock_sillyrename(dentry->d_parent); 1484 nfs_unblock_sillyrename(dentry->d_parent);
1465 if (res != NULL) { 1485 if (res != NULL) {
1466 dput(ctx->path.dentry); 1486 dput(ctx->dentry);
1467 ctx->path.dentry = dget(res); 1487 ctx->dentry = dget(res);
1468 dentry = res; 1488 dentry = res;
1469 } 1489 }
1470 err = nfs_intent_set_file(nd, ctx); 1490 err = nfs_intent_set_file(nd, ctx);
@@ -1517,7 +1537,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1517 /* We can't create new files, or truncate existing ones here */ 1537 /* We can't create new files, or truncate existing ones here */
1518 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); 1538 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1519 1539
1520 ctx = nameidata_to_nfs_open_context(dentry, nd); 1540 ctx = create_nfs_open_context(dentry, openflags);
1521 ret = PTR_ERR(ctx); 1541 ret = PTR_ERR(ctx);
1522 if (IS_ERR(ctx)) 1542 if (IS_ERR(ctx))
1523 goto out; 1543 goto out;
@@ -1570,7 +1590,7 @@ static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
1570 struct nfs_open_context *ctx = NULL; 1590 struct nfs_open_context *ctx = NULL;
1571 struct iattr attr; 1591 struct iattr attr;
1572 int error; 1592 int error;
1573 int open_flags = 0; 1593 int open_flags = O_CREAT|O_EXCL;
1574 1594
1575 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1595 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1576 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1596 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1578,27 +1598,27 @@ static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode,
1578 attr.ia_mode = mode; 1598 attr.ia_mode = mode;
1579 attr.ia_valid = ATTR_MODE; 1599 attr.ia_valid = ATTR_MODE;
1580 1600
1581 if ((nd->flags & LOOKUP_CREATE) != 0) { 1601 if (nd)
1582 open_flags = nd->intent.open.flags; 1602 open_flags = nd->intent.open.flags;
1583 1603
1584 ctx = nameidata_to_nfs_open_context(dentry, nd); 1604 ctx = create_nfs_open_context(dentry, open_flags);
1585 error = PTR_ERR(ctx); 1605 error = PTR_ERR(ctx);
1586 if (IS_ERR(ctx)) 1606 if (IS_ERR(ctx))
1587 goto out_err_drop; 1607 goto out_err_drop;
1588 }
1589 1608
1590 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); 1609 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
1591 if (error != 0) 1610 if (error != 0)
1592 goto out_put_ctx; 1611 goto out_put_ctx;
1593 if (ctx != NULL) { 1612 if (nd) {
1594 error = nfs_intent_set_file(nd, ctx); 1613 error = nfs_intent_set_file(nd, ctx);
1595 if (error < 0) 1614 if (error < 0)
1596 goto out_err; 1615 goto out_err;
1616 } else {
1617 put_nfs_open_context(ctx);
1597 } 1618 }
1598 return 0; 1619 return 0;
1599out_put_ctx: 1620out_put_ctx:
1600 if (ctx != NULL) 1621 put_nfs_open_context(ctx);
1601 put_nfs_open_context(ctx);
1602out_err_drop: 1622out_err_drop:
1603 d_drop(dentry); 1623 d_drop(dentry);
1604out_err: 1624out_err:
@@ -1660,7 +1680,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1660{ 1680{
1661 struct iattr attr; 1681 struct iattr attr;
1662 int error; 1682 int error;
1663 int open_flags = 0; 1683 int open_flags = O_CREAT|O_EXCL;
1664 1684
1665 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1685 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1666 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1686 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1668,7 +1688,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1668 attr.ia_mode = mode; 1688 attr.ia_mode = mode;
1669 attr.ia_valid = ATTR_MODE; 1689 attr.ia_valid = ATTR_MODE;
1670 1690
1671 if ((nd->flags & LOOKUP_CREATE) != 0) 1691 if (nd)
1672 open_flags = nd->intent.open.flags; 1692 open_flags = nd->intent.open.flags;
1673 1693
1674 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); 1694 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
@@ -2259,11 +2279,11 @@ static int nfs_open_permission_mask(int openflags)
2259{ 2279{
2260 int mask = 0; 2280 int mask = 0;
2261 2281
2262 if (openflags & FMODE_READ) 2282 if ((openflags & O_ACCMODE) != O_WRONLY)
2263 mask |= MAY_READ; 2283 mask |= MAY_READ;
2264 if (openflags & FMODE_WRITE) 2284 if ((openflags & O_ACCMODE) != O_RDONLY)
2265 mask |= MAY_WRITE; 2285 mask |= MAY_WRITE;
2266 if (openflags & FMODE_EXEC) 2286 if (openflags & __FMODE_EXEC)
2267 mask |= MAY_EXEC; 2287 mask |= MAY_EXEC;
2268 return mask; 2288 return mask;
2269} 2289}
@@ -2273,12 +2293,12 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
2273 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 2293 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
2274} 2294}
2275 2295
2276int nfs_permission(struct inode *inode, int mask, unsigned int flags) 2296int nfs_permission(struct inode *inode, int mask)
2277{ 2297{
2278 struct rpc_cred *cred; 2298 struct rpc_cred *cred;
2279 int res = 0; 2299 int res = 0;
2280 2300
2281 if (flags & IPERM_FLAG_RCU) 2301 if (mask & MAY_NOT_BLOCK)
2282 return -ECHILD; 2302 return -ECHILD;
2283 2303
2284 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2304 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
@@ -2328,7 +2348,7 @@ out:
2328out_notsup: 2348out_notsup:
2329 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2349 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
2330 if (res == 0) 2350 if (res == 0)
2331 res = generic_permission(inode, mask, flags, NULL); 2351 res = generic_permission(inode, mask);
2332 goto out; 2352 goto out;
2333} 2353}
2334 2354
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8eea2536671..1940f1a56a5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -53,7 +53,7 @@
53 53
54#include <asm/system.h> 54#include <asm/system.h>
55#include <asm/uaccess.h> 55#include <asm/uaccess.h>
56#include <asm/atomic.h> 56#include <linux/atomic.h>
57 57
58#include "internal.h" 58#include "internal.h"
59#include "iostat.h" 59#include "iostat.h"
@@ -284,7 +284,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
284 loff_t pos) 284 loff_t pos)
285{ 285{
286 struct nfs_open_context *ctx = dreq->ctx; 286 struct nfs_open_context *ctx = dreq->ctx;
287 struct inode *inode = ctx->path.dentry->d_inode; 287 struct inode *inode = ctx->dentry->d_inode;
288 unsigned long user_addr = (unsigned long)iov->iov_base; 288 unsigned long user_addr = (unsigned long)iov->iov_base;
289 size_t count = iov->iov_len; 289 size_t count = iov->iov_len;
290 size_t rsize = NFS_SERVER(inode)->rsize; 290 size_t rsize = NFS_SERVER(inode)->rsize;
@@ -715,7 +715,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
715 loff_t pos, int sync) 715 loff_t pos, int sync)
716{ 716{
717 struct nfs_open_context *ctx = dreq->ctx; 717 struct nfs_open_context *ctx = dreq->ctx;
718 struct inode *inode = ctx->path.dentry->d_inode; 718 struct inode *inode = ctx->dentry->d_inode;
719 unsigned long user_addr = (unsigned long)iov->iov_base; 719 unsigned long user_addr = (unsigned long)iov->iov_base;
720 size_t count = iov->iov_len; 720 size_t count = iov->iov_len;
721 struct rpc_task *task; 721 struct rpc_task *task;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2f093ed1698..b76be2fb573 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -55,7 +55,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
55static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 55static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
56 unsigned long nr_segs, loff_t pos); 56 unsigned long nr_segs, loff_t pos);
57static int nfs_file_flush(struct file *, fl_owner_t id); 57static int nfs_file_flush(struct file *, fl_owner_t id);
58static int nfs_file_fsync(struct file *, int datasync); 58static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
59static int nfs_check_flags(int flags); 59static int nfs_check_flags(int flags);
60static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 60static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
61static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 61static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -187,8 +187,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
187 filp->f_path.dentry->d_name.name, 187 filp->f_path.dentry->d_name.name,
188 offset, origin); 188 offset, origin);
189 189
190 /* origin == SEEK_END => we must revalidate the cached file length */ 190 /*
191 if (origin == SEEK_END) { 191 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
192 * the cached file length
193 */
194 if (origin != SEEK_SET && origin != SEEK_CUR) {
192 struct inode *inode = filp->f_mapping->host; 195 struct inode *inode = filp->f_mapping->host;
193 196
194 int retval = nfs_revalidate_file_size(inode, filp); 197 int retval = nfs_revalidate_file_size(inode, filp);
@@ -305,7 +308,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
305 * fall back to doing a synchronous write. 308 * fall back to doing a synchronous write.
306 */ 309 */
307static int 310static int
308nfs_file_fsync(struct file *file, int datasync) 311nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
309{ 312{
310 struct dentry *dentry = file->f_path.dentry; 313 struct dentry *dentry = file->f_path.dentry;
311 struct nfs_open_context *ctx = nfs_file_open_context(file); 314 struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -313,14 +316,18 @@ nfs_file_fsync(struct file *file, int datasync)
313 int have_error, status; 316 int have_error, status;
314 int ret = 0; 317 int ret = 0;
315 318
316
317 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 319 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
318 dentry->d_parent->d_name.name, dentry->d_name.name, 320 dentry->d_parent->d_name.name, dentry->d_name.name,
319 datasync); 321 datasync);
320 322
323 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
324 mutex_lock(&inode->i_mutex);
325
321 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 326 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
322 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 327 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
323 status = nfs_commit_inode(inode, FLUSH_SYNC); 328 status = nfs_commit_inode(inode, FLUSH_SYNC);
329 if (status >= 0 && ret < 0)
330 status = ret;
324 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 331 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
325 if (have_error) 332 if (have_error)
326 ret = xchg(&ctx->error, 0); 333 ret = xchg(&ctx->error, 0);
@@ -329,6 +336,7 @@ nfs_file_fsync(struct file *file, int datasync)
329 if (!ret && !datasync) 336 if (!ret && !datasync)
330 /* application has asked for meta-data sync */ 337 /* application has asked for meta-data sync */
331 ret = pnfs_layoutcommit_inode(inode, true); 338 ret = pnfs_layoutcommit_inode(inode, true);
339 mutex_unlock(&inode->i_mutex);
332 return ret; 340 return ret;
333} 341}
334 342
@@ -887,3 +895,35 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
887 file->f_path.dentry->d_name.name, arg); 895 file->f_path.dentry->d_name.name, arg);
888 return -EINVAL; 896 return -EINVAL;
889} 897}
898
899#ifdef CONFIG_NFS_V4
900static int
901nfs4_file_open(struct inode *inode, struct file *filp)
902{
903 /*
904 * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
905 * this point, then something is very wrong
906 */
907 dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
908 return -ENOTDIR;
909}
910
911const struct file_operations nfs4_file_operations = {
912 .llseek = nfs_file_llseek,
913 .read = do_sync_read,
914 .write = do_sync_write,
915 .aio_read = nfs_file_read,
916 .aio_write = nfs_file_write,
917 .mmap = nfs_file_mmap,
918 .open = nfs4_file_open,
919 .flush = nfs_file_flush,
920 .release = nfs_file_release,
921 .fsync = nfs_file_fsync,
922 .lock = nfs_lock,
923 .flock = nfs_flock,
924 .splice_read = nfs_file_splice_read,
925 .splice_write = nfs_file_splice_write,
926 .check_flags = nfs_check_flags,
927 .setlease = nfs_setlease,
928};
929#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 79664a1025a..47d1c6ff2d8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -36,6 +36,8 @@
36#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/kernel.h> 38#include <linux/kernel.h>
39#include <linux/slab.h>
40#include <linux/nfs_idmap.h>
39 41
40static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) 42static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
41{ 43{
@@ -59,12 +61,10 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
59 61
60#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 62#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
61 63
62#include <linux/slab.h>
63#include <linux/cred.h> 64#include <linux/cred.h>
64#include <linux/sunrpc/sched.h> 65#include <linux/sunrpc/sched.h>
65#include <linux/nfs4.h> 66#include <linux/nfs4.h>
66#include <linux/nfs_fs_sb.h> 67#include <linux/nfs_fs_sb.h>
67#include <linux/nfs_idmap.h>
68#include <linux/keyctl.h> 68#include <linux/keyctl.h>
69#include <linux/key-type.h> 69#include <linux/key-type.h>
70#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
@@ -284,18 +284,15 @@ int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf,
284#include <linux/module.h> 284#include <linux/module.h>
285#include <linux/mutex.h> 285#include <linux/mutex.h>
286#include <linux/init.h> 286#include <linux/init.h>
287#include <linux/slab.h>
288#include <linux/socket.h> 287#include <linux/socket.h>
289#include <linux/in.h> 288#include <linux/in.h>
290#include <linux/sched.h> 289#include <linux/sched.h>
291
292#include <linux/sunrpc/clnt.h> 290#include <linux/sunrpc/clnt.h>
293#include <linux/workqueue.h> 291#include <linux/workqueue.h>
294#include <linux/sunrpc/rpc_pipe_fs.h> 292#include <linux/sunrpc/rpc_pipe_fs.h>
295 293
296#include <linux/nfs_fs.h> 294#include <linux/nfs_fs.h>
297 295
298#include <linux/nfs_idmap.h>
299#include "nfs4_fs.h" 296#include "nfs4_fs.h"
300 297
301#define IDMAP_HASH_SZ 128 298#define IDMAP_HASH_SZ 128
@@ -339,8 +336,6 @@ struct idmap {
339 struct idmap_hashtable idmap_group_hash; 336 struct idmap_hashtable idmap_group_hash;
340}; 337};
341 338
342static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
343 char __user *, size_t);
344static ssize_t idmap_pipe_downcall(struct file *, const char __user *, 339static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
345 size_t); 340 size_t);
346static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); 341static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
@@ -348,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
348static unsigned int fnvhash32(const void *, size_t); 343static unsigned int fnvhash32(const void *, size_t);
349 344
350static const struct rpc_pipe_ops idmap_upcall_ops = { 345static const struct rpc_pipe_ops idmap_upcall_ops = {
351 .upcall = idmap_pipe_upcall, 346 .upcall = rpc_pipe_generic_upcall,
352 .downcall = idmap_pipe_downcall, 347 .downcall = idmap_pipe_downcall,
353 .destroy_msg = idmap_pipe_destroy_msg, 348 .destroy_msg = idmap_pipe_destroy_msg,
354}; 349};
@@ -598,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
598 return ret; 593 return ret;
599} 594}
600 595
601/* RPC pipefs upcall/downcall routines */
602static ssize_t
603idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
604 char __user *dst, size_t buflen)
605{
606 char *data = (char *)msg->data + msg->copied;
607 size_t mlen = min(msg->len, buflen);
608 unsigned long left;
609
610 left = copy_to_user(dst, data, mlen);
611 if (left == mlen) {
612 msg->errno = -EFAULT;
613 return -EFAULT;
614 }
615
616 mlen -= left;
617 msg->copied += mlen;
618 msg->errno = 0;
619 return mlen;
620}
621
622static ssize_t 596static ssize_t
623idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) 597idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
624{ 598{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6f4850deb27..679d2f50b14 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 */ 291 */
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; 292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
293 if (S_ISREG(inode->i_mode)) { 293 if (S_ISREG(inode->i_mode)) {
294 inode->i_fop = &nfs_file_operations; 294 inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
295 inode->i_data.a_ops = &nfs_file_aops; 295 inode->i_data.a_ops = &nfs_file_aops;
296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; 296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
297 } else if (S_ISDIR(inode->i_mode)) { 297 } else if (S_ISDIR(inode->i_mode)) {
@@ -567,7 +567,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
567struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) 567struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
568{ 568{
569 struct nfs_lock_context *res, *new = NULL; 569 struct nfs_lock_context *res, *new = NULL;
570 struct inode *inode = ctx->path.dentry->d_inode; 570 struct inode *inode = ctx->dentry->d_inode;
571 571
572 spin_lock(&inode->i_lock); 572 spin_lock(&inode->i_lock);
573 res = __nfs_find_lock_context(ctx); 573 res = __nfs_find_lock_context(ctx);
@@ -594,7 +594,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
594void nfs_put_lock_context(struct nfs_lock_context *l_ctx) 594void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
595{ 595{
596 struct nfs_open_context *ctx = l_ctx->open_context; 596 struct nfs_open_context *ctx = l_ctx->open_context;
597 struct inode *inode = ctx->path.dentry->d_inode; 597 struct inode *inode = ctx->dentry->d_inode;
598 598
599 if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) 599 if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
600 return; 600 return;
@@ -620,7 +620,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
620 return; 620 return;
621 if (!is_sync) 621 if (!is_sync)
622 return; 622 return;
623 inode = ctx->path.dentry->d_inode; 623 inode = ctx->dentry->d_inode;
624 if (!list_empty(&NFS_I(inode)->open_files)) 624 if (!list_empty(&NFS_I(inode)->open_files))
625 return; 625 return;
626 server = NFS_SERVER(inode); 626 server = NFS_SERVER(inode);
@@ -629,14 +629,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
629 nfs_revalidate_inode(server, inode); 629 nfs_revalidate_inode(server, inode);
630} 630}
631 631
632struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode) 632struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode)
633{ 633{
634 struct nfs_open_context *ctx; 634 struct nfs_open_context *ctx;
635 635
636 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 636 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
637 if (ctx != NULL) { 637 if (ctx != NULL) {
638 ctx->path = *path; 638 nfs_sb_active(dentry->d_sb);
639 path_get(&ctx->path); 639 ctx->dentry = dget(dentry);
640 ctx->cred = get_rpccred(cred); 640 ctx->cred = get_rpccred(cred);
641 ctx->state = NULL; 641 ctx->state = NULL;
642 ctx->mode = f_mode; 642 ctx->mode = f_mode;
@@ -658,7 +658,8 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
658 658
659static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) 659static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
660{ 660{
661 struct inode *inode = ctx->path.dentry->d_inode; 661 struct inode *inode = ctx->dentry->d_inode;
662 struct super_block *sb = ctx->dentry->d_sb;
662 663
663 if (!list_empty(&ctx->list)) { 664 if (!list_empty(&ctx->list)) {
664 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) 665 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
@@ -671,7 +672,8 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
671 NFS_PROTO(inode)->close_context(ctx, is_sync); 672 NFS_PROTO(inode)->close_context(ctx, is_sync);
672 if (ctx->cred != NULL) 673 if (ctx->cred != NULL)
673 put_rpccred(ctx->cred); 674 put_rpccred(ctx->cred);
674 path_put(&ctx->path); 675 dput(ctx->dentry);
676 nfs_sb_deactive(sb);
675 kfree(ctx); 677 kfree(ctx);
676} 678}
677 679
@@ -741,7 +743,7 @@ int nfs_open(struct inode *inode, struct file *filp)
741 cred = rpc_lookup_cred(); 743 cred = rpc_lookup_cred();
742 if (IS_ERR(cred)) 744 if (IS_ERR(cred))
743 return PTR_ERR(cred); 745 return PTR_ERR(cred);
744 ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode); 746 ctx = alloc_nfs_open_context(filp->f_path.dentry, cred, filp->f_mode);
745 put_rpccred(cred); 747 put_rpccred(cred);
746 if (ctx == NULL) 748 if (ctx == NULL)
747 return -ENOMEM; 749 return -ENOMEM;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a55347a2da..ab12913dd47 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -277,6 +277,9 @@ extern void nfs_sb_deactive(struct super_block *sb);
277extern char *nfs_path(char **p, struct dentry *dentry, 277extern char *nfs_path(char **p, struct dentry *dentry,
278 char *buffer, ssize_t buflen); 278 char *buffer, ssize_t buflen);
279extern struct vfsmount *nfs_d_automount(struct path *path); 279extern struct vfsmount *nfs_d_automount(struct path *path);
280#ifdef CONFIG_NFS_V4
281rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
282#endif
280 283
281/* getroot.c */ 284/* getroot.c */
282extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, 285extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
@@ -288,12 +291,22 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
288extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); 291extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
289#endif 292#endif
290 293
294struct nfs_pageio_descriptor;
291/* read.c */ 295/* read.c */
292extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, 296extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
293 const struct rpc_call_ops *call_ops); 297 const struct rpc_call_ops *call_ops);
294extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 298extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
299extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
300 struct list_head *head);
301
302extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
303extern void nfs_readdata_release(struct nfs_read_data *rdata);
295 304
296/* write.c */ 305/* write.c */
306extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
307 struct list_head *head);
308extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
309extern void nfs_writedata_release(struct nfs_write_data *wdata);
297extern void nfs_commit_free(struct nfs_write_data *p); 310extern void nfs_commit_free(struct nfs_write_data *p);
298extern int nfs_initiate_write(struct nfs_write_data *data, 311extern int nfs_initiate_write(struct nfs_write_data *data,
299 struct rpc_clnt *clnt, 312 struct rpc_clnt *clnt,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 1f063bacd28..8102391bb37 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -119,7 +119,7 @@ Elong:
119} 119}
120 120
121#ifdef CONFIG_NFS_V4 121#ifdef CONFIG_NFS_V4
122static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) 122rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
123{ 123{
124 struct gss_api_mech *mech; 124 struct gss_api_mech *mech;
125 struct xdr_netobj oid; 125 struct xdr_netobj oid;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 27434277165..7ef23979896 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -415,7 +415,7 @@ fail:
415} 415}
416 416
417int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, 417int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
418 mode_t mode) 418 umode_t mode)
419{ 419{
420 struct posix_acl *dfacl, *acl; 420 struct posix_acl *dfacl, *acl;
421 int error = 0; 421 int error = 0;
@@ -427,16 +427,12 @@ int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
427 } 427 }
428 if (!dfacl) 428 if (!dfacl)
429 return 0; 429 return 0;
430 acl = posix_acl_clone(dfacl, GFP_KERNEL); 430 acl = posix_acl_dup(dfacl);
431 error = -ENOMEM; 431 error = posix_acl_create(&acl, GFP_KERNEL, &mode);
432 if (!acl)
433 goto out_release_dfacl;
434 error = posix_acl_create_masq(acl, &mode);
435 if (error < 0) 432 if (error < 0)
436 goto out_release_acl; 433 goto out_release_dfacl;
437 error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? 434 error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ?
438 dfacl : NULL); 435 dfacl : NULL);
439out_release_acl:
440 posix_acl_release(acl); 436 posix_acl_release(acl);
441out_release_dfacl: 437out_release_dfacl:
442 posix_acl_release(dfacl); 438 posix_acl_release(dfacl);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 38053d823eb..d4bc9ed9174 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
316 int flags, struct nfs_open_context *ctx) 316 int flags, struct nfs_open_context *ctx)
317{ 317{
318 struct nfs3_createdata *data; 318 struct nfs3_createdata *data;
319 mode_t mode = sattr->ia_mode; 319 umode_t mode = sattr->ia_mode;
320 int status = -ENOMEM; 320 int status = -ENOMEM;
321 321
322 dprintk("NFS call create %s\n", dentry->d_name.name); 322 dprintk("NFS call create %s\n", dentry->d_name.name);
@@ -562,7 +562,7 @@ static int
562nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 562nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
563{ 563{
564 struct nfs3_createdata *data; 564 struct nfs3_createdata *data;
565 int mode = sattr->ia_mode; 565 umode_t mode = sattr->ia_mode;
566 int status = -ENOMEM; 566 int status = -ENOMEM;
567 567
568 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 568 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
@@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
681 dev_t rdev) 681 dev_t rdev)
682{ 682{
683 struct nfs3_createdata *data; 683 struct nfs3_createdata *data;
684 mode_t mode = sattr->ia_mode; 684 umode_t mode = sattr->ia_mode;
685 int status = -ENOMEM; 685 int status = -ENOMEM;
686 686
687 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 687 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
853 .dentry_ops = &nfs_dentry_operations, 853 .dentry_ops = &nfs_dentry_operations,
854 .dir_inode_ops = &nfs3_dir_inode_operations, 854 .dir_inode_ops = &nfs3_dir_inode_operations,
855 .file_inode_ops = &nfs3_file_inode_operations, 855 .file_inode_ops = &nfs3_file_inode_operations,
856 .file_ops = &nfs_file_operations,
856 .getroot = nfs3_proc_get_root, 857 .getroot = nfs3_proc_get_root,
857 .getattr = nfs3_proc_getattr, 858 .getattr = nfs3_proc_getattr,
858 .setattr = nfs3_proc_setattr, 859 .setattr = nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c4a69833dd0..3e93e9a1bee 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -48,6 +48,7 @@ enum nfs4_client_state {
48 NFS4CLNT_SESSION_RESET, 48 NFS4CLNT_SESSION_RESET,
49 NFS4CLNT_RECALL_SLOT, 49 NFS4CLNT_RECALL_SLOT,
50 NFS4CLNT_LEASE_CONFIRM, 50 NFS4CLNT_LEASE_CONFIRM,
51 NFS4CLNT_SERVER_SCOPE_MISMATCH,
51}; 52};
52 53
53enum nfs4_session_state { 54enum nfs4_session_state {
@@ -55,6 +56,9 @@ enum nfs4_session_state {
55 NFS4_SESSION_DRAINING, 56 NFS4_SESSION_DRAINING,
56}; 57};
57 58
59#define NFS4_RENEW_TIMEOUT 0x01
60#define NFS4_RENEW_DELEGATION_CB 0x02
61
58struct nfs4_minor_version_ops { 62struct nfs4_minor_version_ops {
59 u32 minor_version; 63 u32 minor_version;
60 64
@@ -66,6 +70,8 @@ struct nfs4_minor_version_ops {
66 int cache_reply); 70 int cache_reply);
67 int (*validate_stateid)(struct nfs_delegation *, 71 int (*validate_stateid)(struct nfs_delegation *,
68 const nfs4_stateid *); 72 const nfs4_stateid *);
73 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
74 struct nfs_fsinfo *);
69 const struct nfs4_state_recovery_ops *reboot_recovery_ops; 75 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
70 const struct nfs4_state_recovery_ops *nograce_recovery_ops; 76 const struct nfs4_state_recovery_ops *nograce_recovery_ops;
71 const struct nfs4_state_maintenance_ops *state_renewal_ops; 77 const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -222,7 +228,7 @@ struct nfs4_state_recovery_ops {
222}; 228};
223 229
224struct nfs4_state_maintenance_ops { 230struct nfs4_state_maintenance_ops {
225 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); 231 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
226 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); 232 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
227 int (*renew_lease)(struct nfs_client *, struct rpc_cred *); 233 int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
228}; 234};
@@ -234,11 +240,9 @@ extern const struct inode_operations nfs4_dir_inode_operations;
234extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 240extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
235extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 241extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
236extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 242extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
237extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
238extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
239extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
240extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 244extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); 245extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
242extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 246extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
243extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 247extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
244 struct nfs4_fs_locations *fs_locations, struct page *page); 248 struct nfs4_fs_locations *fs_locations, struct page *page);
@@ -315,7 +319,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
315extern const u32 nfs4_fattr_bitmap[2]; 319extern const u32 nfs4_fattr_bitmap[2];
316extern const u32 nfs4_statfs_bitmap[2]; 320extern const u32 nfs4_statfs_bitmap[2];
317extern const u32 nfs4_pathconf_bitmap[2]; 321extern const u32 nfs4_pathconf_bitmap[2];
318extern const u32 nfs4_fsinfo_bitmap[2]; 322extern const u32 nfs4_fsinfo_bitmap[3];
319extern const u32 nfs4_fs_locations_bitmap[2]; 323extern const u32 nfs4_fs_locations_bitmap[2];
320 324
321/* nfs4renewd.c */ 325/* nfs4renewd.c */
@@ -341,14 +345,17 @@ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struc
341extern void nfs4_put_state_owner(struct nfs4_state_owner *); 345extern void nfs4_put_state_owner(struct nfs4_state_owner *);
342extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 346extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
343extern void nfs4_put_open_state(struct nfs4_state *); 347extern void nfs4_put_open_state(struct nfs4_state *);
344extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); 348extern void nfs4_close_state(struct nfs4_state *, fmode_t);
345extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); 349extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
346extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 350extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
347extern void nfs4_schedule_lease_recovery(struct nfs_client *); 351extern void nfs4_schedule_lease_recovery(struct nfs_client *);
348extern void nfs4_schedule_state_manager(struct nfs_client *); 352extern void nfs4_schedule_state_manager(struct nfs_client *);
353extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
349extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); 354extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
350extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 355extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
351extern void nfs41_handle_recall_slot(struct nfs_client *clp); 356extern void nfs41_handle_recall_slot(struct nfs_client *clp);
357extern void nfs41_handle_server_scope(struct nfs_client *,
358 struct server_scope **);
352extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 359extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
353extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 360extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
354extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); 361extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
@@ -373,8 +380,8 @@ extern struct svc_version nfs4_callback_version4;
373 380
374#else 381#else
375 382
376#define nfs4_close_state(a, b, c) do { } while (0) 383#define nfs4_close_state(a, b) do { } while (0)
377#define nfs4_close_sync(a, b, c) do { } while (0) 384#define nfs4_close_sync(a, b) do { } while (0)
378 385
379#endif /* CONFIG_NFS_V4 */ 386#endif /* CONFIG_NFS_V4 */
380#endif /* __LINUX_FS_NFS_NFS4_FS.H */ 387#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index f9d03abcd04..4c78c62639e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
77 BUG(); 77 BUG();
78} 78}
79 79
80/* For data server errors we don't recover from */
81static void
82filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
83{
84 if (lseg->pls_range.iomode == IOMODE_RW) {
85 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
86 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
87 } else {
88 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
89 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
90 }
91}
92
93static int filelayout_async_handle_error(struct rpc_task *task, 80static int filelayout_async_handle_error(struct rpc_task *task,
94 struct nfs4_state *state, 81 struct nfs4_state *state,
95 struct nfs_client *clp, 82 struct nfs_client *clp,
@@ -145,7 +132,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,
145 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", 132 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
146 __func__, data->ds_clp, data->ds_clp->cl_session); 133 __func__, data->ds_clp, data->ds_clp->cl_session);
147 if (reset) { 134 if (reset) {
148 filelayout_set_lo_fail(data->lseg); 135 pnfs_set_lo_fail(data->lseg);
149 nfs4_reset_read(task, data); 136 nfs4_reset_read(task, data);
150 clp = NFS_SERVER(data->inode)->nfs_client; 137 clp = NFS_SERVER(data->inode)->nfs_client;
151 } 138 }
@@ -170,7 +157,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
170 157
171 pnfs_set_layoutcommit(wdata); 158 pnfs_set_layoutcommit(wdata);
172 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, 159 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
173 (unsigned long) wdata->lseg->pls_end_pos); 160 (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
174} 161}
175 162
176/* 163/*
@@ -221,7 +208,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
221 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", 208 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
222 __func__, data->ds_clp, data->ds_clp->cl_session); 209 __func__, data->ds_clp, data->ds_clp->cl_session);
223 if (reset) { 210 if (reset) {
224 filelayout_set_lo_fail(data->lseg); 211 pnfs_set_lo_fail(data->lseg);
225 nfs4_reset_write(task, data); 212 nfs4_reset_write(task, data);
226 clp = NFS_SERVER(data->inode)->nfs_client; 213 clp = NFS_SERVER(data->inode)->nfs_client;
227 } else 214 } else
@@ -256,7 +243,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
256 __func__, data->ds_clp, data->ds_clp->cl_session); 243 __func__, data->ds_clp, data->ds_clp->cl_session);
257 if (reset) { 244 if (reset) {
258 prepare_to_resend_writes(data); 245 prepare_to_resend_writes(data);
259 filelayout_set_lo_fail(data->lseg); 246 pnfs_set_lo_fail(data->lseg);
260 } else 247 } else
261 nfs_restart_rpc(task, data->ds_clp); 248 nfs_restart_rpc(task, data->ds_clp);
262 return -EAGAIN; 249 return -EAGAIN;
@@ -334,6 +321,9 @@ filelayout_read_pagelist(struct nfs_read_data *data)
334 __func__, data->inode->i_ino, 321 __func__, data->inode->i_ino,
335 data->args.pgbase, (size_t)data->args.count, offset); 322 data->args.pgbase, (size_t)data->args.count, offset);
336 323
324 if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
325 return PNFS_NOT_ATTEMPTED;
326
337 /* Retrieve the correct rpc_client for the byte range */ 327 /* Retrieve the correct rpc_client for the byte range */
338 j = nfs4_fl_calc_j_index(lseg, offset); 328 j = nfs4_fl_calc_j_index(lseg, offset);
339 idx = nfs4_fl_calc_ds_index(lseg, j); 329 idx = nfs4_fl_calc_ds_index(lseg, j);
@@ -344,8 +334,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
344 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); 334 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
345 return PNFS_NOT_ATTEMPTED; 335 return PNFS_NOT_ATTEMPTED;
346 } 336 }
347 dprintk("%s USE DS:ip %x %hu\n", __func__, 337 dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr);
348 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
349 338
350 /* No multipath support. Use first DS */ 339 /* No multipath support. Use first DS */
351 data->ds_clp = ds->ds_clp; 340 data->ds_clp = ds->ds_clp;
@@ -374,6 +363,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
374 struct nfs_fh *fh; 363 struct nfs_fh *fh;
375 int status; 364 int status;
376 365
366 if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
367 return PNFS_NOT_ATTEMPTED;
368
377 /* Retrieve the correct rpc_client for the byte range */ 369 /* Retrieve the correct rpc_client for the byte range */
378 j = nfs4_fl_calc_j_index(lseg, offset); 370 j = nfs4_fl_calc_j_index(lseg, offset);
379 idx = nfs4_fl_calc_ds_index(lseg, j); 371 idx = nfs4_fl_calc_ds_index(lseg, j);
@@ -384,9 +376,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
384 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); 376 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
385 return PNFS_NOT_ATTEMPTED; 377 return PNFS_NOT_ATTEMPTED;
386 } 378 }
387 dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, 379 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,
388 data->inode->i_ino, sync, (size_t) data->args.count, offset, 380 data->inode->i_ino, sync, (size_t) data->args.count, offset,
389 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); 381 ds->ds_remotestr);
390 382
391 data->write_done_cb = filelayout_write_done_cb; 383 data->write_done_cb = filelayout_write_done_cb;
392 data->ds_clp = ds->ds_clp; 384 data->ds_clp = ds->ds_clp;
@@ -428,6 +420,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
428 420
429 dprintk("--> %s\n", __func__); 421 dprintk("--> %s\n", __func__);
430 422
423 /* FIXME: remove this check when layout segment support is added */
424 if (lgr->range.offset != 0 ||
425 lgr->range.length != NFS4_MAX_UINT64) {
426 dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
427 __func__);
428 goto out;
429 }
430
431 if (fl->pattern_offset > lgr->range.offset) { 431 if (fl->pattern_offset > lgr->range.offset) {
432 dprintk("%s pattern_offset %lld too large\n", 432 dprintk("%s pattern_offset %lld too large\n",
433 __func__, fl->pattern_offset); 433 __func__, fl->pattern_offset);
@@ -449,6 +449,10 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
449 goto out; 449 goto out;
450 } else 450 } else
451 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); 451 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
452 /* Found deviceid is being reaped */
453 if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags))
454 goto out_put;
455
452 fl->dsaddr = dsaddr; 456 fl->dsaddr = dsaddr;
453 457
454 if (fl->first_stripe_index < 0 || 458 if (fl->first_stripe_index < 0 ||
@@ -659,7 +663,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
659 * return true : coalesce page 663 * return true : coalesce page
660 * return false : don't coalesce page 664 * return false : don't coalesce page
661 */ 665 */
662bool 666static bool
663filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 667filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
664 struct nfs_page *req) 668 struct nfs_page *req)
665{ 669{
@@ -670,8 +674,6 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
670 !nfs_generic_pg_test(pgio, prev, req)) 674 !nfs_generic_pg_test(pgio, prev, req))
671 return false; 675 return false;
672 676
673 if (!pgio->pg_lseg)
674 return 1;
675 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; 677 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
676 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; 678 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
677 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; 679 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
@@ -682,6 +684,52 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
682 return (p_stripe == r_stripe); 684 return (p_stripe == r_stripe);
683} 685}
684 686
687void
688filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
689 struct nfs_page *req)
690{
691 BUG_ON(pgio->pg_lseg != NULL);
692
693 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
694 req->wb_context,
695 0,
696 NFS4_MAX_UINT64,
697 IOMODE_READ,
698 GFP_KERNEL);
699 /* If no lseg, fall back to read through mds */
700 if (pgio->pg_lseg == NULL)
701 nfs_pageio_reset_read_mds(pgio);
702}
703
704void
705filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
706 struct nfs_page *req)
707{
708 BUG_ON(pgio->pg_lseg != NULL);
709
710 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
711 req->wb_context,
712 0,
713 NFS4_MAX_UINT64,
714 IOMODE_RW,
715 GFP_NOFS);
716 /* If no lseg, fall back to write through mds */
717 if (pgio->pg_lseg == NULL)
718 nfs_pageio_reset_write_mds(pgio);
719}
720
721static const struct nfs_pageio_ops filelayout_pg_read_ops = {
722 .pg_init = filelayout_pg_init_read,
723 .pg_test = filelayout_pg_test,
724 .pg_doio = pnfs_generic_pg_readpages,
725};
726
727static const struct nfs_pageio_ops filelayout_pg_write_ops = {
728 .pg_init = filelayout_pg_init_write,
729 .pg_test = filelayout_pg_test,
730 .pg_doio = pnfs_generic_pg_writepages,
731};
732
685static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) 733static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
686{ 734{
687 return !FILELAYOUT_LSEG(lseg)->commit_through_mds; 735 return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
@@ -879,7 +927,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {
879 .owner = THIS_MODULE, 927 .owner = THIS_MODULE,
880 .alloc_lseg = filelayout_alloc_lseg, 928 .alloc_lseg = filelayout_alloc_lseg,
881 .free_lseg = filelayout_free_lseg, 929 .free_lseg = filelayout_free_lseg,
882 .pg_test = filelayout_pg_test, 930 .pg_read_ops = &filelayout_pg_read_ops,
931 .pg_write_ops = &filelayout_pg_write_ops,
883 .mark_pnfs_commit = filelayout_mark_pnfs_commit, 932 .mark_pnfs_commit = filelayout_mark_pnfs_commit,
884 .choose_commit_list = filelayout_choose_commit_list, 933 .choose_commit_list = filelayout_choose_commit_list,
885 .commit_pagelist = filelayout_commit_pagelist, 934 .commit_pagelist = filelayout_commit_pagelist,
@@ -902,5 +951,7 @@ static void __exit nfs4filelayout_exit(void)
902 pnfs_unregister_layoutdriver(&filelayout_type); 951 pnfs_unregister_layoutdriver(&filelayout_type);
903} 952}
904 953
954MODULE_ALIAS("nfs-layouttype4-1");
955
905module_init(nfs4filelayout_init); 956module_init(nfs4filelayout_init);
906module_exit(nfs4filelayout_exit); 957module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index cebe01e3795..2e42284253f 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -47,10 +47,17 @@ enum stripetype4 {
47}; 47};
48 48
49/* Individual ip address */ 49/* Individual ip address */
50struct nfs4_pnfs_ds_addr {
51 struct sockaddr_storage da_addr;
52 size_t da_addrlen;
53 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
54 char *da_remotestr; /* human readable addr+port */
55};
56
50struct nfs4_pnfs_ds { 57struct nfs4_pnfs_ds {
51 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ 58 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
52 u32 ds_ip_addr; 59 char *ds_remotestr; /* comma sep list of addrs */
53 u32 ds_port; 60 struct list_head ds_addrs;
54 struct nfs_client *ds_clp; 61 struct nfs_client *ds_clp;
55 atomic_t ds_count; 62 atomic_t ds_count;
56}; 63};
@@ -89,6 +96,12 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
89 generic_hdr); 96 generic_hdr);
90} 97}
91 98
99static inline struct nfs4_deviceid_node *
100FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
101{
102 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
103}
104
92extern struct nfs_fh * 105extern struct nfs_fh *
93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 106nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
94 107
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 3b7bf137726..ed388aae968 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -56,54 +56,139 @@ print_ds(struct nfs4_pnfs_ds *ds)
56 printk("%s NULL device\n", __func__); 56 printk("%s NULL device\n", __func__);
57 return; 57 return;
58 } 58 }
59 printk(" ip_addr %x port %hu\n" 59 printk(" ds %s\n"
60 " ref count %d\n" 60 " ref count %d\n"
61 " client %p\n" 61 " client %p\n"
62 " cl_exchange_flags %x\n", 62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), 63 ds->ds_remotestr,
64 atomic_read(&ds->ds_count), ds->ds_clp, 64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); 65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66} 66}
67 67
68/* nfs4_ds_cache_lock is held */ 68static bool
69static struct nfs4_pnfs_ds * 69same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
70_data_server_lookup_locked(u32 ip_addr, u32 port)
71{ 70{
72 struct nfs4_pnfs_ds *ds; 71 struct sockaddr_in *a, *b;
72 struct sockaddr_in6 *a6, *b6;
73
74 if (addr1->sa_family != addr2->sa_family)
75 return false;
76
77 switch (addr1->sa_family) {
78 case AF_INET:
79 a = (struct sockaddr_in *)addr1;
80 b = (struct sockaddr_in *)addr2;
81
82 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
83 a->sin_port == b->sin_port)
84 return true;
85 break;
86
87 case AF_INET6:
88 a6 = (struct sockaddr_in6 *)addr1;
89 b6 = (struct sockaddr_in6 *)addr2;
90
91 /* LINKLOCAL addresses must have matching scope_id */
92 if (ipv6_addr_scope(&a6->sin6_addr) ==
93 IPV6_ADDR_SCOPE_LINKLOCAL &&
94 a6->sin6_scope_id != b6->sin6_scope_id)
95 return false;
96
97 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
98 a6->sin6_port == b6->sin6_port)
99 return true;
100 break;
101
102 default:
103 dprintk("%s: unhandled address family: %u\n",
104 __func__, addr1->sa_family);
105 return false;
106 }
73 107
74 dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", 108 return false;
75 ntohl(ip_addr), ntohs(port)); 109}
76 110
77 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { 111/*
78 if (ds->ds_ip_addr == ip_addr && 112 * Lookup DS by addresses. The first matching address returns true.
79 ds->ds_port == port) { 113 * nfs4_ds_cache_lock is held
80 return ds; 114 */
115static struct nfs4_pnfs_ds *
116_data_server_lookup_locked(struct list_head *dsaddrs)
117{
118 struct nfs4_pnfs_ds *ds;
119 struct nfs4_pnfs_ds_addr *da1, *da2;
120
121 list_for_each_entry(da1, dsaddrs, da_node) {
122 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
123 list_for_each_entry(da2, &ds->ds_addrs, da_node) {
124 if (same_sockaddr(
125 (struct sockaddr *)&da1->da_addr,
126 (struct sockaddr *)&da2->da_addr))
127 return ds;
128 }
81 } 129 }
82 } 130 }
83 return NULL; 131 return NULL;
84} 132}
85 133
86/* 134/*
135 * Compare two lists of addresses.
136 */
137static bool
138_data_server_match_all_addrs_locked(struct list_head *dsaddrs1,
139 struct list_head *dsaddrs2)
140{
141 struct nfs4_pnfs_ds_addr *da1, *da2;
142 size_t count1 = 0,
143 count2 = 0;
144
145 list_for_each_entry(da1, dsaddrs1, da_node)
146 count1++;
147
148 list_for_each_entry(da2, dsaddrs2, da_node) {
149 bool found = false;
150 count2++;
151 list_for_each_entry(da1, dsaddrs1, da_node) {
152 if (same_sockaddr((struct sockaddr *)&da1->da_addr,
153 (struct sockaddr *)&da2->da_addr)) {
154 found = true;
155 break;
156 }
157 }
158 if (!found)
159 return false;
160 }
161
162 return (count1 == count2);
163}
164
165/*
87 * Create an rpc connection to the nfs4_pnfs_ds data server 166 * Create an rpc connection to the nfs4_pnfs_ds data server
88 * Currently only support IPv4 167 * Currently only supports IPv4 and IPv6 addresses
89 */ 168 */
90static int 169static int
91nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) 170nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
92{ 171{
93 struct nfs_client *clp; 172 struct nfs_client *clp = ERR_PTR(-EIO);
94 struct sockaddr_in sin; 173 struct nfs4_pnfs_ds_addr *da;
95 int status = 0; 174 int status = 0;
96 175
97 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, 176 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
98 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
99 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); 177 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
100 178
101 sin.sin_family = AF_INET; 179 BUG_ON(list_empty(&ds->ds_addrs));
102 sin.sin_addr.s_addr = ds->ds_ip_addr; 180
103 sin.sin_port = ds->ds_port; 181 list_for_each_entry(da, &ds->ds_addrs, da_node) {
182 dprintk("%s: DS %s: trying address %s\n",
183 __func__, ds->ds_remotestr, da->da_remotestr);
184
185 clp = nfs4_set_ds_client(mds_srv->nfs_client,
186 (struct sockaddr *)&da->da_addr,
187 da->da_addrlen, IPPROTO_TCP);
188 if (!IS_ERR(clp))
189 break;
190 }
104 191
105 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
106 sizeof(sin), IPPROTO_TCP);
107 if (IS_ERR(clp)) { 192 if (IS_ERR(clp)) {
108 status = PTR_ERR(clp); 193 status = PTR_ERR(clp);
109 goto out; 194 goto out;
@@ -115,8 +200,8 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
115 goto out_put; 200 goto out_put;
116 } 201 }
117 ds->ds_clp = clp; 202 ds->ds_clp = clp;
118 dprintk("%s [existing] ip=%x, port=%hu\n", __func__, 203 dprintk("%s [existing] server=%s\n", __func__,
119 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); 204 ds->ds_remotestr);
120 goto out; 205 goto out;
121 } 206 }
122 207
@@ -135,8 +220,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
135 goto out_put; 220 goto out_put;
136 221
137 ds->ds_clp = clp; 222 ds->ds_clp = clp;
138 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), 223 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
139 ntohs(ds->ds_port));
140out: 224out:
141 return status; 225 return status;
142out_put: 226out_put:
@@ -147,12 +231,25 @@ out_put:
147static void 231static void
148destroy_ds(struct nfs4_pnfs_ds *ds) 232destroy_ds(struct nfs4_pnfs_ds *ds)
149{ 233{
234 struct nfs4_pnfs_ds_addr *da;
235
150 dprintk("--> %s\n", __func__); 236 dprintk("--> %s\n", __func__);
151 ifdebug(FACILITY) 237 ifdebug(FACILITY)
152 print_ds(ds); 238 print_ds(ds);
153 239
154 if (ds->ds_clp) 240 if (ds->ds_clp)
155 nfs_put_client(ds->ds_clp); 241 nfs_put_client(ds->ds_clp);
242
243 while (!list_empty(&ds->ds_addrs)) {
244 da = list_first_entry(&ds->ds_addrs,
245 struct nfs4_pnfs_ds_addr,
246 da_node);
247 list_del_init(&da->da_node);
248 kfree(da->da_remotestr);
249 kfree(da);
250 }
251
252 kfree(ds->ds_remotestr);
156 kfree(ds); 253 kfree(ds);
157} 254}
158 255
@@ -179,31 +276,96 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
179 kfree(dsaddr); 276 kfree(dsaddr);
180} 277}
181 278
279/*
280 * Create a string with a human readable address and port to avoid
281 * complicated setup around many dprinks.
282 */
283static char *
284nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
285{
286 struct nfs4_pnfs_ds_addr *da;
287 char *remotestr;
288 size_t len;
289 char *p;
290
291 len = 3; /* '{', '}' and eol */
292 list_for_each_entry(da, dsaddrs, da_node) {
293 len += strlen(da->da_remotestr) + 1; /* string plus comma */
294 }
295
296 remotestr = kzalloc(len, gfp_flags);
297 if (!remotestr)
298 return NULL;
299
300 p = remotestr;
301 *(p++) = '{';
302 len--;
303 list_for_each_entry(da, dsaddrs, da_node) {
304 size_t ll = strlen(da->da_remotestr);
305
306 if (ll > len)
307 goto out_err;
308
309 memcpy(p, da->da_remotestr, ll);
310 p += ll;
311 len -= ll;
312
313 if (len < 1)
314 goto out_err;
315 (*p++) = ',';
316 len--;
317 }
318 if (len < 2)
319 goto out_err;
320 *(p++) = '}';
321 *p = '\0';
322 return remotestr;
323out_err:
324 kfree(remotestr);
325 return NULL;
326}
327
182static struct nfs4_pnfs_ds * 328static struct nfs4_pnfs_ds *
183nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) 329nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
184{ 330{
185 struct nfs4_pnfs_ds *tmp_ds, *ds; 331 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
332 char *remotestr;
186 333
187 ds = kzalloc(sizeof(*tmp_ds), gfp_flags); 334 if (list_empty(dsaddrs)) {
335 dprintk("%s: no addresses defined\n", __func__);
336 goto out;
337 }
338
339 ds = kzalloc(sizeof(*ds), gfp_flags);
188 if (!ds) 340 if (!ds)
189 goto out; 341 goto out;
190 342
343 /* this is only used for debugging, so it's ok if its NULL */
344 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
345
191 spin_lock(&nfs4_ds_cache_lock); 346 spin_lock(&nfs4_ds_cache_lock);
192 tmp_ds = _data_server_lookup_locked(ip_addr, port); 347 tmp_ds = _data_server_lookup_locked(dsaddrs);
193 if (tmp_ds == NULL) { 348 if (tmp_ds == NULL) {
194 ds->ds_ip_addr = ip_addr; 349 INIT_LIST_HEAD(&ds->ds_addrs);
195 ds->ds_port = port; 350 list_splice_init(dsaddrs, &ds->ds_addrs);
351 ds->ds_remotestr = remotestr;
196 atomic_set(&ds->ds_count, 1); 352 atomic_set(&ds->ds_count, 1);
197 INIT_LIST_HEAD(&ds->ds_node); 353 INIT_LIST_HEAD(&ds->ds_node);
198 ds->ds_clp = NULL; 354 ds->ds_clp = NULL;
199 list_add(&ds->ds_node, &nfs4_data_server_cache); 355 list_add(&ds->ds_node, &nfs4_data_server_cache);
200 dprintk("%s add new data server ip 0x%x\n", __func__, 356 dprintk("%s add new data server %s\n", __func__,
201 ds->ds_ip_addr); 357 ds->ds_remotestr);
202 } else { 358 } else {
359 if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs,
360 dsaddrs)) {
361 dprintk("%s: multipath address mismatch: %s != %s",
362 __func__, tmp_ds->ds_remotestr, remotestr);
363 }
364 kfree(remotestr);
203 kfree(ds); 365 kfree(ds);
204 atomic_inc(&tmp_ds->ds_count); 366 atomic_inc(&tmp_ds->ds_count);
205 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", 367 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
206 __func__, tmp_ds->ds_ip_addr, 368 __func__, tmp_ds->ds_remotestr,
207 atomic_read(&tmp_ds->ds_count)); 369 atomic_read(&tmp_ds->ds_count));
208 ds = tmp_ds; 370 ds = tmp_ds;
209 } 371 }
@@ -213,18 +375,22 @@ out:
213} 375}
214 376
215/* 377/*
216 * Currently only support ipv4, and one multi-path address. 378 * Currently only supports ipv4, ipv6 and one multi-path address.
217 */ 379 */
218static struct nfs4_pnfs_ds * 380static struct nfs4_pnfs_ds_addr *
219decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) 381decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
220{ 382{
221 struct nfs4_pnfs_ds *ds = NULL; 383 struct nfs4_pnfs_ds_addr *da = NULL;
222 char *buf; 384 char *buf, *portstr;
223 const char *ipend, *pstr; 385 u32 port;
224 u32 ip_addr, port; 386 int nlen, rlen;
225 int nlen, rlen, i;
226 int tmp[2]; 387 int tmp[2];
227 __be32 *p; 388 __be32 *p;
389 char *netid, *match_netid;
390 size_t len, match_netid_len;
391 char *startsep = "";
392 char *endsep = "";
393
228 394
229 /* r_netid */ 395 /* r_netid */
230 p = xdr_inline_decode(streamp, 4); 396 p = xdr_inline_decode(streamp, 4);
@@ -236,64 +402,123 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla
236 if (unlikely(!p)) 402 if (unlikely(!p))
237 goto out_err; 403 goto out_err;
238 404
239 /* Check that netid is "tcp" */ 405 netid = kmalloc(nlen+1, gfp_flags);
240 if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { 406 if (unlikely(!netid))
241 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
242 goto out_err; 407 goto out_err;
243 }
244 408
245 /* r_addr */ 409 netid[nlen] = '\0';
410 memcpy(netid, p, nlen);
411
412 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
246 p = xdr_inline_decode(streamp, 4); 413 p = xdr_inline_decode(streamp, 4);
247 if (unlikely(!p)) 414 if (unlikely(!p))
248 goto out_err; 415 goto out_free_netid;
249 rlen = be32_to_cpup(p); 416 rlen = be32_to_cpup(p);
250 417
251 p = xdr_inline_decode(streamp, rlen); 418 p = xdr_inline_decode(streamp, rlen);
252 if (unlikely(!p)) 419 if (unlikely(!p))
253 goto out_err; 420 goto out_free_netid;
254 421
255 /* ipv6 length plus port is legal */ 422 /* port is ".ABC.DEF", 8 chars max */
256 if (rlen > INET6_ADDRSTRLEN + 8) { 423 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
257 dprintk("%s: Invalid address, length %d\n", __func__, 424 dprintk("%s: Invalid address, length %d\n", __func__,
258 rlen); 425 rlen);
259 goto out_err; 426 goto out_free_netid;
260 } 427 }
261 buf = kmalloc(rlen + 1, gfp_flags); 428 buf = kmalloc(rlen + 1, gfp_flags);
262 if (!buf) { 429 if (!buf) {
263 dprintk("%s: Not enough memory\n", __func__); 430 dprintk("%s: Not enough memory\n", __func__);
264 goto out_err; 431 goto out_free_netid;
265 } 432 }
266 buf[rlen] = '\0'; 433 buf[rlen] = '\0';
267 memcpy(buf, p, rlen); 434 memcpy(buf, p, rlen);
268 435
269 /* replace the port dots with dashes for the in4_pton() delimiter*/ 436 /* replace port '.' with '-' */
270 for (i = 0; i < 2; i++) { 437 portstr = strrchr(buf, '.');
271 char *res = strrchr(buf, '.'); 438 if (!portstr) {
272 if (!res) { 439 dprintk("%s: Failed finding expected dot in port\n",
273 dprintk("%s: Failed finding expected dots in port\n", 440 __func__);
274 __func__); 441 goto out_free_buf;
275 goto out_free; 442 }
276 } 443 *portstr = '-';
277 *res = '-'; 444
445 /* find '.' between address and port */
446 portstr = strrchr(buf, '.');
447 if (!portstr) {
448 dprintk("%s: Failed finding expected dot between address and "
449 "port\n", __func__);
450 goto out_free_buf;
278 } 451 }
452 *portstr = '\0';
279 453
280 /* Currently only support ipv4 address */ 454 da = kzalloc(sizeof(*da), gfp_flags);
281 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { 455 if (unlikely(!da))
282 dprintk("%s: Only ipv4 addresses supported\n", __func__); 456 goto out_free_buf;
283 goto out_free; 457
458 INIT_LIST_HEAD(&da->da_node);
459
460 if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr,
461 sizeof(da->da_addr))) {
462 dprintk("%s: error parsing address %s\n", __func__, buf);
463 goto out_free_da;
284 } 464 }
285 465
286 /* port */ 466 portstr++;
287 pstr = ipend; 467 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
288 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
289 port = htons((tmp[0] << 8) | (tmp[1])); 468 port = htons((tmp[0] << 8) | (tmp[1]));
290 469
291 ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); 470 switch (da->da_addr.ss_family) {
292 dprintk("%s: Decoded address and port %s\n", __func__, buf); 471 case AF_INET:
293out_free: 472 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
473 da->da_addrlen = sizeof(struct sockaddr_in);
474 match_netid = "tcp";
475 match_netid_len = 3;
476 break;
477
478 case AF_INET6:
479 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
480 da->da_addrlen = sizeof(struct sockaddr_in6);
481 match_netid = "tcp6";
482 match_netid_len = 4;
483 startsep = "[";
484 endsep = "]";
485 break;
486
487 default:
488 dprintk("%s: unsupported address family: %u\n",
489 __func__, da->da_addr.ss_family);
490 goto out_free_da;
491 }
492
493 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
494 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
495 __func__, netid, match_netid);
496 goto out_free_da;
497 }
498
499 /* save human readable address */
500 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
501 da->da_remotestr = kzalloc(len, gfp_flags);
502
503 /* NULL is ok, only used for dprintk */
504 if (da->da_remotestr)
505 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
506 buf, endsep, ntohs(port));
507
508 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
294 kfree(buf); 509 kfree(buf);
510 kfree(netid);
511 return da;
512
513out_free_da:
514 kfree(da);
515out_free_buf:
516 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
517 kfree(buf);
518out_free_netid:
519 kfree(netid);
295out_err: 520out_err:
296 return ds; 521 return NULL;
297} 522}
298 523
299/* Decode opaque device data and return the result */ 524/* Decode opaque device data and return the result */
@@ -310,6 +535,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
310 struct xdr_stream stream; 535 struct xdr_stream stream;
311 struct xdr_buf buf; 536 struct xdr_buf buf;
312 struct page *scratch; 537 struct page *scratch;
538 struct list_head dsaddrs;
539 struct nfs4_pnfs_ds_addr *da;
313 540
314 /* set up xdr stream */ 541 /* set up xdr stream */
315 scratch = alloc_page(gfp_flags); 542 scratch = alloc_page(gfp_flags);
@@ -386,6 +613,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
386 NFS_SERVER(ino)->nfs_client, 613 NFS_SERVER(ino)->nfs_client,
387 &pdev->dev_id); 614 &pdev->dev_id);
388 615
616 INIT_LIST_HEAD(&dsaddrs);
617
389 for (i = 0; i < dsaddr->ds_num; i++) { 618 for (i = 0; i < dsaddr->ds_num; i++) {
390 int j; 619 int j;
391 u32 mp_count; 620 u32 mp_count;
@@ -395,48 +624,43 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
395 goto out_err_free_deviceid; 624 goto out_err_free_deviceid;
396 625
397 mp_count = be32_to_cpup(p); /* multipath count */ 626 mp_count = be32_to_cpup(p); /* multipath count */
398 if (mp_count > 1) {
399 printk(KERN_WARNING
400 "%s: Multipath count %d not supported, "
401 "skipping all greater than 1\n", __func__,
402 mp_count);
403 }
404 for (j = 0; j < mp_count; j++) { 627 for (j = 0; j < mp_count; j++) {
405 if (j == 0) { 628 da = decode_ds_addr(&stream, gfp_flags);
406 dsaddr->ds_list[i] = decode_and_add_ds(&stream, 629 if (da)
407 ino, gfp_flags); 630 list_add_tail(&da->da_node, &dsaddrs);
408 if (dsaddr->ds_list[i] == NULL) 631 }
409 goto out_err_free_deviceid; 632 if (list_empty(&dsaddrs)) {
410 } else { 633 dprintk("%s: no suitable DS addresses found\n",
411 u32 len; 634 __func__);
412 /* skip extra multipath */ 635 goto out_err_free_deviceid;
413 636 }
414 /* read len, skip */ 637
415 p = xdr_inline_decode(&stream, 4); 638 dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
416 if (unlikely(!p)) 639 if (!dsaddr->ds_list[i])
417 goto out_err_free_deviceid; 640 goto out_err_drain_dsaddrs;
418 len = be32_to_cpup(p); 641
419 642 /* If DS was already in cache, free ds addrs */
420 p = xdr_inline_decode(&stream, len); 643 while (!list_empty(&dsaddrs)) {
421 if (unlikely(!p)) 644 da = list_first_entry(&dsaddrs,
422 goto out_err_free_deviceid; 645 struct nfs4_pnfs_ds_addr,
423 646 da_node);
424 /* read len, skip */ 647 list_del_init(&da->da_node);
425 p = xdr_inline_decode(&stream, 4); 648 kfree(da->da_remotestr);
426 if (unlikely(!p)) 649 kfree(da);
427 goto out_err_free_deviceid;
428 len = be32_to_cpup(p);
429
430 p = xdr_inline_decode(&stream, len);
431 if (unlikely(!p))
432 goto out_err_free_deviceid;
433 }
434 } 650 }
435 } 651 }
436 652
437 __free_page(scratch); 653 __free_page(scratch);
438 return dsaddr; 654 return dsaddr;
439 655
656out_err_drain_dsaddrs:
657 while (!list_empty(&dsaddrs)) {
658 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
659 da_node);
660 list_del_init(&da->da_node);
661 kfree(da->da_remotestr);
662 kfree(da);
663 }
440out_err_free_deviceid: 664out_err_free_deviceid:
441 nfs4_fl_free_deviceid(dsaddr); 665 nfs4_fl_free_deviceid(dsaddr);
442 /* stripe_indicies was part of dsaddr */ 666 /* stripe_indicies was part of dsaddr */
@@ -591,13 +815,13 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
591 815
592static void 816static void
593filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, 817filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
594 int err, u32 ds_addr) 818 int err, const char *ds_remotestr)
595{ 819{
596 u32 *p = (u32 *)&dsaddr->id_node.deviceid; 820 u32 *p = (u32 *)&dsaddr->id_node.deviceid;
597 821
598 printk(KERN_ERR "NFS: data server %x connection error %d." 822 printk(KERN_ERR "NFS: data server %s connection error %d."
599 " Deviceid [%x%x%x%x] marked out of use.\n", 823 " Deviceid [%x%x%x%x] marked out of use.\n",
600 ds_addr, err, p[0], p[1], p[2], p[3]); 824 ds_remotestr, err, p[0], p[1], p[2], p[3]);
601 825
602 spin_lock(&nfs4_ds_cache_lock); 826 spin_lock(&nfs4_ds_cache_lock);
603 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; 827 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
@@ -628,7 +852,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
628 err = nfs4_ds_connect(s, ds); 852 err = nfs4_ds_connect(s, ds);
629 if (err) { 853 if (err) {
630 filelayout_mark_devid_negative(dsaddr, err, 854 filelayout_mark_devid_negative(dsaddr, err,
631 ntohl(ds->ds_ip_addr)); 855 ds->ds_remotestr);
632 return NULL; 856 return NULL;
633 } 857 }
634 } 858 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5879b23e0c9..003cb6955a2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -80,7 +80,10 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
80static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 80static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
81 struct nfs_fattr *fattr, struct iattr *sattr, 81 struct nfs_fattr *fattr, struct iattr *sattr,
82 struct nfs4_state *state); 82 struct nfs4_state *state);
83 83#ifdef CONFIG_NFS_V4_1
84static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *);
85static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *);
86#endif
84/* Prevent leaks of NFSv4 errors into userland */ 87/* Prevent leaks of NFSv4 errors into userland */
85static int nfs4_map_errors(int err) 88static int nfs4_map_errors(int err)
86{ 89{
@@ -137,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
137 0 140 0
138}; 141};
139 142
140const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE 143const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
141 | FATTR4_WORD0_MAXREAD 144 | FATTR4_WORD0_MAXREAD
142 | FATTR4_WORD0_MAXWRITE 145 | FATTR4_WORD0_MAXWRITE
143 | FATTR4_WORD0_LEASE_TIME, 146 | FATTR4_WORD0_LEASE_TIME,
144 FATTR4_WORD1_TIME_DELTA 147 FATTR4_WORD1_TIME_DELTA
145 | FATTR4_WORD1_FS_LAYOUT_TYPES 148 | FATTR4_WORD1_FS_LAYOUT_TYPES,
149 FATTR4_WORD2_LAYOUT_BLKSIZE
146}; 150};
147 151
148const u32 nfs4_fs_locations_bitmap[2] = { 152const u32 nfs4_fs_locations_bitmap[2] = {
@@ -763,8 +767,8 @@ struct nfs4_opendata {
763 struct nfs_open_confirmres c_res; 767 struct nfs_open_confirmres c_res;
764 struct nfs_fattr f_attr; 768 struct nfs_fattr f_attr;
765 struct nfs_fattr dir_attr; 769 struct nfs_fattr dir_attr;
766 struct path path;
767 struct dentry *dir; 770 struct dentry *dir;
771 struct dentry *dentry;
768 struct nfs4_state_owner *owner; 772 struct nfs4_state_owner *owner;
769 struct nfs4_state *state; 773 struct nfs4_state *state;
770 struct iattr attrs; 774 struct iattr attrs;
@@ -786,12 +790,12 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
786 nfs_fattr_init(&p->dir_attr); 790 nfs_fattr_init(&p->dir_attr);
787} 791}
788 792
789static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 793static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
790 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 794 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
791 const struct iattr *attrs, 795 const struct iattr *attrs,
792 gfp_t gfp_mask) 796 gfp_t gfp_mask)
793{ 797{
794 struct dentry *parent = dget_parent(path->dentry); 798 struct dentry *parent = dget_parent(dentry);
795 struct inode *dir = parent->d_inode; 799 struct inode *dir = parent->d_inode;
796 struct nfs_server *server = NFS_SERVER(dir); 800 struct nfs_server *server = NFS_SERVER(dir);
797 struct nfs4_opendata *p; 801 struct nfs4_opendata *p;
@@ -802,8 +806,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
802 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); 806 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
803 if (p->o_arg.seqid == NULL) 807 if (p->o_arg.seqid == NULL)
804 goto err_free; 808 goto err_free;
805 path_get(path); 809 nfs_sb_active(dentry->d_sb);
806 p->path = *path; 810 p->dentry = dget(dentry);
807 p->dir = parent; 811 p->dir = parent;
808 p->owner = sp; 812 p->owner = sp;
809 atomic_inc(&sp->so_count); 813 atomic_inc(&sp->so_count);
@@ -812,7 +816,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
812 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); 816 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
813 p->o_arg.clientid = server->nfs_client->cl_clientid; 817 p->o_arg.clientid = server->nfs_client->cl_clientid;
814 p->o_arg.id = sp->so_owner_id.id; 818 p->o_arg.id = sp->so_owner_id.id;
815 p->o_arg.name = &p->path.dentry->d_name; 819 p->o_arg.name = &dentry->d_name;
816 p->o_arg.server = server; 820 p->o_arg.server = server;
817 p->o_arg.bitmask = server->attr_bitmask; 821 p->o_arg.bitmask = server->attr_bitmask;
818 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 822 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
@@ -842,13 +846,15 @@ static void nfs4_opendata_free(struct kref *kref)
842{ 846{
843 struct nfs4_opendata *p = container_of(kref, 847 struct nfs4_opendata *p = container_of(kref,
844 struct nfs4_opendata, kref); 848 struct nfs4_opendata, kref);
849 struct super_block *sb = p->dentry->d_sb;
845 850
846 nfs_free_seqid(p->o_arg.seqid); 851 nfs_free_seqid(p->o_arg.seqid);
847 if (p->state != NULL) 852 if (p->state != NULL)
848 nfs4_put_open_state(p->state); 853 nfs4_put_open_state(p->state);
849 nfs4_put_state_owner(p->owner); 854 nfs4_put_state_owner(p->owner);
850 dput(p->dir); 855 dput(p->dir);
851 path_put(&p->path); 856 dput(p->dentry);
857 nfs_sb_deactive(sb);
852 kfree(p); 858 kfree(p);
853} 859}
854 860
@@ -1130,7 +1136,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1130{ 1136{
1131 struct nfs4_opendata *opendata; 1137 struct nfs4_opendata *opendata;
1132 1138
1133 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); 1139 opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS);
1134 if (opendata == NULL) 1140 if (opendata == NULL)
1135 return ERR_PTR(-ENOMEM); 1141 return ERR_PTR(-ENOMEM);
1136 opendata->state = state; 1142 opendata->state = state;
@@ -1154,7 +1160,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
1154 newstate = nfs4_opendata_to_nfs4_state(opendata); 1160 newstate = nfs4_opendata_to_nfs4_state(opendata);
1155 if (IS_ERR(newstate)) 1161 if (IS_ERR(newstate))
1156 return PTR_ERR(newstate); 1162 return PTR_ERR(newstate);
1157 nfs4_close_state(&opendata->path, newstate, fmode); 1163 nfs4_close_state(newstate, fmode);
1158 *res = newstate; 1164 *res = newstate;
1159 return 0; 1165 return 0;
1160} 1166}
@@ -1352,7 +1358,7 @@ static void nfs4_open_confirm_release(void *calldata)
1352 goto out_free; 1358 goto out_free;
1353 state = nfs4_opendata_to_nfs4_state(data); 1359 state = nfs4_opendata_to_nfs4_state(data);
1354 if (!IS_ERR(state)) 1360 if (!IS_ERR(state))
1355 nfs4_close_state(&data->path, state, data->o_arg.fmode); 1361 nfs4_close_state(state, data->o_arg.fmode);
1356out_free: 1362out_free:
1357 nfs4_opendata_put(data); 1363 nfs4_opendata_put(data);
1358} 1364}
@@ -1497,7 +1503,7 @@ static void nfs4_open_release(void *calldata)
1497 goto out_free; 1503 goto out_free;
1498 state = nfs4_opendata_to_nfs4_state(data); 1504 state = nfs4_opendata_to_nfs4_state(data);
1499 if (!IS_ERR(state)) 1505 if (!IS_ERR(state))
1500 nfs4_close_state(&data->path, state, data->o_arg.fmode); 1506 nfs4_close_state(state, data->o_arg.fmode);
1501out_free: 1507out_free:
1502 nfs4_opendata_put(data); 1508 nfs4_opendata_put(data);
1503} 1509}
@@ -1648,7 +1654,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
1648 return PTR_ERR(opendata); 1654 return PTR_ERR(opendata);
1649 ret = nfs4_open_recover(opendata, state); 1655 ret = nfs4_open_recover(opendata, state);
1650 if (ret == -ESTALE) 1656 if (ret == -ESTALE)
1651 d_drop(ctx->path.dentry); 1657 d_drop(ctx->dentry);
1652 nfs4_opendata_put(opendata); 1658 nfs4_opendata_put(opendata);
1653 return ret; 1659 return ret;
1654} 1660}
@@ -1687,6 +1693,20 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
1687 return ret; 1693 return ret;
1688} 1694}
1689 1695
1696#if defined(CONFIG_NFS_V4_1)
1697static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
1698{
1699 int status;
1700 struct nfs_server *server = NFS_SERVER(state->inode);
1701
1702 status = nfs41_test_stateid(server, state);
1703 if (status == NFS_OK)
1704 return 0;
1705 nfs41_free_stateid(server, state);
1706 return nfs4_open_expired(sp, state);
1707}
1708#endif
1709
1690/* 1710/*
1691 * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* 1711 * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
1692 * fields corresponding to attributes that were used to store the verifier. 1712 * fields corresponding to attributes that were used to store the verifier.
@@ -1706,7 +1726,7 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
1706/* 1726/*
1707 * Returns a referenced nfs4_state 1727 * Returns a referenced nfs4_state
1708 */ 1728 */
1709static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) 1729static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
1710{ 1730{
1711 struct nfs4_state_owner *sp; 1731 struct nfs4_state_owner *sp;
1712 struct nfs4_state *state = NULL; 1732 struct nfs4_state *state = NULL;
@@ -1723,15 +1743,15 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1723 status = nfs4_recover_expired_lease(server); 1743 status = nfs4_recover_expired_lease(server);
1724 if (status != 0) 1744 if (status != 0)
1725 goto err_put_state_owner; 1745 goto err_put_state_owner;
1726 if (path->dentry->d_inode != NULL) 1746 if (dentry->d_inode != NULL)
1727 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); 1747 nfs4_return_incompatible_delegation(dentry->d_inode, fmode);
1728 status = -ENOMEM; 1748 status = -ENOMEM;
1729 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); 1749 opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, GFP_KERNEL);
1730 if (opendata == NULL) 1750 if (opendata == NULL)
1731 goto err_put_state_owner; 1751 goto err_put_state_owner;
1732 1752
1733 if (path->dentry->d_inode != NULL) 1753 if (dentry->d_inode != NULL)
1734 opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); 1754 opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
1735 1755
1736 status = _nfs4_proc_open(opendata); 1756 status = _nfs4_proc_open(opendata);
1737 if (status != 0) 1757 if (status != 0)
@@ -1769,14 +1789,14 @@ out_err:
1769} 1789}
1770 1790
1771 1791
1772static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) 1792static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
1773{ 1793{
1774 struct nfs4_exception exception = { }; 1794 struct nfs4_exception exception = { };
1775 struct nfs4_state *res; 1795 struct nfs4_state *res;
1776 int status; 1796 int status;
1777 1797
1778 do { 1798 do {
1779 status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res); 1799 status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res);
1780 if (status == 0) 1800 if (status == 0)
1781 break; 1801 break;
1782 /* NOTE: BAD_SEQID means the server and client disagree about the 1802 /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1873,7 +1893,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1873} 1893}
1874 1894
1875struct nfs4_closedata { 1895struct nfs4_closedata {
1876 struct path path;
1877 struct inode *inode; 1896 struct inode *inode;
1878 struct nfs4_state *state; 1897 struct nfs4_state *state;
1879 struct nfs_closeargs arg; 1898 struct nfs_closeargs arg;
@@ -1888,13 +1907,14 @@ static void nfs4_free_closedata(void *data)
1888{ 1907{
1889 struct nfs4_closedata *calldata = data; 1908 struct nfs4_closedata *calldata = data;
1890 struct nfs4_state_owner *sp = calldata->state->owner; 1909 struct nfs4_state_owner *sp = calldata->state->owner;
1910 struct super_block *sb = calldata->state->inode->i_sb;
1891 1911
1892 if (calldata->roc) 1912 if (calldata->roc)
1893 pnfs_roc_release(calldata->state->inode); 1913 pnfs_roc_release(calldata->state->inode);
1894 nfs4_put_open_state(calldata->state); 1914 nfs4_put_open_state(calldata->state);
1895 nfs_free_seqid(calldata->arg.seqid); 1915 nfs_free_seqid(calldata->arg.seqid);
1896 nfs4_put_state_owner(sp); 1916 nfs4_put_state_owner(sp);
1897 path_put(&calldata->path); 1917 nfs_sb_deactive(sb);
1898 kfree(calldata); 1918 kfree(calldata);
1899} 1919}
1900 1920
@@ -2014,7 +2034,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
2014 * 2034 *
2015 * NOTE: Caller must be holding the sp->so_owner semaphore! 2035 * NOTE: Caller must be holding the sp->so_owner semaphore!
2016 */ 2036 */
2017int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) 2037int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
2018{ 2038{
2019 struct nfs_server *server = NFS_SERVER(state->inode); 2039 struct nfs_server *server = NFS_SERVER(state->inode);
2020 struct nfs4_closedata *calldata; 2040 struct nfs4_closedata *calldata;
@@ -2050,8 +2070,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
2050 calldata->res.seqid = calldata->arg.seqid; 2070 calldata->res.seqid = calldata->arg.seqid;
2051 calldata->res.server = server; 2071 calldata->res.server = server;
2052 calldata->roc = roc; 2072 calldata->roc = roc;
2053 path_get(path); 2073 nfs_sb_active(calldata->inode->i_sb);
2054 calldata->path = *path;
2055 2074
2056 msg.rpc_argp = &calldata->arg; 2075 msg.rpc_argp = &calldata->arg;
2057 msg.rpc_resp = &calldata->res; 2076 msg.rpc_resp = &calldata->res;
@@ -2080,7 +2099,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
2080 struct nfs4_state *state; 2099 struct nfs4_state *state;
2081 2100
2082 /* Protect against concurrent sillydeletes */ 2101 /* Protect against concurrent sillydeletes */
2083 state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); 2102 state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, ctx->cred);
2084 if (IS_ERR(state)) 2103 if (IS_ERR(state))
2085 return ERR_CAST(state); 2104 return ERR_CAST(state);
2086 ctx->state = state; 2105 ctx->state = state;
@@ -2092,9 +2111,9 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
2092 if (ctx->state == NULL) 2111 if (ctx->state == NULL)
2093 return; 2112 return;
2094 if (is_sync) 2113 if (is_sync)
2095 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); 2114 nfs4_close_sync(ctx->state, ctx->mode);
2096 else 2115 else
2097 nfs4_close_state(&ctx->path, ctx->state, ctx->mode); 2116 nfs4_close_state(ctx->state, ctx->mode);
2098} 2117}
2099 2118
2100static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 2119static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
@@ -2251,13 +2270,14 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2251static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 2270static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
2252 struct nfs_fsinfo *info) 2271 struct nfs_fsinfo *info)
2253{ 2272{
2273 int minor_version = server->nfs_client->cl_minorversion;
2254 int status = nfs4_lookup_root(server, fhandle, info); 2274 int status = nfs4_lookup_root(server, fhandle, info);
2255 if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) 2275 if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR))
2256 /* 2276 /*
2257 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM 2277 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM
2258 * by nfs4_map_errors() as this function exits. 2278 * by nfs4_map_errors() as this function exits.
2259 */ 2279 */
2260 status = nfs4_find_root_sec(server, fhandle, info); 2280 status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info);
2261 if (status == 0) 2281 if (status == 0)
2262 status = nfs4_server_capabilities(server, fhandle); 2282 status = nfs4_server_capabilities(server, fhandle);
2263 if (status == 0) 2283 if (status == 0)
@@ -2616,10 +2636,7 @@ static int
2616nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 2636nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2617 int flags, struct nfs_open_context *ctx) 2637 int flags, struct nfs_open_context *ctx)
2618{ 2638{
2619 struct path my_path = { 2639 struct dentry *de = dentry;
2620 .dentry = dentry,
2621 };
2622 struct path *path = &my_path;
2623 struct nfs4_state *state; 2640 struct nfs4_state *state;
2624 struct rpc_cred *cred = NULL; 2641 struct rpc_cred *cred = NULL;
2625 fmode_t fmode = 0; 2642 fmode_t fmode = 0;
@@ -2627,11 +2644,11 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2627 2644
2628 if (ctx != NULL) { 2645 if (ctx != NULL) {
2629 cred = ctx->cred; 2646 cred = ctx->cred;
2630 path = &ctx->path; 2647 de = ctx->dentry;
2631 fmode = ctx->mode; 2648 fmode = ctx->mode;
2632 } 2649 }
2633 sattr->ia_mode &= ~current_umask(); 2650 sattr->ia_mode &= ~current_umask();
2634 state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); 2651 state = nfs4_do_open(dir, de, fmode, flags, sattr, cred);
2635 d_drop(dentry); 2652 d_drop(dentry);
2636 if (IS_ERR(state)) { 2653 if (IS_ERR(state)) {
2637 status = PTR_ERR(state); 2654 status = PTR_ERR(state);
@@ -2642,7 +2659,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
2642 if (ctx != NULL) 2659 if (ctx != NULL)
2643 ctx->state = state; 2660 ctx->state = state;
2644 else 2661 else
2645 nfs4_close_sync(path, state, fmode); 2662 nfs4_close_sync(state, fmode);
2646out: 2663out:
2647 return status; 2664 return status;
2648} 2665}
@@ -3357,9 +3374,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3357 3374
3358 if (task->tk_status < 0) { 3375 if (task->tk_status < 0) {
3359 /* Unless we're shutting down, schedule state recovery! */ 3376 /* Unless we're shutting down, schedule state recovery! */
3360 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3377 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
3378 return;
3379 if (task->tk_status != NFS4ERR_CB_PATH_DOWN) {
3361 nfs4_schedule_lease_recovery(clp); 3380 nfs4_schedule_lease_recovery(clp);
3362 return; 3381 return;
3382 }
3383 nfs4_schedule_path_down_recovery(clp);
3363 } 3384 }
3364 do_renew_lease(clp, timestamp); 3385 do_renew_lease(clp, timestamp);
3365} 3386}
@@ -3369,7 +3390,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
3369 .rpc_release = nfs4_renew_release, 3390 .rpc_release = nfs4_renew_release,
3370}; 3391};
3371 3392
3372int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) 3393static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
3373{ 3394{
3374 struct rpc_message msg = { 3395 struct rpc_message msg = {
3375 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 3396 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -3378,9 +3399,11 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3378 }; 3399 };
3379 struct nfs4_renewdata *data; 3400 struct nfs4_renewdata *data;
3380 3401
3402 if (renew_flags == 0)
3403 return 0;
3381 if (!atomic_inc_not_zero(&clp->cl_count)) 3404 if (!atomic_inc_not_zero(&clp->cl_count))
3382 return -EIO; 3405 return -EIO;
3383 data = kmalloc(sizeof(*data), GFP_KERNEL); 3406 data = kmalloc(sizeof(*data), GFP_NOFS);
3384 if (data == NULL) 3407 if (data == NULL)
3385 return -ENOMEM; 3408 return -ENOMEM;
3386 data->client = clp; 3409 data->client = clp;
@@ -3389,7 +3412,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3389 &nfs4_renew_ops, data); 3412 &nfs4_renew_ops, data);
3390} 3413}
3391 3414
3392int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3415static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
3393{ 3416{
3394 struct rpc_message msg = { 3417 struct rpc_message msg = {
3395 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 3418 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -3419,19 +3442,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
3419 */ 3442 */
3420#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) 3443#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
3421 3444
3422static void buf_to_pages(const void *buf, size_t buflen,
3423 struct page **pages, unsigned int *pgbase)
3424{
3425 const void *p = buf;
3426
3427 *pgbase = offset_in_page(buf);
3428 p -= *pgbase;
3429 while (p < buf + buflen) {
3430 *(pages++) = virt_to_page(p);
3431 p += PAGE_CACHE_SIZE;
3432 }
3433}
3434
3435static int buf_to_pages_noslab(const void *buf, size_t buflen, 3445static int buf_to_pages_noslab(const void *buf, size_t buflen,
3436 struct page **pages, unsigned int *pgbase) 3446 struct page **pages, unsigned int *pgbase)
3437{ 3447{
@@ -3528,9 +3538,19 @@ out:
3528 nfs4_set_cached_acl(inode, acl); 3538 nfs4_set_cached_acl(inode, acl);
3529} 3539}
3530 3540
3541/*
3542 * The getxattr API returns the required buffer length when called with a
3543 * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
3544 * the required buf. On a NULL buf, we send a page of data to the server
3545 * guessing that the ACL request can be serviced by a page. If so, we cache
3546 * up to the page of ACL data, and the 2nd call to getxattr is serviced by
3547 * the cache. If not so, we throw away the page, and cache the required
3548 * length. The next getxattr call will then produce another round trip to
3549 * the server, this time with the input buf of the required size.
3550 */
3531static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) 3551static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
3532{ 3552{
3533 struct page *pages[NFS4ACL_MAXPAGES]; 3553 struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };
3534 struct nfs_getaclargs args = { 3554 struct nfs_getaclargs args = {
3535 .fh = NFS_FH(inode), 3555 .fh = NFS_FH(inode),
3536 .acl_pages = pages, 3556 .acl_pages = pages,
@@ -3545,41 +3565,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3545 .rpc_argp = &args, 3565 .rpc_argp = &args,
3546 .rpc_resp = &res, 3566 .rpc_resp = &res,
3547 }; 3567 };
3548 struct page *localpage = NULL; 3568 int ret = -ENOMEM, npages, i, acl_len = 0;
3549 int ret;
3550 3569
3551 if (buflen < PAGE_SIZE) { 3570 npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3552 /* As long as we're doing a round trip to the server anyway, 3571 /* As long as we're doing a round trip to the server anyway,
3553 * let's be prepared for a page of acl data. */ 3572 * let's be prepared for a page of acl data. */
3554 localpage = alloc_page(GFP_KERNEL); 3573 if (npages == 0)
3555 resp_buf = page_address(localpage); 3574 npages = 1;
3556 if (localpage == NULL) 3575
3557 return -ENOMEM; 3576 for (i = 0; i < npages; i++) {
3558 args.acl_pages[0] = localpage; 3577 pages[i] = alloc_page(GFP_KERNEL);
3559 args.acl_pgbase = 0; 3578 if (!pages[i])
3560 args.acl_len = PAGE_SIZE; 3579 goto out_free;
3561 } else {
3562 resp_buf = buf;
3563 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
3564 } 3580 }
3565 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); 3581 if (npages > 1) {
3582 /* for decoding across pages */
3583 args.acl_scratch = alloc_page(GFP_KERNEL);
3584 if (!args.acl_scratch)
3585 goto out_free;
3586 }
3587 args.acl_len = npages * PAGE_SIZE;
3588 args.acl_pgbase = 0;
3589 /* Let decode_getfacl know not to fail if the ACL data is larger than
3590 * the page we send as a guess */
3591 if (buf == NULL)
3592 res.acl_flags |= NFS4_ACL_LEN_REQUEST;
3593 resp_buf = page_address(pages[0]);
3594
3595 dprintk("%s buf %p buflen %ld npages %d args.acl_len %ld\n",
3596 __func__, buf, buflen, npages, args.acl_len);
3597 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
3598 &msg, &args.seq_args, &res.seq_res, 0);
3566 if (ret) 3599 if (ret)
3567 goto out_free; 3600 goto out_free;
3568 if (res.acl_len > args.acl_len) 3601
3569 nfs4_write_cached_acl(inode, NULL, res.acl_len); 3602 acl_len = res.acl_len - res.acl_data_offset;
3603 if (acl_len > args.acl_len)
3604 nfs4_write_cached_acl(inode, NULL, acl_len);
3570 else 3605 else
3571 nfs4_write_cached_acl(inode, resp_buf, res.acl_len); 3606 nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset,
3607 acl_len);
3572 if (buf) { 3608 if (buf) {
3573 ret = -ERANGE; 3609 ret = -ERANGE;
3574 if (res.acl_len > buflen) 3610 if (acl_len > buflen)
3575 goto out_free; 3611 goto out_free;
3576 if (localpage) 3612 _copy_from_pages(buf, pages, res.acl_data_offset,
3577 memcpy(buf, resp_buf, res.acl_len); 3613 res.acl_len);
3578 } 3614 }
3579 ret = res.acl_len; 3615 ret = acl_len;
3580out_free: 3616out_free:
3581 if (localpage) 3617 for (i = 0; i < npages; i++)
3582 __free_page(localpage); 3618 if (pages[i])
3619 __free_page(pages[i]);
3620 if (args.acl_scratch)
3621 __free_page(args.acl_scratch);
3583 return ret; 3622 return ret;
3584} 3623}
3585 3624
@@ -3610,6 +3649,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3610 nfs_zap_acl_cache(inode); 3649 nfs_zap_acl_cache(inode);
3611 ret = nfs4_read_cached_acl(inode, buf, buflen); 3650 ret = nfs4_read_cached_acl(inode, buf, buflen);
3612 if (ret != -ENOENT) 3651 if (ret != -ENOENT)
3652 /* -ENOENT is returned if there is no ACL or if there is an ACL
3653 * but no cached acl data, just the acl length */
3613 return ret; 3654 return ret;
3614 return nfs4_get_acl_uncached(inode, buf, buflen); 3655 return nfs4_get_acl_uncached(inode, buf, buflen);
3615} 3656}
@@ -4294,7 +4335,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
4294 memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, 4335 memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
4295 sizeof(data->lsp->ls_stateid.data)); 4336 sizeof(data->lsp->ls_stateid.data));
4296 data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; 4337 data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
4297 renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); 4338 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
4298 } 4339 }
4299out: 4340out:
4300 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); 4341 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
@@ -4443,6 +4484,20 @@ out:
4443 return err; 4484 return err;
4444} 4485}
4445 4486
4487#if defined(CONFIG_NFS_V4_1)
4488static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
4489{
4490 int status;
4491 struct nfs_server *server = NFS_SERVER(state->inode);
4492
4493 status = nfs41_test_stateid(server, state);
4494 if (status == NFS_OK)
4495 return 0;
4496 nfs41_free_stateid(server, state);
4497 return nfs4_lock_expired(state, request);
4498}
4499#endif
4500
4446static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 4501static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
4447{ 4502{
4448 struct nfs_inode *nfsi = NFS_I(state->inode); 4503 struct nfs_inode *nfsi = NFS_I(state->inode);
@@ -4781,6 +4836,16 @@ out_inval:
4781 return -NFS4ERR_INVAL; 4836 return -NFS4ERR_INVAL;
4782} 4837}
4783 4838
4839static bool
4840nfs41_same_server_scope(struct server_scope *a, struct server_scope *b)
4841{
4842 if (a->server_scope_sz == b->server_scope_sz &&
4843 memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0)
4844 return true;
4845
4846 return false;
4847}
4848
4784/* 4849/*
4785 * nfs4_proc_exchange_id() 4850 * nfs4_proc_exchange_id()
4786 * 4851 *
@@ -4823,9 +4888,31 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4823 init_utsname()->domainname, 4888 init_utsname()->domainname,
4824 clp->cl_rpcclient->cl_auth->au_flavor); 4889 clp->cl_rpcclient->cl_auth->au_flavor);
4825 4890
4891 res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
4892 if (unlikely(!res.server_scope))
4893 return -ENOMEM;
4894
4826 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 4895 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
4827 if (!status) 4896 if (!status)
4828 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); 4897 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4898
4899 if (!status) {
4900 if (clp->server_scope &&
4901 !nfs41_same_server_scope(clp->server_scope,
4902 res.server_scope)) {
4903 dprintk("%s: server_scope mismatch detected\n",
4904 __func__);
4905 set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
4906 kfree(clp->server_scope);
4907 clp->server_scope = NULL;
4908 }
4909
4910 if (!clp->server_scope)
4911 clp->server_scope = res.server_scope;
4912 else
4913 kfree(res.server_scope);
4914 }
4915
4829 dprintk("<-- %s status= %d\n", __func__, status); 4916 dprintk("<-- %s status= %d\n", __func__, status);
4830 return status; 4917 return status;
4831} 4918}
@@ -5441,11 +5528,13 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
5441 return rpc_run_task(&task_setup_data); 5528 return rpc_run_task(&task_setup_data);
5442} 5529}
5443 5530
5444static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5531static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
5445{ 5532{
5446 struct rpc_task *task; 5533 struct rpc_task *task;
5447 int ret = 0; 5534 int ret = 0;
5448 5535
5536 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
5537 return 0;
5449 task = _nfs41_proc_sequence(clp, cred); 5538 task = _nfs41_proc_sequence(clp, cred);
5450 if (IS_ERR(task)) 5539 if (IS_ERR(task))
5451 ret = PTR_ERR(task); 5540 ret = PTR_ERR(task);
@@ -5706,7 +5795,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
5706{ 5795{
5707 struct nfs4_layoutreturn *lrp = calldata; 5796 struct nfs4_layoutreturn *lrp = calldata;
5708 struct nfs_server *server; 5797 struct nfs_server *server;
5709 struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; 5798 struct pnfs_layout_hdr *lo = lrp->args.layout;
5710 5799
5711 dprintk("--> %s\n", __func__); 5800 dprintk("--> %s\n", __func__);
5712 5801
@@ -5735,7 +5824,7 @@ static void nfs4_layoutreturn_release(void *calldata)
5735 struct nfs4_layoutreturn *lrp = calldata; 5824 struct nfs4_layoutreturn *lrp = calldata;
5736 5825
5737 dprintk("--> %s\n", __func__); 5826 dprintk("--> %s\n", __func__);
5738 put_layout_hdr(NFS_I(lrp->args.inode)->layout); 5827 put_layout_hdr(lrp->args.layout);
5739 kfree(calldata); 5828 kfree(calldata);
5740 dprintk("<-- %s\n", __func__); 5829 dprintk("<-- %s\n", __func__);
5741} 5830}
@@ -5772,6 +5861,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
5772 return status; 5861 return status;
5773} 5862}
5774 5863
5864/*
5865 * Retrieve the list of Data Server devices from the MDS.
5866 */
5867static int _nfs4_getdevicelist(struct nfs_server *server,
5868 const struct nfs_fh *fh,
5869 struct pnfs_devicelist *devlist)
5870{
5871 struct nfs4_getdevicelist_args args = {
5872 .fh = fh,
5873 .layoutclass = server->pnfs_curr_ld->id,
5874 };
5875 struct nfs4_getdevicelist_res res = {
5876 .devlist = devlist,
5877 };
5878 struct rpc_message msg = {
5879 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
5880 .rpc_argp = &args,
5881 .rpc_resp = &res,
5882 };
5883 int status;
5884
5885 dprintk("--> %s\n", __func__);
5886 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
5887 &res.seq_res, 0);
5888 dprintk("<-- %s status=%d\n", __func__, status);
5889 return status;
5890}
5891
5892int nfs4_proc_getdevicelist(struct nfs_server *server,
5893 const struct nfs_fh *fh,
5894 struct pnfs_devicelist *devlist)
5895{
5896 struct nfs4_exception exception = { };
5897 int err;
5898
5899 do {
5900 err = nfs4_handle_exception(server,
5901 _nfs4_getdevicelist(server, fh, devlist),
5902 &exception);
5903 } while (exception.retry);
5904
5905 dprintk("%s: err=%d, num_devs=%u\n", __func__,
5906 err, devlist->num_devs);
5907
5908 return err;
5909}
5910EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
5911
5775static int 5912static int
5776_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5913_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5777{ 5914{
@@ -5850,9 +5987,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5850static void nfs4_layoutcommit_release(void *calldata) 5987static void nfs4_layoutcommit_release(void *calldata)
5851{ 5988{
5852 struct nfs4_layoutcommit_data *data = calldata; 5989 struct nfs4_layoutcommit_data *data = calldata;
5990 struct pnfs_layout_segment *lseg, *tmp;
5853 5991
5992 pnfs_cleanup_layoutcommit(data);
5854 /* Matched by references in pnfs_set_layoutcommit */ 5993 /* Matched by references in pnfs_set_layoutcommit */
5855 put_lseg(data->lseg); 5994 list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
5995 list_del_init(&lseg->pls_lc_list);
5996 if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
5997 &lseg->pls_flags))
5998 put_lseg(lseg);
5999 }
5856 put_rpccred(data->cred); 6000 put_rpccred(data->cred);
5857 kfree(data); 6001 kfree(data);
5858} 6002}
@@ -5903,6 +6047,143 @@ out:
5903 rpc_put_task(task); 6047 rpc_put_task(task);
5904 return status; 6048 return status;
5905} 6049}
6050
6051static int
6052_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
6053 struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
6054{
6055 struct nfs41_secinfo_no_name_args args = {
6056 .style = SECINFO_STYLE_CURRENT_FH,
6057 };
6058 struct nfs4_secinfo_res res = {
6059 .flavors = flavors,
6060 };
6061 struct rpc_message msg = {
6062 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME],
6063 .rpc_argp = &args,
6064 .rpc_resp = &res,
6065 };
6066 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
6067}
6068
6069static int
6070nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
6071 struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
6072{
6073 struct nfs4_exception exception = { };
6074 int err;
6075 do {
6076 err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
6077 switch (err) {
6078 case 0:
6079 case -NFS4ERR_WRONGSEC:
6080 case -NFS4ERR_NOTSUPP:
6081 break;
6082 default:
6083 err = nfs4_handle_exception(server, err, &exception);
6084 }
6085 } while (exception.retry);
6086 return err;
6087}
6088
6089static int
6090nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
6091 struct nfs_fsinfo *info)
6092{
6093 int err;
6094 struct page *page;
6095 rpc_authflavor_t flavor;
6096 struct nfs4_secinfo_flavors *flavors;
6097
6098 page = alloc_page(GFP_KERNEL);
6099 if (!page) {
6100 err = -ENOMEM;
6101 goto out;
6102 }
6103
6104 flavors = page_address(page);
6105 err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
6106
6107 /*
6108 * Fall back on "guess and check" method if
6109 * the server doesn't support SECINFO_NO_NAME
6110 */
6111 if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) {
6112 err = nfs4_find_root_sec(server, fhandle, info);
6113 goto out_freepage;
6114 }
6115 if (err)
6116 goto out_freepage;
6117
6118 flavor = nfs_find_best_sec(flavors);
6119 if (err == 0)
6120 err = nfs4_lookup_root_sec(server, fhandle, info, flavor);
6121
6122out_freepage:
6123 put_page(page);
6124 if (err == -EACCES)
6125 return -EPERM;
6126out:
6127 return err;
6128}
6129static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
6130{
6131 int status;
6132 struct nfs41_test_stateid_args args = {
6133 .stateid = &state->stateid,
6134 };
6135 struct nfs41_test_stateid_res res;
6136 struct rpc_message msg = {
6137 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
6138 .rpc_argp = &args,
6139 .rpc_resp = &res,
6140 };
6141 args.seq_args.sa_session = res.seq_res.sr_session = NULL;
6142 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
6143 return status;
6144}
6145
6146static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
6147{
6148 struct nfs4_exception exception = { };
6149 int err;
6150 do {
6151 err = nfs4_handle_exception(server,
6152 _nfs41_test_stateid(server, state),
6153 &exception);
6154 } while (exception.retry);
6155 return err;
6156}
6157
6158static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state)
6159{
6160 int status;
6161 struct nfs41_free_stateid_args args = {
6162 .stateid = &state->stateid,
6163 };
6164 struct nfs41_free_stateid_res res;
6165 struct rpc_message msg = {
6166 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
6167 .rpc_argp = &args,
6168 .rpc_resp = &res,
6169 };
6170
6171 args.seq_args.sa_session = res.seq_res.sr_session = NULL;
6172 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
6173 return status;
6174}
6175
6176static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state)
6177{
6178 struct nfs4_exception exception = { };
6179 int err;
6180 do {
6181 err = nfs4_handle_exception(server,
6182 _nfs4_free_stateid(server, state),
6183 &exception);
6184 } while (exception.retry);
6185 return err;
6186}
5906#endif /* CONFIG_NFS_V4_1 */ 6187#endif /* CONFIG_NFS_V4_1 */
5907 6188
5908struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 6189struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5939,8 +6220,8 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
5939struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { 6220struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
5940 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, 6221 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
5941 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, 6222 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
5942 .recover_open = nfs4_open_expired, 6223 .recover_open = nfs41_open_expired,
5943 .recover_lock = nfs4_lock_expired, 6224 .recover_lock = nfs41_lock_expired,
5944 .establish_clid = nfs41_init_clientid, 6225 .establish_clid = nfs41_init_clientid,
5945 .get_clid_cred = nfs4_get_exchange_id_cred, 6226 .get_clid_cred = nfs4_get_exchange_id_cred,
5946}; 6227};
@@ -5964,6 +6245,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
5964 .minor_version = 0, 6245 .minor_version = 0,
5965 .call_sync = _nfs4_call_sync, 6246 .call_sync = _nfs4_call_sync,
5966 .validate_stateid = nfs4_validate_delegation_stateid, 6247 .validate_stateid = nfs4_validate_delegation_stateid,
6248 .find_root_sec = nfs4_find_root_sec,
5967 .reboot_recovery_ops = &nfs40_reboot_recovery_ops, 6249 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
5968 .nograce_recovery_ops = &nfs40_nograce_recovery_ops, 6250 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
5969 .state_renewal_ops = &nfs40_state_renewal_ops, 6251 .state_renewal_ops = &nfs40_state_renewal_ops,
@@ -5974,6 +6256,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
5974 .minor_version = 1, 6256 .minor_version = 1,
5975 .call_sync = _nfs4_call_sync_session, 6257 .call_sync = _nfs4_call_sync_session,
5976 .validate_stateid = nfs41_validate_delegation_stateid, 6258 .validate_stateid = nfs41_validate_delegation_stateid,
6259 .find_root_sec = nfs41_find_root_sec,
5977 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 6260 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
5978 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 6261 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
5979 .state_renewal_ops = &nfs41_state_renewal_ops, 6262 .state_renewal_ops = &nfs41_state_renewal_ops,
@@ -6002,6 +6285,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
6002 .dentry_ops = &nfs4_dentry_operations, 6285 .dentry_ops = &nfs4_dentry_operations,
6003 .dir_inode_ops = &nfs4_dir_inode_operations, 6286 .dir_inode_ops = &nfs4_dir_inode_operations,
6004 .file_inode_ops = &nfs4_file_inode_operations, 6287 .file_inode_ops = &nfs4_file_inode_operations,
6288 .file_ops = &nfs4_file_operations,
6005 .getroot = nfs4_proc_get_root, 6289 .getroot = nfs4_proc_get_root,
6006 .getattr = nfs4_proc_getattr, 6290 .getattr = nfs4_proc_getattr,
6007 .setattr = nfs4_proc_setattr, 6291 .setattr = nfs4_proc_setattr,
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index df8e7f3ca56..dc484c0eae7 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -60,6 +60,7 @@ nfs4_renew_state(struct work_struct *work)
60 struct rpc_cred *cred; 60 struct rpc_cred *cred;
61 long lease; 61 long lease;
62 unsigned long last, now; 62 unsigned long last, now;
63 unsigned renew_flags = 0;
63 64
64 ops = clp->cl_mvops->state_renewal_ops; 65 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 66 dprintk("%s: start\n", __func__);
@@ -72,18 +73,23 @@ nfs4_renew_state(struct work_struct *work)
72 last = clp->cl_last_renewal; 73 last = clp->cl_last_renewal;
73 now = jiffies; 74 now = jiffies;
74 /* Are we close to a lease timeout? */ 75 /* Are we close to a lease timeout? */
75 if (time_after(now, last + lease/3)) { 76 if (time_after(now, last + lease/3))
77 renew_flags |= NFS4_RENEW_TIMEOUT;
78 if (nfs_delegations_present(clp))
79 renew_flags |= NFS4_RENEW_DELEGATION_CB;
80
81 if (renew_flags != 0) {
76 cred = ops->get_state_renewal_cred_locked(clp); 82 cred = ops->get_state_renewal_cred_locked(clp);
77 spin_unlock(&clp->cl_lock); 83 spin_unlock(&clp->cl_lock);
78 if (cred == NULL) { 84 if (cred == NULL) {
79 if (!nfs_delegations_present(clp)) { 85 if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
80 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 86 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
81 goto out; 87 goto out;
82 } 88 }
83 nfs_expire_all_delegations(clp); 89 nfs_expire_all_delegations(clp);
84 } else { 90 } else {
85 /* Queue an asynchronous RENEW. */ 91 /* Queue an asynchronous RENEW. */
86 ops->sched_state_renewal(clp, cred); 92 ops->sched_state_renewal(clp, cred, renew_flags);
87 put_rpccred(cred); 93 put_rpccred(cred);
88 goto out_exp; 94 goto out_exp;
89 } 95 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e97dd219f84..efd84316f6a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -641,7 +641,7 @@ void nfs4_put_open_state(struct nfs4_state *state)
641/* 641/*
642 * Close the current file. 642 * Close the current file.
643 */ 643 */
644static void __nfs4_close(struct path *path, struct nfs4_state *state, 644static void __nfs4_close(struct nfs4_state *state,
645 fmode_t fmode, gfp_t gfp_mask, int wait) 645 fmode_t fmode, gfp_t gfp_mask, int wait)
646{ 646{
647 struct nfs4_state_owner *owner = state->owner; 647 struct nfs4_state_owner *owner = state->owner;
@@ -685,18 +685,18 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
685 } else { 685 } else {
686 bool roc = pnfs_roc(state->inode); 686 bool roc = pnfs_roc(state->inode);
687 687
688 nfs4_do_close(path, state, gfp_mask, wait, roc); 688 nfs4_do_close(state, gfp_mask, wait, roc);
689 } 689 }
690} 690}
691 691
692void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) 692void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
693{ 693{
694 __nfs4_close(path, state, fmode, GFP_NOFS, 0); 694 __nfs4_close(state, fmode, GFP_NOFS, 0);
695} 695}
696 696
697void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) 697void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
698{ 698{
699 __nfs4_close(path, state, fmode, GFP_KERNEL, 1); 699 __nfs4_close(state, fmode, GFP_KERNEL, 1);
700} 700}
701 701
702/* 702/*
@@ -1038,6 +1038,12 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1038 nfs4_schedule_state_manager(clp); 1038 nfs4_schedule_state_manager(clp);
1039} 1039}
1040 1040
1041void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
1042{
1043 nfs_handle_cb_pathdown(clp);
1044 nfs4_schedule_state_manager(clp);
1045}
1046
1041static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1047static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1042{ 1048{
1043 1049
@@ -1519,16 +1525,16 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
1519{ 1525{
1520 if (!flags) 1526 if (!flags)
1521 return; 1527 return;
1522 else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) 1528 if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
1523 nfs41_handle_server_reboot(clp); 1529 nfs41_handle_server_reboot(clp);
1524 else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | 1530 if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
1525 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | 1531 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
1526 SEQ4_STATUS_ADMIN_STATE_REVOKED | 1532 SEQ4_STATUS_ADMIN_STATE_REVOKED |
1527 SEQ4_STATUS_LEASE_MOVED)) 1533 SEQ4_STATUS_LEASE_MOVED))
1528 nfs41_handle_state_revoked(clp); 1534 nfs41_handle_state_revoked(clp);
1529 else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) 1535 if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
1530 nfs41_handle_recallable_state_revoked(clp); 1536 nfs41_handle_recallable_state_revoked(clp);
1531 else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | 1537 if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
1532 SEQ4_STATUS_BACKCHANNEL_FAULT | 1538 SEQ4_STATUS_BACKCHANNEL_FAULT |
1533 SEQ4_STATUS_CB_PATH_DOWN_SESSION)) 1539 SEQ4_STATUS_CB_PATH_DOWN_SESSION))
1534 nfs41_handle_cb_path_down(clp); 1540 nfs41_handle_cb_path_down(clp);
@@ -1643,7 +1649,14 @@ static void nfs4_state_manager(struct nfs_client *clp)
1643 goto out_error; 1649 goto out_error;
1644 } 1650 }
1645 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1651 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1646 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); 1652
1653 if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH,
1654 &clp->cl_state))
1655 nfs4_state_start_reclaim_nograce(clp);
1656 else
1657 set_bit(NFS4CLNT_RECLAIM_REBOOT,
1658 &clp->cl_state);
1659
1647 pnfs_destroy_all_layouts(clp); 1660 pnfs_destroy_all_layouts(clp);
1648 } 1661 }
1649 1662
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6e8f3b9a1d..97f987a981c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
113#define encode_restorefh_maxsz (op_encode_hdr_maxsz) 113#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
114#define decode_restorefh_maxsz (op_decode_hdr_maxsz) 114#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
115#define encode_fsinfo_maxsz (encode_getattr_maxsz) 115#define encode_fsinfo_maxsz (encode_getattr_maxsz)
116#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) 116/* The 5 accounts for the PNFS attributes, and assumes that at most three
117 * layout types will be returned.
118 */
119#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
120 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
117#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 121#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
118#define decode_renew_maxsz (op_decode_hdr_maxsz) 122#define decode_renew_maxsz (op_decode_hdr_maxsz)
119#define encode_setclientid_maxsz \ 123#define encode_setclientid_maxsz \
@@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int);
314 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 318 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
315#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 319#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
316#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 320#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
321#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
322 encode_verifier_maxsz)
323#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
324 2 /* nfs_cookie4 gdlr_cookie */ + \
325 decode_verifier_maxsz \
326 /* verifier4 gdlr_verifier */ + \
327 1 /* gdlr_deviceid_list count */ + \
328 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
329 NFS4_DEVICEID4_SIZE) \
330 /* gdlr_deviceid_list */ + \
331 1 /* bool gdlr_eof */)
317#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ 332#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
318 XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) 333 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
319#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 334#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -343,6 +358,14 @@ static int nfs4_stat_to_errno(int);
343 1 /* FIXME: opaque lrf_body always empty at the moment */) 358 1 /* FIXME: opaque lrf_body always empty at the moment */)
344#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 359#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
345 1 + decode_stateid_maxsz) 360 1 + decode_stateid_maxsz)
361#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
362#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz
363#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \
364 XDR_QUADLEN(NFS4_STATEID_SIZE))
365#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1)
366#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \
367 XDR_QUADLEN(NFS4_STATEID_SIZE))
368#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1)
346#else /* CONFIG_NFS_V4_1 */ 369#else /* CONFIG_NFS_V4_1 */
347#define encode_sequence_maxsz 0 370#define encode_sequence_maxsz 0
348#define decode_sequence_maxsz 0 371#define decode_sequence_maxsz 0
@@ -740,6 +763,14 @@ static int nfs4_stat_to_errno(int);
740#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 763#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
741 decode_sequence_maxsz + \ 764 decode_sequence_maxsz + \
742 decode_reclaim_complete_maxsz) 765 decode_reclaim_complete_maxsz)
766#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
767 encode_sequence_maxsz + \
768 encode_putfh_maxsz + \
769 encode_getdevicelist_maxsz)
770#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
771 decode_sequence_maxsz + \
772 decode_putfh_maxsz + \
773 decode_getdevicelist_maxsz)
743#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 774#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
744 encode_sequence_maxsz +\ 775 encode_sequence_maxsz +\
745 encode_getdeviceinfo_maxsz) 776 encode_getdeviceinfo_maxsz)
@@ -772,6 +803,26 @@ static int nfs4_stat_to_errno(int);
772 decode_sequence_maxsz + \ 803 decode_sequence_maxsz + \
773 decode_putfh_maxsz + \ 804 decode_putfh_maxsz + \
774 decode_layoutreturn_maxsz) 805 decode_layoutreturn_maxsz)
806#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \
807 encode_sequence_maxsz + \
808 encode_putrootfh_maxsz +\
809 encode_secinfo_no_name_maxsz)
810#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \
811 decode_sequence_maxsz + \
812 decode_putrootfh_maxsz + \
813 decode_secinfo_no_name_maxsz)
814#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \
815 encode_sequence_maxsz + \
816 encode_test_stateid_maxsz)
817#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \
818 decode_sequence_maxsz + \
819 decode_test_stateid_maxsz)
820#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \
821 encode_sequence_maxsz + \
822 encode_free_stateid_maxsz)
823#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \
824 decode_sequence_maxsz + \
825 decode_free_stateid_maxsz)
775 826
776const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 827const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
777 compound_encode_hdr_maxsz + 828 compound_encode_hdr_maxsz +
@@ -1076,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
1076 hdr->replen += decode_getattr_maxsz; 1127 hdr->replen += decode_getattr_maxsz;
1077} 1128}
1078 1129
1130static void
1131encode_getattr_three(struct xdr_stream *xdr,
1132 uint32_t bm0, uint32_t bm1, uint32_t bm2,
1133 struct compound_hdr *hdr)
1134{
1135 __be32 *p;
1136
1137 p = reserve_space(xdr, 4);
1138 *p = cpu_to_be32(OP_GETATTR);
1139 if (bm2) {
1140 p = reserve_space(xdr, 16);
1141 *p++ = cpu_to_be32(3);
1142 *p++ = cpu_to_be32(bm0);
1143 *p++ = cpu_to_be32(bm1);
1144 *p = cpu_to_be32(bm2);
1145 } else if (bm1) {
1146 p = reserve_space(xdr, 12);
1147 *p++ = cpu_to_be32(2);
1148 *p++ = cpu_to_be32(bm0);
1149 *p = cpu_to_be32(bm1);
1150 } else {
1151 p = reserve_space(xdr, 8);
1152 *p++ = cpu_to_be32(1);
1153 *p = cpu_to_be32(bm0);
1154 }
1155 hdr->nops++;
1156 hdr->replen += decode_getattr_maxsz;
1157}
1158
1079static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1159static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1080{ 1160{
1081 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], 1161 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1084,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
1084 1164
1085static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1165static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1086{ 1166{
1087 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 1167 encode_getattr_three(xdr,
1088 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); 1168 bitmask[0] & nfs4_fsinfo_bitmap[0],
1169 bitmask[1] & nfs4_fsinfo_bitmap[1],
1170 bitmask[2] & nfs4_fsinfo_bitmap[2],
1171 hdr);
1089} 1172}
1090 1173
1091static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1174static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1827,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr,
1827 1910
1828#ifdef CONFIG_NFS_V4_1 1911#ifdef CONFIG_NFS_V4_1
1829static void 1912static void
1913encode_getdevicelist(struct xdr_stream *xdr,
1914 const struct nfs4_getdevicelist_args *args,
1915 struct compound_hdr *hdr)
1916{
1917 __be32 *p;
1918 nfs4_verifier dummy = {
1919 .data = "dummmmmy",
1920 };
1921
1922 p = reserve_space(xdr, 20);
1923 *p++ = cpu_to_be32(OP_GETDEVICELIST);
1924 *p++ = cpu_to_be32(args->layoutclass);
1925 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1926 xdr_encode_hyper(p, 0ULL); /* cookie */
1927 encode_nfs4_verifier(xdr, &dummy);
1928 hdr->nops++;
1929 hdr->replen += decode_getdevicelist_maxsz;
1930}
1931
1932static void
1830encode_getdeviceinfo(struct xdr_stream *xdr, 1933encode_getdeviceinfo(struct xdr_stream *xdr,
1831 const struct nfs4_getdeviceinfo_args *args, 1934 const struct nfs4_getdeviceinfo_args *args,
1832 struct compound_hdr *hdr) 1935 struct compound_hdr *hdr)
@@ -1888,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
1888 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1991 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1889 /* Only whole file layouts */ 1992 /* Only whole file layouts */
1890 p = xdr_encode_hyper(p, 0); /* offset */ 1993 p = xdr_encode_hyper(p, 0); /* offset */
1891 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ 1994 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
1892 *p++ = cpu_to_be32(0); /* reclaim */ 1995 *p++ = cpu_to_be32(0); /* reclaim */
1893 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); 1996 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
1894 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1997 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
@@ -1938,6 +2041,46 @@ encode_layoutreturn(struct xdr_stream *xdr,
1938 hdr->nops++; 2041 hdr->nops++;
1939 hdr->replen += decode_layoutreturn_maxsz; 2042 hdr->replen += decode_layoutreturn_maxsz;
1940} 2043}
2044
2045static int
2046encode_secinfo_no_name(struct xdr_stream *xdr,
2047 const struct nfs41_secinfo_no_name_args *args,
2048 struct compound_hdr *hdr)
2049{
2050 __be32 *p;
2051 p = reserve_space(xdr, 8);
2052 *p++ = cpu_to_be32(OP_SECINFO_NO_NAME);
2053 *p++ = cpu_to_be32(args->style);
2054 hdr->nops++;
2055 hdr->replen += decode_secinfo_no_name_maxsz;
2056 return 0;
2057}
2058
2059static void encode_test_stateid(struct xdr_stream *xdr,
2060 struct nfs41_test_stateid_args *args,
2061 struct compound_hdr *hdr)
2062{
2063 __be32 *p;
2064
2065 p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE);
2066 *p++ = cpu_to_be32(OP_TEST_STATEID);
2067 *p++ = cpu_to_be32(1);
2068 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2069 hdr->nops++;
2070 hdr->replen += decode_test_stateid_maxsz;
2071}
2072
2073static void encode_free_stateid(struct xdr_stream *xdr,
2074 struct nfs41_free_stateid_args *args,
2075 struct compound_hdr *hdr)
2076{
2077 __be32 *p;
2078 p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE);
2079 *p++ = cpu_to_be32(OP_FREE_STATEID);
2080 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2081 hdr->nops++;
2082 hdr->replen += decode_free_stateid_maxsz;
2083}
1941#endif /* CONFIG_NFS_V4_1 */ 2084#endif /* CONFIG_NFS_V4_1 */
1942 2085
1943/* 2086/*
@@ -2374,11 +2517,13 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
2374 encode_compound_hdr(xdr, req, &hdr); 2517 encode_compound_hdr(xdr, req, &hdr);
2375 encode_sequence(xdr, &args->seq_args, &hdr); 2518 encode_sequence(xdr, &args->seq_args, &hdr);
2376 encode_putfh(xdr, args->fh, &hdr); 2519 encode_putfh(xdr, args->fh, &hdr);
2377 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; 2520 replen = hdr.replen + op_decode_hdr_maxsz + 1;
2378 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); 2521 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
2379 2522
2380 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2523 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
2381 args->acl_pages, args->acl_pgbase, args->acl_len); 2524 args->acl_pages, args->acl_pgbase, args->acl_len);
2525 xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
2526
2382 encode_nops(&hdr); 2527 encode_nops(&hdr);
2383} 2528}
2384 2529
@@ -2536,7 +2681,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
2536 struct compound_hdr hdr = { 2681 struct compound_hdr hdr = {
2537 .nops = 0, 2682 .nops = 0,
2538 }; 2683 };
2539 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2684 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2540 2685
2541 encode_compound_hdr(xdr, req, &hdr); 2686 encode_compound_hdr(xdr, req, &hdr);
2542 encode_setclientid_confirm(xdr, arg, &hdr); 2687 encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2680,7 +2825,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
2680 struct compound_hdr hdr = { 2825 struct compound_hdr hdr = {
2681 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2826 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
2682 }; 2827 };
2683 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2828 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2684 2829
2685 encode_compound_hdr(xdr, req, &hdr); 2830 encode_compound_hdr(xdr, req, &hdr);
2686 encode_sequence(xdr, &args->la_seq_args, &hdr); 2831 encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -2707,6 +2852,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2707} 2852}
2708 2853
2709/* 2854/*
2855 * Encode GETDEVICELIST request
2856 */
2857static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2858 struct xdr_stream *xdr,
2859 struct nfs4_getdevicelist_args *args)
2860{
2861 struct compound_hdr hdr = {
2862 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2863 };
2864
2865 encode_compound_hdr(xdr, req, &hdr);
2866 encode_sequence(xdr, &args->seq_args, &hdr);
2867 encode_putfh(xdr, args->fh, &hdr);
2868 encode_getdevicelist(xdr, args, &hdr);
2869 encode_nops(&hdr);
2870}
2871
2872/*
2710 * Encode GETDEVICEINFO request 2873 * Encode GETDEVICEINFO request
2711 */ 2874 */
2712static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2875static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -2790,6 +2953,59 @@ static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
2790 encode_layoutreturn(xdr, args, &hdr); 2953 encode_layoutreturn(xdr, args, &hdr);
2791 encode_nops(&hdr); 2954 encode_nops(&hdr);
2792} 2955}
2956
2957/*
2958 * Encode SECINFO_NO_NAME request
2959 */
2960static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req,
2961 struct xdr_stream *xdr,
2962 struct nfs41_secinfo_no_name_args *args)
2963{
2964 struct compound_hdr hdr = {
2965 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2966 };
2967
2968 encode_compound_hdr(xdr, req, &hdr);
2969 encode_sequence(xdr, &args->seq_args, &hdr);
2970 encode_putrootfh(xdr, &hdr);
2971 encode_secinfo_no_name(xdr, args, &hdr);
2972 encode_nops(&hdr);
2973 return 0;
2974}
2975
2976/*
2977 * Encode TEST_STATEID request
2978 */
2979static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req,
2980 struct xdr_stream *xdr,
2981 struct nfs41_test_stateid_args *args)
2982{
2983 struct compound_hdr hdr = {
2984 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2985 };
2986
2987 encode_compound_hdr(xdr, req, &hdr);
2988 encode_sequence(xdr, &args->seq_args, &hdr);
2989 encode_test_stateid(xdr, args, &hdr);
2990 encode_nops(&hdr);
2991}
2992
2993/*
2994 * Encode FREE_STATEID request
2995 */
2996static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
2997 struct xdr_stream *xdr,
2998 struct nfs41_free_stateid_args *args)
2999{
3000 struct compound_hdr hdr = {
3001 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
3002 };
3003
3004 encode_compound_hdr(xdr, req, &hdr);
3005 encode_sequence(xdr, &args->seq_args, &hdr);
3006 encode_free_stateid(xdr, args, &hdr);
3007 encode_nops(&hdr);
3008}
2793#endif /* CONFIG_NFS_V4_1 */ 3009#endif /* CONFIG_NFS_V4_1 */
2794 3010
2795static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 3011static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2890,14 +3106,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2890 goto out_overflow; 3106 goto out_overflow;
2891 bmlen = be32_to_cpup(p); 3107 bmlen = be32_to_cpup(p);
2892 3108
2893 bitmap[0] = bitmap[1] = 0; 3109 bitmap[0] = bitmap[1] = bitmap[2] = 0;
2894 p = xdr_inline_decode(xdr, (bmlen << 2)); 3110 p = xdr_inline_decode(xdr, (bmlen << 2));
2895 if (unlikely(!p)) 3111 if (unlikely(!p))
2896 goto out_overflow; 3112 goto out_overflow;
2897 if (bmlen > 0) { 3113 if (bmlen > 0) {
2898 bitmap[0] = be32_to_cpup(p++); 3114 bitmap[0] = be32_to_cpup(p++);
2899 if (bmlen > 1) 3115 if (bmlen > 1) {
2900 bitmap[1] = be32_to_cpup(p); 3116 bitmap[1] = be32_to_cpup(p++);
3117 if (bmlen > 2)
3118 bitmap[2] = be32_to_cpup(p);
3119 }
2901 } 3120 }
2902 return 0; 3121 return 0;
2903out_overflow: 3122out_overflow:
@@ -2929,8 +3148,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2929 return ret; 3148 return ret;
2930 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; 3149 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
2931 } else 3150 } else
2932 bitmask[0] = bitmask[1] = 0; 3151 bitmask[0] = bitmask[1] = bitmask[2] = 0;
2933 dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); 3152 dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
3153 bitmask[0], bitmask[1], bitmask[2]);
2934 return 0; 3154 return 0;
2935} 3155}
2936 3156
@@ -3984,7 +4204,7 @@ out_overflow:
3984static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 4204static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
3985{ 4205{
3986 __be32 *savep; 4206 __be32 *savep;
3987 uint32_t attrlen, bitmap[2] = {0}; 4207 uint32_t attrlen, bitmap[3] = {0};
3988 int status; 4208 int status;
3989 4209
3990 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4210 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4010,7 +4230,7 @@ xdr_error:
4010static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 4230static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
4011{ 4231{
4012 __be32 *savep; 4232 __be32 *savep;
4013 uint32_t attrlen, bitmap[2] = {0}; 4233 uint32_t attrlen, bitmap[3] = {0};
4014 int status; 4234 int status;
4015 4235
4016 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4236 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4042,7 +4262,7 @@ xdr_error:
4042static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 4262static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
4043{ 4263{
4044 __be32 *savep; 4264 __be32 *savep;
4045 uint32_t attrlen, bitmap[2] = {0}; 4265 uint32_t attrlen, bitmap[3] = {0};
4046 int status; 4266 int status;
4047 4267
4048 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4268 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4182,7 +4402,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4182{ 4402{
4183 __be32 *savep; 4403 __be32 *savep;
4184 uint32_t attrlen, 4404 uint32_t attrlen,
4185 bitmap[2] = {0}; 4405 bitmap[3] = {0};
4186 int status; 4406 int status;
4187 4407
4188 status = decode_op_hdr(xdr, OP_GETATTR); 4408 status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4268,10 +4488,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
4268 return status; 4488 return status;
4269} 4489}
4270 4490
4491/*
4492 * The prefered block size for layout directed io
4493 */
4494static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
4495 uint32_t *res)
4496{
4497 __be32 *p;
4498
4499 dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
4500 *res = 0;
4501 if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
4502 p = xdr_inline_decode(xdr, 4);
4503 if (unlikely(!p)) {
4504 print_overflow_msg(__func__, xdr);
4505 return -EIO;
4506 }
4507 *res = be32_to_cpup(p);
4508 bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
4509 }
4510 return 0;
4511}
4512
4271static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4513static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4272{ 4514{
4273 __be32 *savep; 4515 __be32 *savep;
4274 uint32_t attrlen, bitmap[2]; 4516 uint32_t attrlen, bitmap[3];
4275 int status; 4517 int status;
4276 4518
4277 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4519 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4299,6 +4541,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4299 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); 4541 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4300 if (status != 0) 4542 if (status != 0)
4301 goto xdr_error; 4543 goto xdr_error;
4544 status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
4545 if (status)
4546 goto xdr_error;
4302 4547
4303 status = verify_attr_len(xdr, savep, attrlen); 4548 status = verify_attr_len(xdr, savep, attrlen);
4304xdr_error: 4549xdr_error:
@@ -4714,17 +4959,18 @@ decode_restorefh(struct xdr_stream *xdr)
4714} 4959}
4715 4960
4716static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, 4961static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4717 size_t *acl_len) 4962 struct nfs_getaclres *res)
4718{ 4963{
4719 __be32 *savep; 4964 __be32 *savep, *bm_p;
4720 uint32_t attrlen, 4965 uint32_t attrlen,
4721 bitmap[2] = {0}; 4966 bitmap[3] = {0};
4722 struct kvec *iov = req->rq_rcv_buf.head; 4967 struct kvec *iov = req->rq_rcv_buf.head;
4723 int status; 4968 int status;
4724 4969
4725 *acl_len = 0; 4970 res->acl_len = 0;
4726 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4971 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
4727 goto out; 4972 goto out;
4973 bm_p = xdr->p;
4728 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 4974 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
4729 goto out; 4975 goto out;
4730 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 4976 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -4736,18 +4982,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4736 size_t hdrlen; 4982 size_t hdrlen;
4737 u32 recvd; 4983 u32 recvd;
4738 4984
4985 /* The bitmap (xdr len + bitmaps) and the attr xdr len words
4986 * are stored with the acl data to handle the problem of
4987 * variable length bitmaps.*/
4988 xdr->p = bm_p;
4989 res->acl_data_offset = be32_to_cpup(bm_p) + 2;
4990 res->acl_data_offset <<= 2;
4991
4739 /* We ignore &savep and don't do consistency checks on 4992 /* We ignore &savep and don't do consistency checks on
4740 * the attr length. Let userspace figure it out.... */ 4993 * the attr length. Let userspace figure it out.... */
4741 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; 4994 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
4995 attrlen += res->acl_data_offset;
4742 recvd = req->rq_rcv_buf.len - hdrlen; 4996 recvd = req->rq_rcv_buf.len - hdrlen;
4743 if (attrlen > recvd) { 4997 if (attrlen > recvd) {
4744 dprintk("NFS: server cheating in getattr" 4998 if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
4745 " acl reply: attrlen %u > recvd %u\n", 4999 /* getxattr interface called with a NULL buf */
5000 res->acl_len = attrlen;
5001 goto out;
5002 }
5003 dprintk("NFS: acl reply: attrlen %u > recvd %u\n",
4746 attrlen, recvd); 5004 attrlen, recvd);
4747 return -EINVAL; 5005 return -EINVAL;
4748 } 5006 }
4749 xdr_read_pages(xdr, attrlen); 5007 xdr_read_pages(xdr, attrlen);
4750 *acl_len = attrlen; 5008 res->acl_len = attrlen;
4751 } else 5009 } else
4752 status = -EOPNOTSUPP; 5010 status = -EOPNOTSUPP;
4753 5011
@@ -4977,11 +5235,17 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4977 if (unlikely(status)) 5235 if (unlikely(status))
4978 return status; 5236 return status;
4979 5237
4980 /* Throw away server_scope */ 5238 /* Save server_scope */
4981 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5239 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4982 if (unlikely(status)) 5240 if (unlikely(status))
4983 return status; 5241 return status;
4984 5242
5243 if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
5244 return -EIO;
5245
5246 memcpy(res->server_scope->server_scope, dummy_str, dummy);
5247 res->server_scope->server_scope_sz = dummy;
5248
4985 /* Throw away Implementation id array */ 5249 /* Throw away Implementation id array */
4986 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5250 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4987 if (unlikely(status)) 5251 if (unlikely(status))
@@ -5141,6 +5405,53 @@ out_overflow:
5141} 5405}
5142 5406
5143#if defined(CONFIG_NFS_V4_1) 5407#if defined(CONFIG_NFS_V4_1)
5408/*
5409 * TODO: Need to handle case when EOF != true;
5410 */
5411static int decode_getdevicelist(struct xdr_stream *xdr,
5412 struct pnfs_devicelist *res)
5413{
5414 __be32 *p;
5415 int status, i;
5416 struct nfs_writeverf verftemp;
5417
5418 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5419 if (status)
5420 return status;
5421
5422 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5423 if (unlikely(!p))
5424 goto out_overflow;
5425
5426 /* TODO: Skip cookie for now */
5427 p += 2;
5428
5429 /* Read verifier */
5430 p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
5431
5432 res->num_devs = be32_to_cpup(p);
5433
5434 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5435
5436 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5437 printk(KERN_ERR "%s too many result dev_num %u\n",
5438 __func__, res->num_devs);
5439 return -EIO;
5440 }
5441
5442 p = xdr_inline_decode(xdr,
5443 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5444 if (unlikely(!p))
5445 goto out_overflow;
5446 for (i = 0; i < res->num_devs; i++)
5447 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5448 NFS4_DEVICEID4_SIZE);
5449 res->eof = be32_to_cpup(p);
5450 return 0;
5451out_overflow:
5452 print_overflow_msg(__func__, xdr);
5453 return -EIO;
5454}
5144 5455
5145static int decode_getdeviceinfo(struct xdr_stream *xdr, 5456static int decode_getdeviceinfo(struct xdr_stream *xdr,
5146 struct pnfs_device *pdev) 5457 struct pnfs_device *pdev)
@@ -5303,6 +5614,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
5303 int status; 5614 int status;
5304 5615
5305 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); 5616 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
5617 res->status = status;
5306 if (status) 5618 if (status)
5307 return status; 5619 return status;
5308 5620
@@ -5322,6 +5634,55 @@ out_overflow:
5322 print_overflow_msg(__func__, xdr); 5634 print_overflow_msg(__func__, xdr);
5323 return -EIO; 5635 return -EIO;
5324} 5636}
5637
5638static int decode_test_stateid(struct xdr_stream *xdr,
5639 struct nfs41_test_stateid_res *res)
5640{
5641 __be32 *p;
5642 int status;
5643 int num_res;
5644
5645 status = decode_op_hdr(xdr, OP_TEST_STATEID);
5646 if (status)
5647 return status;
5648
5649 p = xdr_inline_decode(xdr, 4);
5650 if (unlikely(!p))
5651 goto out_overflow;
5652 num_res = be32_to_cpup(p++);
5653 if (num_res != 1)
5654 goto out;
5655
5656 p = xdr_inline_decode(xdr, 4);
5657 if (unlikely(!p))
5658 goto out_overflow;
5659 res->status = be32_to_cpup(p++);
5660 return res->status;
5661out_overflow:
5662 print_overflow_msg(__func__, xdr);
5663out:
5664 return -EIO;
5665}
5666
5667static int decode_free_stateid(struct xdr_stream *xdr,
5668 struct nfs41_free_stateid_res *res)
5669{
5670 __be32 *p;
5671 int status;
5672
5673 status = decode_op_hdr(xdr, OP_FREE_STATEID);
5674 if (status)
5675 return status;
5676
5677 p = xdr_inline_decode(xdr, 4);
5678 if (unlikely(!p))
5679 goto out_overflow;
5680 res->status = be32_to_cpup(p++);
5681 return res->status;
5682out_overflow:
5683 print_overflow_msg(__func__, xdr);
5684 return -EIO;
5685}
5325#endif /* CONFIG_NFS_V4_1 */ 5686#endif /* CONFIG_NFS_V4_1 */
5326 5687
5327/* 5688/*
@@ -5682,7 +6043,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5682 status = decode_putfh(xdr); 6043 status = decode_putfh(xdr);
5683 if (status) 6044 if (status)
5684 goto out; 6045 goto out;
5685 status = decode_getacl(xdr, rqstp, &res->acl_len); 6046 status = decode_getacl(xdr, rqstp, res);
5686 6047
5687out: 6048out:
5688 return status; 6049 return status;
@@ -6366,6 +6727,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6366} 6727}
6367 6728
6368/* 6729/*
6730 * Decode GETDEVICELIST response
6731 */
6732static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
6733 struct xdr_stream *xdr,
6734 struct nfs4_getdevicelist_res *res)
6735{
6736 struct compound_hdr hdr;
6737 int status;
6738
6739 dprintk("encoding getdevicelist!\n");
6740
6741 status = decode_compound_hdr(xdr, &hdr);
6742 if (status != 0)
6743 goto out;
6744 status = decode_sequence(xdr, &res->seq_res, rqstp);
6745 if (status != 0)
6746 goto out;
6747 status = decode_putfh(xdr);
6748 if (status != 0)
6749 goto out;
6750 status = decode_getdevicelist(xdr, res->devlist);
6751out:
6752 return status;
6753}
6754
6755/*
6369 * Decode GETDEVINFO response 6756 * Decode GETDEVINFO response
6370 */ 6757 */
6371static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, 6758static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -6461,6 +6848,72 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6461out: 6848out:
6462 return status; 6849 return status;
6463} 6850}
6851
6852/*
6853 * Decode SECINFO_NO_NAME response
6854 */
6855static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp,
6856 struct xdr_stream *xdr,
6857 struct nfs4_secinfo_res *res)
6858{
6859 struct compound_hdr hdr;
6860 int status;
6861
6862 status = decode_compound_hdr(xdr, &hdr);
6863 if (status)
6864 goto out;
6865 status = decode_sequence(xdr, &res->seq_res, rqstp);
6866 if (status)
6867 goto out;
6868 status = decode_putrootfh(xdr);
6869 if (status)
6870 goto out;
6871 status = decode_secinfo(xdr, res);
6872out:
6873 return status;
6874}
6875
6876/*
6877 * Decode TEST_STATEID response
6878 */
6879static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp,
6880 struct xdr_stream *xdr,
6881 struct nfs41_test_stateid_res *res)
6882{
6883 struct compound_hdr hdr;
6884 int status;
6885
6886 status = decode_compound_hdr(xdr, &hdr);
6887 if (status)
6888 goto out;
6889 status = decode_sequence(xdr, &res->seq_res, rqstp);
6890 if (status)
6891 goto out;
6892 status = decode_test_stateid(xdr, res);
6893out:
6894 return status;
6895}
6896
6897/*
6898 * Decode FREE_STATEID response
6899 */
6900static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp,
6901 struct xdr_stream *xdr,
6902 struct nfs41_free_stateid_res *res)
6903{
6904 struct compound_hdr hdr;
6905 int status;
6906
6907 status = decode_compound_hdr(xdr, &hdr);
6908 if (status)
6909 goto out;
6910 status = decode_sequence(xdr, &res->seq_res, rqstp);
6911 if (status)
6912 goto out;
6913 status = decode_free_stateid(xdr, res);
6914out:
6915 return status;
6916}
6464#endif /* CONFIG_NFS_V4_1 */ 6917#endif /* CONFIG_NFS_V4_1 */
6465 6918
6466/** 6919/**
@@ -6480,7 +6933,7 @@ out:
6480int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6933int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6481 int plus) 6934 int plus)
6482{ 6935{
6483 uint32_t bitmap[2] = {0}; 6936 uint32_t bitmap[3] = {0};
6484 uint32_t len; 6937 uint32_t len;
6485 __be32 *p = xdr_inline_decode(xdr, 4); 6938 __be32 *p = xdr_inline_decode(xdr, 4);
6486 if (unlikely(!p)) 6939 if (unlikely(!p))
@@ -6663,6 +7116,10 @@ struct rpc_procinfo nfs4_procedures[] = {
6663 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 7116 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6664 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), 7117 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6665 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), 7118 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
7119 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
7120 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
7121 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7122 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
6666#endif /* CONFIG_NFS_V4_1 */ 7123#endif /* CONFIG_NFS_V4_1 */
6667}; 7124};
6668 7125
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 8ff2ea3f10e..d0cda12fddc 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write)
479 for (i = 0; i < ios->numdevs; i++) { 479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi; 480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or; 481 struct osd_request *or = ios->per_dev[i].or;
482 unsigned dev;
483 int ret; 482 int ret;
484 483
485 if (!or) 484 if (!or)
@@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write)
500 499
501 continue; /* we recovered */ 500 continue; /* we recovered */
502 } 501 }
503 dev = ios->per_dev[i].dev; 502 objlayout_io_set_result(&ios->ol_state, i,
504 objlayout_io_set_result(&ios->ol_state, dev, 503 &ios->layout->comps[i].oc_object_id,
505 &ios->layout->comps[dev].oc_object_id,
506 osd_pri_2_pnfs_err(osi.osd_err_pri), 504 osd_pri_2_pnfs_err(osi.osd_err_pri),
507 ios->per_dev[i].offset, 505 ios->per_dev[i].offset,
508 ios->per_dev[i].length, 506 ios->per_dev[i].length,
@@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
589} 587}
590 588
591static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, 589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
592 unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, 590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
593 gfp_t gfp_flags) 591 gfp_t gfp_flags)
594{ 592{
595 unsigned pg = *cur_pg; 593 unsigned pg = *cur_pg;
594 int cur_len = len;
596 struct request_queue *q = 595 struct request_queue *q =
597 osd_request_queue(_io_od(ios, per_dev->dev)); 596 osd_request_queue(_io_od(ios, per_dev->dev));
598 597
599 per_dev->length += cur_len;
600
601 if (per_dev->bio == NULL) { 598 if (per_dev->bio == NULL) {
602 unsigned stripes = ios->layout->num_comps / 599 unsigned pages_in_stripe = ios->layout->group_width *
603 ios->layout->mirrors_p1;
604 unsigned pages_in_stripe = stripes *
605 (ios->layout->stripe_unit / PAGE_SIZE); 600 (ios->layout->stripe_unit / PAGE_SIZE);
606 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / 601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
607 stripes; 602 ios->layout->group_width;
608 603
609 if (BIO_MAX_PAGES_KMALLOC < bio_size) 604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
610 bio_size = BIO_MAX_PAGES_KMALLOC; 605 bio_size = BIO_MAX_PAGES_KMALLOC;
@@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
632 } 627 }
633 BUG_ON(cur_len); 628 BUG_ON(cur_len);
634 629
630 per_dev->length += len;
635 *cur_pg = pg; 631 *cur_pg = pg;
636 return 0; 632 return 0;
637} 633}
@@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
650 int ret = 0; 646 int ret = 0;
651 647
652 while (length) { 648 while (length) {
653 struct _objio_per_comp *per_dev = &ios->per_dev[dev]; 649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
654 unsigned cur_len, page_off = 0; 650 unsigned cur_len, page_off = 0;
655 651
656 if (!per_dev->length) { 652 if (!per_dev->length) {
@@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
670 cur_len = stripe_unit; 666 cur_len = stripe_unit;
671 } 667 }
672 668
673 if (max_comp < dev) 669 if (max_comp < dev - first_dev)
674 max_comp = dev; 670 max_comp = dev - first_dev;
675 } else { 671 } else {
676 cur_len = stripe_unit; 672 cur_len = stripe_unit;
677 } 673 }
@@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
806 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; 802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
807 unsigned dev = per_dev->dev; 803 unsigned dev = per_dev->dev;
808 struct pnfs_osd_object_cred *cred = 804 struct pnfs_osd_object_cred *cred =
809 &ios->layout->comps[dev]; 805 &ios->layout->comps[cur_comp];
810 struct osd_obj_id obj = { 806 struct osd_obj_id obj = {
811 .partition = cred->oc_object_id.oid_partition_id, 807 .partition = cred->oc_object_id.oid_partition_id,
812 .id = cred->oc_object_id.oid_object_id, 808 .id = cred->oc_object_id.oid_object_id,
@@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
904 for (; cur_comp < last_comp; ++cur_comp, ++dev) { 900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
905 struct osd_request *or = NULL; 901 struct osd_request *or = NULL;
906 struct pnfs_osd_object_cred *cred = 902 struct pnfs_osd_object_cred *cred =
907 &ios->layout->comps[dev]; 903 &ios->layout->comps[cur_comp];
908 struct osd_obj_id obj = { 904 struct osd_obj_id obj = {
909 .partition = cred->oc_object_id.oid_partition_id, 905 .partition = cred->oc_object_id.oid_partition_id,
910 .id = cred->oc_object_id.oid_object_id, 906 .id = cred->oc_object_id.oid_object_id,
@@ -1000,13 +996,22 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
1000 if (!pnfs_generic_pg_test(pgio, prev, req)) 996 if (!pnfs_generic_pg_test(pgio, prev, req))
1001 return false; 997 return false;
1002 998
1003 if (pgio->pg_lseg == NULL)
1004 return true;
1005
1006 return pgio->pg_count + req->wb_bytes <= 999 return pgio->pg_count + req->wb_bytes <=
1007 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
1008} 1001}
1009 1002
1003static const struct nfs_pageio_ops objio_pg_read_ops = {
1004 .pg_init = pnfs_generic_pg_init_read,
1005 .pg_test = objio_pg_test,
1006 .pg_doio = pnfs_generic_pg_readpages,
1007};
1008
1009static const struct nfs_pageio_ops objio_pg_write_ops = {
1010 .pg_init = pnfs_generic_pg_init_write,
1011 .pg_test = objio_pg_test,
1012 .pg_doio = pnfs_generic_pg_writepages,
1013};
1014
1010static struct pnfs_layoutdriver_type objlayout_type = { 1015static struct pnfs_layoutdriver_type objlayout_type = {
1011 .id = LAYOUT_OSD2_OBJECTS, 1016 .id = LAYOUT_OSD2_OBJECTS,
1012 .name = "LAYOUT_OSD2_OBJECTS", 1017 .name = "LAYOUT_OSD2_OBJECTS",
@@ -1020,7 +1025,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
1020 1025
1021 .read_pagelist = objlayout_read_pagelist, 1026 .read_pagelist = objlayout_read_pagelist,
1022 .write_pagelist = objlayout_write_pagelist, 1027 .write_pagelist = objlayout_write_pagelist,
1023 .pg_test = objio_pg_test, 1028 .pg_read_ops = &objio_pg_read_ops,
1029 .pg_write_ops = &objio_pg_write_ops,
1024 1030
1025 .free_deviceid_node = objio_free_deviceid_node, 1031 .free_deviceid_node = objio_free_deviceid_node,
1026 1032
@@ -1055,5 +1061,7 @@ objlayout_exit(void)
1055 __func__); 1061 __func__);
1056} 1062}
1057 1063
1064MODULE_ALIAS("nfs-layouttype4-2");
1065
1058module_init(objlayout_init); 1066module_init(objlayout_init);
1059module_exit(objlayout_exit); 1067module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index 16fc758e912..b3918f7ac34 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
170 p = _osd_xdr_decode_data_map(p, &layout->olo_map); 170 p = _osd_xdr_decode_data_map(p, &layout->olo_map);
171 layout->olo_comps_index = be32_to_cpup(p++); 171 layout->olo_comps_index = be32_to_cpup(p++);
172 layout->olo_num_comps = be32_to_cpup(p++); 172 layout->olo_num_comps = be32_to_cpup(p++);
173 dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
174 layout->olo_comps_index, layout->olo_num_comps);
175
173 iter->total_comps = layout->olo_num_comps; 176 iter->total_comps = layout->olo_num_comps;
174 return 0; 177 return 0;
175} 178}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 00985571628..b60970cc7f1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -114,7 +114,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
114 if (!nfs_lock_request_dontget(req)) 114 if (!nfs_lock_request_dontget(req))
115 return 0; 115 return 0;
116 if (test_bit(PG_MAPPED, &req->wb_flags)) 116 if (test_bit(PG_MAPPED, &req->wb_flags))
117 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 117 radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
118 return 1; 118 return 1;
119} 119}
120 120
@@ -124,7 +124,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
124void nfs_clear_page_tag_locked(struct nfs_page *req) 124void nfs_clear_page_tag_locked(struct nfs_page *req)
125{ 125{
126 if (test_bit(PG_MAPPED, &req->wb_flags)) { 126 if (test_bit(PG_MAPPED, &req->wb_flags)) {
127 struct inode *inode = req->wb_context->path.dentry->d_inode; 127 struct inode *inode = req->wb_context->dentry->d_inode;
128 struct nfs_inode *nfsi = NFS_I(inode); 128 struct nfs_inode *nfsi = NFS_I(inode);
129 129
130 spin_lock(&inode->i_lock); 130 spin_lock(&inode->i_lock);
@@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
230 */ 230 */
231void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 231void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
232 struct inode *inode, 232 struct inode *inode,
233 int (*doio)(struct nfs_pageio_descriptor *), 233 const struct nfs_pageio_ops *pg_ops,
234 size_t bsize, 234 size_t bsize,
235 int io_flags) 235 int io_flags)
236{ 236{
@@ -240,13 +240,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 desc->pg_bsize = bsize; 240 desc->pg_bsize = bsize;
241 desc->pg_base = 0; 241 desc->pg_base = 0;
242 desc->pg_moreio = 0; 242 desc->pg_moreio = 0;
243 desc->pg_recoalesce = 0;
243 desc->pg_inode = inode; 244 desc->pg_inode = inode;
244 desc->pg_doio = doio; 245 desc->pg_ops = pg_ops;
245 desc->pg_ioflags = io_flags; 246 desc->pg_ioflags = io_flags;
246 desc->pg_error = 0; 247 desc->pg_error = 0;
247 desc->pg_lseg = NULL; 248 desc->pg_lseg = NULL;
248 desc->pg_test = nfs_generic_pg_test;
249 pnfs_pageio_init(desc, inode);
250} 249}
251 250
252/** 251/**
@@ -276,7 +275,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
276 return false; 275 return false;
277 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 276 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
278 return false; 277 return false;
279 return pgio->pg_test(pgio, prev, req); 278 return pgio->pg_ops->pg_test(pgio, prev, req);
280} 279}
281 280
282/** 281/**
@@ -297,6 +296,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
297 if (!nfs_can_coalesce_requests(prev, req, desc)) 296 if (!nfs_can_coalesce_requests(prev, req, desc))
298 return 0; 297 return 0;
299 } else { 298 } else {
299 if (desc->pg_ops->pg_init)
300 desc->pg_ops->pg_init(desc, req);
300 desc->pg_base = req->wb_pgbase; 301 desc->pg_base = req->wb_pgbase;
301 } 302 }
302 nfs_list_remove_request(req); 303 nfs_list_remove_request(req);
@@ -311,7 +312,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
311static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 312static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
312{ 313{
313 if (!list_empty(&desc->pg_list)) { 314 if (!list_empty(&desc->pg_list)) {
314 int error = desc->pg_doio(desc); 315 int error = desc->pg_ops->pg_doio(desc);
315 if (error < 0) 316 if (error < 0)
316 desc->pg_error = error; 317 desc->pg_error = error;
317 else 318 else
@@ -331,7 +332,7 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
331 * Returns true if the request 'req' was successfully coalesced into the 332 * Returns true if the request 'req' was successfully coalesced into the
332 * existing list of pages 'desc'. 333 * existing list of pages 'desc'.
333 */ 334 */
334int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 335static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
335 struct nfs_page *req) 336 struct nfs_page *req)
336{ 337{
337 while (!nfs_pageio_do_add_request(desc, req)) { 338 while (!nfs_pageio_do_add_request(desc, req)) {
@@ -340,17 +341,67 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
340 if (desc->pg_error < 0) 341 if (desc->pg_error < 0)
341 return 0; 342 return 0;
342 desc->pg_moreio = 0; 343 desc->pg_moreio = 0;
344 if (desc->pg_recoalesce)
345 return 0;
343 } 346 }
344 return 1; 347 return 1;
345} 348}
346 349
350static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
351{
352 LIST_HEAD(head);
353
354 do {
355 list_splice_init(&desc->pg_list, &head);
356 desc->pg_bytes_written -= desc->pg_count;
357 desc->pg_count = 0;
358 desc->pg_base = 0;
359 desc->pg_recoalesce = 0;
360
361 while (!list_empty(&head)) {
362 struct nfs_page *req;
363
364 req = list_first_entry(&head, struct nfs_page, wb_list);
365 nfs_list_remove_request(req);
366 if (__nfs_pageio_add_request(desc, req))
367 continue;
368 if (desc->pg_error < 0)
369 return 0;
370 break;
371 }
372 } while (desc->pg_recoalesce);
373 return 1;
374}
375
376int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
377 struct nfs_page *req)
378{
379 int ret;
380
381 do {
382 ret = __nfs_pageio_add_request(desc, req);
383 if (ret)
384 break;
385 if (desc->pg_error < 0)
386 break;
387 ret = nfs_do_recoalesce(desc);
388 } while (ret);
389 return ret;
390}
391
347/** 392/**
348 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 393 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
349 * @desc: pointer to io descriptor 394 * @desc: pointer to io descriptor
350 */ 395 */
351void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 396void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
352{ 397{
353 nfs_pageio_doio(desc); 398 for (;;) {
399 nfs_pageio_doio(desc);
400 if (!desc->pg_recoalesce)
401 break;
402 if (!nfs_do_recoalesce(desc))
403 break;
404 }
354} 405}
355 406
356/** 407/**
@@ -369,7 +420,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
369 if (!list_empty(&desc->pg_list)) { 420 if (!list_empty(&desc->pg_list)) {
370 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); 421 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);
371 if (index != prev->wb_index + 1) 422 if (index != prev->wb_index + 1)
372 nfs_pageio_doio(desc); 423 nfs_pageio_complete(desc);
373 } 424 }
374} 425}
375 426
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 29c0ca7fc34..ee73d9a4f70 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -28,6 +28,7 @@
28 */ 28 */
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h>
31#include "internal.h" 32#include "internal.h"
32#include "pnfs.h" 33#include "pnfs.h"
33#include "iostat.h" 34#include "iostat.h"
@@ -75,8 +76,11 @@ find_pnfs_driver(u32 id)
75void 76void
76unset_pnfs_layoutdriver(struct nfs_server *nfss) 77unset_pnfs_layoutdriver(struct nfs_server *nfss)
77{ 78{
78 if (nfss->pnfs_curr_ld) 79 if (nfss->pnfs_curr_ld) {
80 if (nfss->pnfs_curr_ld->clear_layoutdriver)
81 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 82 module_put(nfss->pnfs_curr_ld->owner);
83 }
80 nfss->pnfs_curr_ld = NULL; 84 nfss->pnfs_curr_ld = NULL;
81} 85}
82 86
@@ -87,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
87 * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 91 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
88 */ 92 */
89void 93void
90set_pnfs_layoutdriver(struct nfs_server *server, u32 id) 94set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
95 u32 id)
91{ 96{
92 struct pnfs_layoutdriver_type *ld_type = NULL; 97 struct pnfs_layoutdriver_type *ld_type = NULL;
93 98
@@ -114,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
114 goto out_no_driver; 119 goto out_no_driver;
115 } 120 }
116 server->pnfs_curr_ld = ld_type; 121 server->pnfs_curr_ld = ld_type;
122 if (ld_type->set_layoutdriver
123 && ld_type->set_layoutdriver(server, mntfh)) {
124 printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
125 __func__, id);
126 module_put(ld_type->owner);
127 goto out_no_driver;
128 }
117 129
118 dprintk("%s: pNFS module for %u set\n", __func__, id); 130 dprintk("%s: pNFS module for %u set\n", __func__, id);
119 return; 131 return;
@@ -189,6 +201,7 @@ static void
189pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 201pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
190{ 202{
191 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; 203 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
204 put_rpccred(lo->plh_lc_cred);
192 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); 205 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
193} 206}
194 207
@@ -223,6 +236,7 @@ static void
223init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 236init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
224{ 237{
225 INIT_LIST_HEAD(&lseg->pls_list); 238 INIT_LIST_HEAD(&lseg->pls_list);
239 INIT_LIST_HEAD(&lseg->pls_lc_list);
226 atomic_set(&lseg->pls_refcount, 1); 240 atomic_set(&lseg->pls_refcount, 1);
227 smp_mb(); 241 smp_mb();
228 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 242 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
@@ -448,11 +462,20 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
448void 462void
449pnfs_destroy_all_layouts(struct nfs_client *clp) 463pnfs_destroy_all_layouts(struct nfs_client *clp)
450{ 464{
465 struct nfs_server *server;
451 struct pnfs_layout_hdr *lo; 466 struct pnfs_layout_hdr *lo;
452 LIST_HEAD(tmp_list); 467 LIST_HEAD(tmp_list);
453 468
469 nfs4_deviceid_mark_client_invalid(clp);
470 nfs4_deviceid_purge_client(clp);
471
454 spin_lock(&clp->cl_lock); 472 spin_lock(&clp->cl_lock);
455 list_splice_init(&clp->cl_layouts, &tmp_list); 473 rcu_read_lock();
474 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
475 if (!list_empty(&server->layouts))
476 list_splice_init(&server->layouts, &tmp_list);
477 }
478 rcu_read_unlock();
456 spin_unlock(&clp->cl_lock); 479 spin_unlock(&clp->cl_lock);
457 480
458 while (!list_empty(&tmp_list)) { 481 while (!list_empty(&tmp_list)) {
@@ -661,6 +684,7 @@ _pnfs_return_layout(struct inode *ino)
661 lrp->args.stateid = stateid; 684 lrp->args.stateid = stateid;
662 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; 685 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
663 lrp->args.inode = ino; 686 lrp->args.inode = ino;
687 lrp->args.layout = lo;
664 lrp->clp = NFS_SERVER(ino)->nfs_client; 688 lrp->clp = NFS_SERVER(ino)->nfs_client;
665 689
666 status = nfs4_proc_layoutreturn(lrp); 690 status = nfs4_proc_layoutreturn(lrp);
@@ -805,7 +829,9 @@ out:
805} 829}
806 830
807static struct pnfs_layout_hdr * 831static struct pnfs_layout_hdr *
808alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) 832alloc_init_layout_hdr(struct inode *ino,
833 struct nfs_open_context *ctx,
834 gfp_t gfp_flags)
809{ 835{
810 struct pnfs_layout_hdr *lo; 836 struct pnfs_layout_hdr *lo;
811 837
@@ -817,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
817 INIT_LIST_HEAD(&lo->plh_segs); 843 INIT_LIST_HEAD(&lo->plh_segs);
818 INIT_LIST_HEAD(&lo->plh_bulk_recall); 844 INIT_LIST_HEAD(&lo->plh_bulk_recall);
819 lo->plh_inode = ino; 845 lo->plh_inode = ino;
846 lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
820 return lo; 847 return lo;
821} 848}
822 849
823static struct pnfs_layout_hdr * 850static struct pnfs_layout_hdr *
824pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) 851pnfs_find_alloc_layout(struct inode *ino,
852 struct nfs_open_context *ctx,
853 gfp_t gfp_flags)
825{ 854{
826 struct nfs_inode *nfsi = NFS_I(ino); 855 struct nfs_inode *nfsi = NFS_I(ino);
827 struct pnfs_layout_hdr *new = NULL; 856 struct pnfs_layout_hdr *new = NULL;
@@ -836,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
836 return nfsi->layout; 865 return nfsi->layout;
837 } 866 }
838 spin_unlock(&ino->i_lock); 867 spin_unlock(&ino->i_lock);
839 new = alloc_init_layout_hdr(ino, gfp_flags); 868 new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
840 spin_lock(&ino->i_lock); 869 spin_lock(&ino->i_lock);
841 870
842 if (likely(nfsi->layout == NULL)) /* Won the race? */ 871 if (likely(nfsi->layout == NULL)) /* Won the race? */
@@ -920,7 +949,8 @@ pnfs_update_layout(struct inode *ino,
920 }; 949 };
921 unsigned pg_offset; 950 unsigned pg_offset;
922 struct nfs_inode *nfsi = NFS_I(ino); 951 struct nfs_inode *nfsi = NFS_I(ino);
923 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 952 struct nfs_server *server = NFS_SERVER(ino);
953 struct nfs_client *clp = server->nfs_client;
924 struct pnfs_layout_hdr *lo; 954 struct pnfs_layout_hdr *lo;
925 struct pnfs_layout_segment *lseg = NULL; 955 struct pnfs_layout_segment *lseg = NULL;
926 bool first = false; 956 bool first = false;
@@ -928,7 +958,7 @@ pnfs_update_layout(struct inode *ino,
928 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 958 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
929 return NULL; 959 return NULL;
930 spin_lock(&ino->i_lock); 960 spin_lock(&ino->i_lock);
931 lo = pnfs_find_alloc_layout(ino, gfp_flags); 961 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
932 if (lo == NULL) { 962 if (lo == NULL) {
933 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); 963 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
934 goto out_unlock; 964 goto out_unlock;
@@ -964,7 +994,7 @@ pnfs_update_layout(struct inode *ino,
964 */ 994 */
965 spin_lock(&clp->cl_lock); 995 spin_lock(&clp->cl_lock);
966 BUG_ON(!list_empty(&lo->plh_layouts)); 996 BUG_ON(!list_empty(&lo->plh_layouts));
967 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 997 list_add_tail(&lo->plh_layouts, &server->layouts);
968 spin_unlock(&clp->cl_lock); 998 spin_unlock(&clp->cl_lock);
969 } 999 }
970 1000
@@ -973,7 +1003,8 @@ pnfs_update_layout(struct inode *ino,
973 arg.offset -= pg_offset; 1003 arg.offset -= pg_offset;
974 arg.length += pg_offset; 1004 arg.length += pg_offset;
975 } 1005 }
976 arg.length = PAGE_CACHE_ALIGN(arg.length); 1006 if (arg.length != NFS4_MAX_UINT64)
1007 arg.length = PAGE_CACHE_ALIGN(arg.length);
977 1008
978 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1009 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
979 if (!lseg && first) { 1010 if (!lseg && first) {
@@ -991,6 +1022,7 @@ out_unlock:
991 spin_unlock(&ino->i_lock); 1022 spin_unlock(&ino->i_lock);
992 goto out; 1023 goto out;
993} 1024}
1025EXPORT_SYMBOL_GPL(pnfs_update_layout);
994 1026
995int 1027int
996pnfs_layout_process(struct nfs4_layoutget *lgp) 1028pnfs_layout_process(struct nfs4_layoutget *lgp)
@@ -1048,35 +1080,71 @@ out_forget_reply:
1048 goto out; 1080 goto out;
1049} 1081}
1050 1082
1083void
1084pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1085{
1086 BUG_ON(pgio->pg_lseg != NULL);
1087
1088 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1089 req->wb_context,
1090 req_offset(req),
1091 req->wb_bytes,
1092 IOMODE_READ,
1093 GFP_KERNEL);
1094 /* If no lseg, fall back to read through mds */
1095 if (pgio->pg_lseg == NULL)
1096 nfs_pageio_reset_read_mds(pgio);
1097
1098}
1099EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1100
1101void
1102pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1103{
1104 BUG_ON(pgio->pg_lseg != NULL);
1105
1106 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1107 req->wb_context,
1108 req_offset(req),
1109 req->wb_bytes,
1110 IOMODE_RW,
1111 GFP_NOFS);
1112 /* If no lseg, fall back to write through mds */
1113 if (pgio->pg_lseg == NULL)
1114 nfs_pageio_reset_write_mds(pgio);
1115}
1116EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1117
1051bool 1118bool
1052pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1119pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
1053 struct nfs_page *req)
1054{ 1120{
1055 enum pnfs_iomode access_type; 1121 struct nfs_server *server = NFS_SERVER(inode);
1056 gfp_t gfp_flags; 1122 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
1057 1123
1058 /* We assume that pg_ioflags == 0 iff we're reading a page */ 1124 if (ld == NULL)
1059 if (pgio->pg_ioflags == 0) { 1125 return false;
1060 access_type = IOMODE_READ; 1126 nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0);
1061 gfp_flags = GFP_KERNEL; 1127 return true;
1062 } else { 1128}
1063 access_type = IOMODE_RW;
1064 gfp_flags = GFP_NOFS;
1065 }
1066 1129
1067 if (pgio->pg_lseg == NULL) { 1130bool
1068 if (pgio->pg_count != prev->wb_bytes) 1131pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
1069 return true; 1132{
1070 /* This is first coelesce call for a series of nfs_pages */ 1133 struct nfs_server *server = NFS_SERVER(inode);
1071 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1134 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
1072 prev->wb_context, 1135
1073 req_offset(prev), 1136 if (ld == NULL)
1074 pgio->pg_count, 1137 return false;
1075 access_type, 1138 nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags);
1076 gfp_flags); 1139 return true;
1077 if (pgio->pg_lseg == NULL) 1140}
1078 return true; 1141
1079 } 1142bool
1143pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1144 struct nfs_page *req)
1145{
1146 if (pgio->pg_lseg == NULL)
1147 return nfs_generic_pg_test(pgio, prev, req);
1080 1148
1081 /* 1149 /*
1082 * Test if a nfs_page is fully contained in the pnfs_layout_range. 1150 * Test if a nfs_page is fully contained in the pnfs_layout_range.
@@ -1100,35 +1168,44 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1100/* 1168/*
1101 * Called by non rpc-based layout drivers 1169 * Called by non rpc-based layout drivers
1102 */ 1170 */
1103int 1171void pnfs_ld_write_done(struct nfs_write_data *data)
1104pnfs_ld_write_done(struct nfs_write_data *data)
1105{ 1172{
1106 int status; 1173 if (likely(!data->pnfs_error)) {
1107
1108 if (!data->pnfs_error) {
1109 pnfs_set_layoutcommit(data); 1174 pnfs_set_layoutcommit(data);
1110 data->mds_ops->rpc_call_done(&data->task, data); 1175 data->mds_ops->rpc_call_done(&data->task, data);
1111 data->mds_ops->rpc_release(data); 1176 } else {
1112 return 0; 1177 put_lseg(data->lseg);
1178 data->lseg = NULL;
1179 dprintk("pnfs write error = %d\n", data->pnfs_error);
1113 } 1180 }
1114 1181 data->mds_ops->rpc_release(data);
1115 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
1116 data->pnfs_error);
1117 status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
1118 data->mds_ops, NFS_FILE_SYNC);
1119 return status ? : -EAGAIN;
1120} 1182}
1121EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1183EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1122 1184
1123enum pnfs_try_status 1185static void
1186pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1187 struct nfs_write_data *data)
1188{
1189 list_splice_tail_init(&data->pages, &desc->pg_list);
1190 if (data->req && list_empty(&data->req->wb_list))
1191 nfs_list_add_request(data->req, &desc->pg_list);
1192 nfs_pageio_reset_write_mds(desc);
1193 desc->pg_recoalesce = 1;
1194 nfs_writedata_release(data);
1195}
1196
1197static enum pnfs_try_status
1124pnfs_try_to_write_data(struct nfs_write_data *wdata, 1198pnfs_try_to_write_data(struct nfs_write_data *wdata,
1125 const struct rpc_call_ops *call_ops, int how) 1199 const struct rpc_call_ops *call_ops,
1200 struct pnfs_layout_segment *lseg,
1201 int how)
1126{ 1202{
1127 struct inode *inode = wdata->inode; 1203 struct inode *inode = wdata->inode;
1128 enum pnfs_try_status trypnfs; 1204 enum pnfs_try_status trypnfs;
1129 struct nfs_server *nfss = NFS_SERVER(inode); 1205 struct nfs_server *nfss = NFS_SERVER(inode);
1130 1206
1131 wdata->mds_ops = call_ops; 1207 wdata->mds_ops = call_ops;
1208 wdata->lseg = get_lseg(lseg);
1132 1209
1133 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 1210 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1134 inode->i_ino, wdata->args.count, wdata->args.offset, how); 1211 inode->i_ino, wdata->args.count, wdata->args.offset, how);
@@ -1144,41 +1221,87 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
1144 return trypnfs; 1221 return trypnfs;
1145} 1222}
1146 1223
1224static void
1225pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
1226{
1227 struct nfs_write_data *data;
1228 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1229 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1230
1231 desc->pg_lseg = NULL;
1232 while (!list_empty(head)) {
1233 enum pnfs_try_status trypnfs;
1234
1235 data = list_entry(head->next, struct nfs_write_data, list);
1236 list_del_init(&data->list);
1237
1238 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
1239 if (trypnfs == PNFS_NOT_ATTEMPTED)
1240 pnfs_write_through_mds(desc, data);
1241 }
1242 put_lseg(lseg);
1243}
1244
1245int
1246pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1247{
1248 LIST_HEAD(head);
1249 int ret;
1250
1251 ret = nfs_generic_flush(desc, &head);
1252 if (ret != 0) {
1253 put_lseg(desc->pg_lseg);
1254 desc->pg_lseg = NULL;
1255 return ret;
1256 }
1257 pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags);
1258 return 0;
1259}
1260EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1261
1147/* 1262/*
1148 * Called by non rpc-based layout drivers 1263 * Called by non rpc-based layout drivers
1149 */ 1264 */
1150int 1265void pnfs_ld_read_done(struct nfs_read_data *data)
1151pnfs_ld_read_done(struct nfs_read_data *data)
1152{ 1266{
1153 int status; 1267 if (likely(!data->pnfs_error)) {
1154
1155 if (!data->pnfs_error) {
1156 __nfs4_read_done_cb(data); 1268 __nfs4_read_done_cb(data);
1157 data->mds_ops->rpc_call_done(&data->task, data); 1269 data->mds_ops->rpc_call_done(&data->task, data);
1158 data->mds_ops->rpc_release(data); 1270 } else {
1159 return 0; 1271 put_lseg(data->lseg);
1272 data->lseg = NULL;
1273 dprintk("pnfs write error = %d\n", data->pnfs_error);
1160 } 1274 }
1161 1275 data->mds_ops->rpc_release(data);
1162 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
1163 data->pnfs_error);
1164 status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
1165 data->mds_ops);
1166 return status ? : -EAGAIN;
1167} 1276}
1168EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1277EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1169 1278
1279static void
1280pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1281 struct nfs_read_data *data)
1282{
1283 list_splice_tail_init(&data->pages, &desc->pg_list);
1284 if (data->req && list_empty(&data->req->wb_list))
1285 nfs_list_add_request(data->req, &desc->pg_list);
1286 nfs_pageio_reset_read_mds(desc);
1287 desc->pg_recoalesce = 1;
1288 nfs_readdata_release(data);
1289}
1290
1170/* 1291/*
1171 * Call the appropriate parallel I/O subsystem read function. 1292 * Call the appropriate parallel I/O subsystem read function.
1172 */ 1293 */
1173enum pnfs_try_status 1294static enum pnfs_try_status
1174pnfs_try_to_read_data(struct nfs_read_data *rdata, 1295pnfs_try_to_read_data(struct nfs_read_data *rdata,
1175 const struct rpc_call_ops *call_ops) 1296 const struct rpc_call_ops *call_ops,
1297 struct pnfs_layout_segment *lseg)
1176{ 1298{
1177 struct inode *inode = rdata->inode; 1299 struct inode *inode = rdata->inode;
1178 struct nfs_server *nfss = NFS_SERVER(inode); 1300 struct nfs_server *nfss = NFS_SERVER(inode);
1179 enum pnfs_try_status trypnfs; 1301 enum pnfs_try_status trypnfs;
1180 1302
1181 rdata->mds_ops = call_ops; 1303 rdata->mds_ops = call_ops;
1304 rdata->lseg = get_lseg(lseg);
1182 1305
1183 dprintk("%s: Reading ino:%lu %u@%llu\n", 1306 dprintk("%s: Reading ino:%lu %u@%llu\n",
1184 __func__, inode->i_ino, rdata->args.count, rdata->args.offset); 1307 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
@@ -1194,19 +1317,70 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
1194 return trypnfs; 1317 return trypnfs;
1195} 1318}
1196 1319
1320static void
1321pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
1322{
1323 struct nfs_read_data *data;
1324 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1325 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1326
1327 desc->pg_lseg = NULL;
1328 while (!list_empty(head)) {
1329 enum pnfs_try_status trypnfs;
1330
1331 data = list_entry(head->next, struct nfs_read_data, list);
1332 list_del_init(&data->list);
1333
1334 trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
1335 if (trypnfs == PNFS_NOT_ATTEMPTED)
1336 pnfs_read_through_mds(desc, data);
1337 }
1338 put_lseg(lseg);
1339}
1340
1341int
1342pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1343{
1344 LIST_HEAD(head);
1345 int ret;
1346
1347 ret = nfs_generic_pagein(desc, &head);
1348 if (ret != 0) {
1349 put_lseg(desc->pg_lseg);
1350 desc->pg_lseg = NULL;
1351 return ret;
1352 }
1353 pnfs_do_multiple_reads(desc, &head);
1354 return 0;
1355}
1356EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
1357
1197/* 1358/*
1198 * Currently there is only one (whole file) write lseg. 1359 * There can be multiple RW segments.
1199 */ 1360 */
1200static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) 1361static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1201{ 1362{
1202 struct pnfs_layout_segment *lseg, *rv = NULL; 1363 struct pnfs_layout_segment *lseg;
1203 1364
1204 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) 1365 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1205 if (lseg->pls_range.iomode == IOMODE_RW) 1366 if (lseg->pls_range.iomode == IOMODE_RW &&
1206 rv = lseg; 1367 test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
1207 return rv; 1368 list_add(&lseg->pls_lc_list, listp);
1369 }
1208} 1370}
1209 1371
1372void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1373{
1374 if (lseg->pls_range.iomode == IOMODE_RW) {
1375 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
1376 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
1377 } else {
1378 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
1379 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
1380 }
1381}
1382EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
1383
1210void 1384void
1211pnfs_set_layoutcommit(struct nfs_write_data *wdata) 1385pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1212{ 1386{
@@ -1216,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1216 1390
1217 spin_lock(&nfsi->vfs_inode.i_lock); 1391 spin_lock(&nfsi->vfs_inode.i_lock);
1218 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1392 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1219 /* references matched in nfs4_layoutcommit_release */
1220 get_lseg(wdata->lseg);
1221 wdata->lseg->pls_lc_cred =
1222 get_rpccred(wdata->args.context->state->owner->so_cred);
1223 mark_as_dirty = true; 1393 mark_as_dirty = true;
1224 dprintk("%s: Set layoutcommit for inode %lu ", 1394 dprintk("%s: Set layoutcommit for inode %lu ",
1225 __func__, wdata->inode->i_ino); 1395 __func__, wdata->inode->i_ino);
1226 } 1396 }
1227 if (end_pos > wdata->lseg->pls_end_pos) 1397 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
1228 wdata->lseg->pls_end_pos = end_pos; 1398 /* references matched in nfs4_layoutcommit_release */
1399 get_lseg(wdata->lseg);
1400 }
1401 if (end_pos > nfsi->layout->plh_lwb)
1402 nfsi->layout->plh_lwb = end_pos;
1229 spin_unlock(&nfsi->vfs_inode.i_lock); 1403 spin_unlock(&nfsi->vfs_inode.i_lock);
1404 dprintk("%s: lseg %p end_pos %llu\n",
1405 __func__, wdata->lseg, nfsi->layout->plh_lwb);
1230 1406
1231 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 1407 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1232 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 1408 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
@@ -1235,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1235} 1411}
1236EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1412EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1237 1413
1414void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1415{
1416 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1417
1418 if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1419 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
1420}
1421
1238/* 1422/*
1239 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 1423 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
1240 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 1424 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
@@ -1248,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1248{ 1432{
1249 struct nfs4_layoutcommit_data *data; 1433 struct nfs4_layoutcommit_data *data;
1250 struct nfs_inode *nfsi = NFS_I(inode); 1434 struct nfs_inode *nfsi = NFS_I(inode);
1251 struct pnfs_layout_segment *lseg;
1252 struct rpc_cred *cred;
1253 loff_t end_pos; 1435 loff_t end_pos;
1254 int status = 0; 1436 int status = 0;
1255 1437
@@ -1266,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1266 goto out; 1448 goto out;
1267 } 1449 }
1268 1450
1451 INIT_LIST_HEAD(&data->lseg_list);
1269 spin_lock(&inode->i_lock); 1452 spin_lock(&inode->i_lock);
1270 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1271 spin_unlock(&inode->i_lock); 1454 spin_unlock(&inode->i_lock);
1272 kfree(data); 1455 kfree(data);
1273 goto out; 1456 goto out;
1274 } 1457 }
1275 /*
1276 * Currently only one (whole file) write lseg which is referenced
1277 * in pnfs_set_layoutcommit and will be found.
1278 */
1279 lseg = pnfs_list_write_lseg(inode);
1280 1458
1281 end_pos = lseg->pls_end_pos; 1459 pnfs_list_write_lseg(inode, &data->lseg_list);
1282 cred = lseg->pls_lc_cred; 1460
1283 lseg->pls_end_pos = 0; 1461 end_pos = nfsi->layout->plh_lwb;
1284 lseg->pls_lc_cred = NULL; 1462 nfsi->layout->plh_lwb = 0;
1285 1463
1286 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, 1464 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
1287 sizeof(nfsi->layout->plh_stateid.data)); 1465 sizeof(nfsi->layout->plh_stateid.data));
1288 spin_unlock(&inode->i_lock); 1466 spin_unlock(&inode->i_lock);
1289 1467
1290 data->args.inode = inode; 1468 data->args.inode = inode;
1291 data->lseg = lseg; 1469 data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
1292 data->cred = cred;
1293 nfs_fattr_init(&data->fattr); 1470 nfs_fattr_init(&data->fattr);
1294 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 1471 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1295 data->res.fattr = &data->fattr; 1472 data->res.fattr = &data->fattr;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 96bf4e6f45b..1509530cb11 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -36,16 +36,16 @@
36enum { 36enum {
37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
38 NFS_LSEG_ROC, /* roc bit received from server */ 38 NFS_LSEG_ROC, /* roc bit received from server */
39 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
39}; 40};
40 41
41struct pnfs_layout_segment { 42struct pnfs_layout_segment {
42 struct list_head pls_list; 43 struct list_head pls_list;
44 struct list_head pls_lc_list;
43 struct pnfs_layout_range pls_range; 45 struct pnfs_layout_range pls_range;
44 atomic_t pls_refcount; 46 atomic_t pls_refcount;
45 unsigned long pls_flags; 47 unsigned long pls_flags;
46 struct pnfs_layout_hdr *pls_layout; 48 struct pnfs_layout_hdr *pls_layout;
47 struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
48 loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
49}; 49};
50 50
51enum pnfs_try_status { 51enum pnfs_try_status {
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {
80 struct module *owner; 80 struct module *owner;
81 unsigned flags; 81 unsigned flags;
82 82
83 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
84 int (*clear_layoutdriver) (struct nfs_server *);
85
83 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); 86 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 void (*free_layout_hdr) (struct pnfs_layout_hdr *); 87 void (*free_layout_hdr) (struct pnfs_layout_hdr *);
85 88
@@ -87,7 +90,8 @@ struct pnfs_layoutdriver_type {
87 void (*free_lseg) (struct pnfs_layout_segment *lseg); 90 void (*free_lseg) (struct pnfs_layout_segment *lseg);
88 91
89 /* test for nfs page cache coalescing */ 92 /* test for nfs page cache coalescing */
90 bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 93 const struct nfs_pageio_ops *pg_read_ops;
94 const struct nfs_pageio_ops *pg_write_ops;
91 95
92 /* Returns true if layoutdriver wants to divert this request to 96 /* Returns true if layoutdriver wants to divert this request to
93 * driver's commit routine. 97 * driver's commit routine.
@@ -109,6 +113,8 @@ struct pnfs_layoutdriver_type {
109 struct xdr_stream *xdr, 113 struct xdr_stream *xdr,
110 const struct nfs4_layoutreturn_args *args); 114 const struct nfs4_layoutreturn_args *args);
111 115
116 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
117
112 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, 118 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
113 struct xdr_stream *xdr, 119 struct xdr_stream *xdr,
114 const struct nfs4_layoutcommit_args *args); 120 const struct nfs4_layoutcommit_args *args);
@@ -124,6 +130,8 @@ struct pnfs_layout_hdr {
124 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ 130 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
125 u32 plh_barrier; /* ignore lower seqids */ 131 u32 plh_barrier; /* ignore lower seqids */
126 unsigned long plh_flags; 132 unsigned long plh_flags;
133 loff_t plh_lwb; /* last write byte for layoutcommit */
134 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
127 struct inode *plh_inode; 135 struct inode *plh_inode;
128}; 136};
129 137
@@ -136,10 +144,21 @@ struct pnfs_device {
136 unsigned int pglen; 144 unsigned int pglen;
137}; 145};
138 146
147#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
148
149struct pnfs_devicelist {
150 unsigned int eof;
151 unsigned int num_devs;
152 struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
153};
154
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 155extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 156extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 157
142/* nfs4proc.c */ 158/* nfs4proc.c */
159extern int nfs4_proc_getdevicelist(struct nfs_server *server,
160 const struct nfs_fh *fh,
161 struct pnfs_devicelist *devlist);
143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 162extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
144 struct pnfs_device *dev); 163 struct pnfs_device *dev);
145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 164extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
@@ -148,17 +167,18 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
148/* pnfs.c */ 167/* pnfs.c */
149void get_layout_hdr(struct pnfs_layout_hdr *lo); 168void get_layout_hdr(struct pnfs_layout_hdr *lo);
150void put_lseg(struct pnfs_layout_segment *lseg); 169void put_lseg(struct pnfs_layout_segment *lseg);
151struct pnfs_layout_segment * 170
152pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 171bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
153 loff_t pos, u64 count, enum pnfs_iomode access_type, 172bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
154 gfp_t gfp_flags); 173
155void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 174void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
156void unset_pnfs_layoutdriver(struct nfs_server *); 175void unset_pnfs_layoutdriver(struct nfs_server *);
157enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, 176void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
158 const struct rpc_call_ops *, int); 177int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
159enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, 178void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
160 const struct rpc_call_ops *); 179int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
161bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); 180bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
181void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
162int pnfs_layout_process(struct nfs4_layoutget *lgp); 182int pnfs_layout_process(struct nfs4_layoutget *lgp);
163void pnfs_free_lseg_list(struct list_head *tmp_list); 183void pnfs_free_lseg_list(struct list_head *tmp_list);
164void pnfs_destroy_layout(struct nfs_inode *); 184void pnfs_destroy_layout(struct nfs_inode *);
@@ -178,10 +198,24 @@ void pnfs_roc_release(struct inode *ino);
178void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 198void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
179bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 199bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
180void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 200void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
201void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
181int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 202int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
182int _pnfs_return_layout(struct inode *); 203int _pnfs_return_layout(struct inode *);
183int pnfs_ld_write_done(struct nfs_write_data *); 204void pnfs_ld_write_done(struct nfs_write_data *);
184int pnfs_ld_read_done(struct nfs_read_data *); 205void pnfs_ld_read_done(struct nfs_read_data *);
206struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
207 struct nfs_open_context *ctx,
208 loff_t pos,
209 u64 count,
210 enum pnfs_iomode iomode,
211 gfp_t gfp_flags);
212
213void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
214
215/* nfs4_deviceid_flags */
216enum {
217 NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
218};
185 219
186/* pnfs_dev.c */ 220/* pnfs_dev.c */
187struct nfs4_deviceid_node { 221struct nfs4_deviceid_node {
@@ -189,13 +223,13 @@ struct nfs4_deviceid_node {
189 struct hlist_node tmpnode; 223 struct hlist_node tmpnode;
190 const struct pnfs_layoutdriver_type *ld; 224 const struct pnfs_layoutdriver_type *ld;
191 const struct nfs_client *nfs_client; 225 const struct nfs_client *nfs_client;
226 unsigned long flags;
192 struct nfs4_deviceid deviceid; 227 struct nfs4_deviceid deviceid;
193 atomic_t ref; 228 atomic_t ref;
194}; 229};
195 230
196void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); 231void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
197struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 232struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
198struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
199void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 233void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
200void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, 234void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
201 const struct pnfs_layoutdriver_type *, 235 const struct pnfs_layoutdriver_type *,
@@ -293,15 +327,6 @@ static inline int pnfs_return_layout(struct inode *ino)
293 return 0; 327 return 0;
294} 328}
295 329
296static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
297 struct inode *inode)
298{
299 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
300
301 if (ld)
302 pgio->pg_test = ld->pg_test;
303}
304
305#else /* CONFIG_NFS_V4_1 */ 330#else /* CONFIG_NFS_V4_1 */
306 331
307static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 332static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -322,28 +347,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
322{ 347{
323} 348}
324 349
325static inline struct pnfs_layout_segment *
326pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
327 loff_t pos, u64 count, enum pnfs_iomode access_type,
328 gfp_t gfp_flags)
329{
330 return NULL;
331}
332
333static inline enum pnfs_try_status
334pnfs_try_to_read_data(struct nfs_read_data *data,
335 const struct rpc_call_ops *call_ops)
336{
337 return PNFS_NOT_ATTEMPTED;
338}
339
340static inline enum pnfs_try_status
341pnfs_try_to_write_data(struct nfs_write_data *data,
342 const struct rpc_call_ops *call_ops, int how)
343{
344 return PNFS_NOT_ATTEMPTED;
345}
346
347static inline int pnfs_return_layout(struct inode *ino) 350static inline int pnfs_return_layout(struct inode *ino)
348{ 351{
349 return 0; 352 return 0;
@@ -377,7 +380,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)
377 return false; 380 return false;
378} 381}
379 382
380static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 383static inline void set_pnfs_layoutdriver(struct nfs_server *s,
384 const struct nfs_fh *mntfh, u32 id)
381{ 385{
382} 386}
383 387
@@ -385,9 +389,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
385{ 389{
386} 390}
387 391
388static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, 392static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
389 struct inode *inode)
390{ 393{
394 return false;
395}
396
397static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
398{
399 return false;
391} 400}
392 401
393static inline void 402static inline void
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index f0f8e1e22f6..6fda5228ef5 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -100,8 +100,8 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
100 100
101 rcu_read_lock(); 101 rcu_read_lock();
102 d = _lookup_deviceid(ld, clp, id, hash); 102 d = _lookup_deviceid(ld, clp, id, hash);
103 if (d && !atomic_inc_not_zero(&d->ref)) 103 if (d != NULL)
104 d = NULL; 104 atomic_inc(&d->ref);
105 rcu_read_unlock(); 105 rcu_read_unlock();
106 return d; 106 return d;
107} 107}
@@ -115,15 +115,15 @@ nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); 115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
116 116
117/* 117/*
118 * Unhash and put deviceid 118 * Remove a deviceid from cache
119 * 119 *
120 * @clp nfs_client associated with deviceid 120 * @clp nfs_client associated with deviceid
121 * @id the deviceid to unhash 121 * @id the deviceid to unhash
122 * 122 *
123 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. 123 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
124 */ 124 */
125struct nfs4_deviceid_node * 125void
126nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, 126nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
127 const struct nfs_client *clp, const struct nfs4_deviceid *id) 127 const struct nfs_client *clp, const struct nfs4_deviceid *id)
128{ 128{
129 struct nfs4_deviceid_node *d; 129 struct nfs4_deviceid_node *d;
@@ -134,7 +134,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
134 rcu_read_unlock(); 134 rcu_read_unlock();
135 if (!d) { 135 if (!d) {
136 spin_unlock(&nfs4_deviceid_lock); 136 spin_unlock(&nfs4_deviceid_lock);
137 return NULL; 137 return;
138 } 138 }
139 hlist_del_init_rcu(&d->node); 139 hlist_del_init_rcu(&d->node);
140 spin_unlock(&nfs4_deviceid_lock); 140 spin_unlock(&nfs4_deviceid_lock);
@@ -142,28 +142,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
142 142
143 /* balance the initial ref set in pnfs_insert_deviceid */ 143 /* balance the initial ref set in pnfs_insert_deviceid */
144 if (atomic_dec_and_test(&d->ref)) 144 if (atomic_dec_and_test(&d->ref))
145 return d; 145 d->ld->free_deviceid_node(d);
146
147 return NULL;
148}
149EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
150
151/*
152 * Delete a deviceid from cache
153 *
154 * @clp struct nfs_client qualifying the deviceid
155 * @id deviceid to delete
156 */
157void
158nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
159 const struct nfs_client *clp, const struct nfs4_deviceid *id)
160{
161 struct nfs4_deviceid_node *d;
162
163 d = nfs4_unhash_put_deviceid(ld, clp, id);
164 if (!d)
165 return;
166 d->ld->free_deviceid_node(d);
167} 146}
168EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 147EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
169 148
@@ -177,6 +156,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
177 INIT_HLIST_NODE(&d->tmpnode); 156 INIT_HLIST_NODE(&d->tmpnode);
178 d->ld = ld; 157 d->ld = ld;
179 d->nfs_client = nfs_client; 158 d->nfs_client = nfs_client;
159 d->flags = 0;
180 d->deviceid = *id; 160 d->deviceid = *id;
181 atomic_set(&d->ref, 1); 161 atomic_set(&d->ref, 1);
182} 162}
@@ -221,16 +201,15 @@ EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
221 * 201 *
222 * @d deviceid node to put 202 * @d deviceid node to put
223 * 203 *
224 * @ret true iff the node was deleted 204 * return true iff the node was deleted
205 * Note that since the test for d->ref == 0 is sufficient to establish
206 * that the node is no longer hashed in the global device id cache.
225 */ 207 */
226bool 208bool
227nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) 209nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
228{ 210{
229 if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) 211 if (!atomic_dec_and_test(&d->ref))
230 return false; 212 return false;
231 hlist_del_init_rcu(&d->node);
232 spin_unlock(&nfs4_deviceid_lock);
233 synchronize_rcu();
234 d->ld->free_deviceid_node(d); 213 d->ld->free_deviceid_node(d);
235 return true; 214 return true;
236} 215}
@@ -275,3 +254,22 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp)
275 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) 254 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
276 _deviceid_purge_client(clp, h); 255 _deviceid_purge_client(clp, h);
277} 256}
257
258/*
259 * Stop use of all deviceids associated with an nfs_client
260 */
261void
262nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
263{
264 struct nfs4_deviceid_node *d;
265 struct hlist_node *n;
266 int i;
267
268 rcu_read_lock();
269 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){
270 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node)
271 if (d->nfs_client == clp)
272 set_bit(NFS_DEVICEID_INVALID, &d->flags);
273 }
274 rcu_read_unlock();
275}
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b8535d7..f48125da198 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
710 .dentry_ops = &nfs_dentry_operations, 710 .dentry_ops = &nfs_dentry_operations,
711 .dir_inode_ops = &nfs_dir_inode_operations, 711 .dir_inode_ops = &nfs_dir_inode_operations,
712 .file_inode_ops = &nfs_file_inode_operations, 712 .file_inode_ops = &nfs_file_inode_operations,
713 .file_ops = &nfs_file_operations,
713 .getroot = nfs_proc_get_root, 714 .getroot = nfs_proc_get_root,
714 .getattr = nfs_proc_getattr, 715 .getattr = nfs_proc_getattr,
715 .setattr = nfs_proc_setattr, 716 .setattr = nfs_proc_setattr,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 20a7f952e24..bfc20b16024 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -30,8 +30,7 @@
30 30
31#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
32 32
33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); 33static const struct nfs_pageio_ops nfs_pageio_read_ops;
34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
35static const struct rpc_call_ops nfs_read_partial_ops; 34static const struct rpc_call_ops nfs_read_partial_ops;
36static const struct rpc_call_ops nfs_read_full_ops; 35static const struct rpc_call_ops nfs_read_full_ops;
37 36
@@ -68,7 +67,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
68 mempool_free(p, nfs_rdata_mempool); 67 mempool_free(p, nfs_rdata_mempool);
69} 68}
70 69
71static void nfs_readdata_release(struct nfs_read_data *rdata) 70void nfs_readdata_release(struct nfs_read_data *rdata)
72{ 71{
73 put_lseg(rdata->lseg); 72 put_lseg(rdata->lseg);
74 put_nfs_open_context(rdata->args.context); 73 put_nfs_open_context(rdata->args.context);
@@ -113,6 +112,27 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
113 } 112 }
114} 113}
115 114
115static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
116 struct inode *inode)
117{
118 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
119 NFS_SERVER(inode)->rsize, 0);
120}
121
122void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
123{
124 pgio->pg_ops = &nfs_pageio_read_ops;
125 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
126}
127EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
128
129static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
130 struct inode *inode)
131{
132 if (!pnfs_pageio_init_read(pgio, inode))
133 nfs_pageio_init_read_mds(pgio, inode);
134}
135
116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 136int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
117 struct page *page) 137 struct page *page)
118{ 138{
@@ -131,20 +151,15 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
131 if (len < PAGE_CACHE_SIZE) 151 if (len < PAGE_CACHE_SIZE)
132 zero_user_segment(page, len, PAGE_CACHE_SIZE); 152 zero_user_segment(page, len, PAGE_CACHE_SIZE);
133 153
134 nfs_pageio_init(&pgio, inode, NULL, 0, 0); 154 nfs_pageio_init_read(&pgio, inode);
135 nfs_list_add_request(new, &pgio.pg_list); 155 nfs_pageio_add_request(&pgio, new);
136 pgio.pg_count = len; 156 nfs_pageio_complete(&pgio);
137
138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
139 nfs_pagein_multi(&pgio);
140 else
141 nfs_pagein_one(&pgio);
142 return 0; 157 return 0;
143} 158}
144 159
145static void nfs_readpage_release(struct nfs_page *req) 160static void nfs_readpage_release(struct nfs_page *req)
146{ 161{
147 struct inode *d_inode = req->wb_context->path.dentry->d_inode; 162 struct inode *d_inode = req->wb_context->dentry->d_inode;
148 163
149 if (PageUptodate(req->wb_page)) 164 if (PageUptodate(req->wb_page))
150 nfs_readpage_to_fscache(d_inode, req->wb_page, 0); 165 nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
@@ -152,8 +167,8 @@ static void nfs_readpage_release(struct nfs_page *req)
152 unlock_page(req->wb_page); 167 unlock_page(req->wb_page);
153 168
154 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", 169 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
155 req->wb_context->path.dentry->d_inode->i_sb->s_id, 170 req->wb_context->dentry->d_inode->i_sb->s_id,
156 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 171 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
157 req->wb_bytes, 172 req->wb_bytes,
158 (long long)req_offset(req)); 173 (long long)req_offset(req));
159 nfs_release_request(req); 174 nfs_release_request(req);
@@ -202,17 +217,14 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read);
202/* 217/*
203 * Set up the NFS read request struct 218 * Set up the NFS read request struct
204 */ 219 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, 220static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops, 221 unsigned int count, unsigned int offset)
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{ 222{
210 struct inode *inode = req->wb_context->path.dentry->d_inode; 223 struct inode *inode = req->wb_context->dentry->d_inode;
211 224
212 data->req = req; 225 data->req = req;
213 data->inode = inode; 226 data->inode = inode;
214 data->cred = req->wb_context->cred; 227 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
216 228
217 data->args.fh = NFS_FH(inode); 229 data->args.fh = NFS_FH(inode);
218 data->args.offset = req_offset(req) + offset; 230 data->args.offset = req_offset(req) + offset;
@@ -226,14 +238,36 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
226 data->res.count = count; 238 data->res.count = count;
227 data->res.eof = 0; 239 data->res.eof = 0;
228 nfs_fattr_init(&data->fattr); 240 nfs_fattr_init(&data->fattr);
241}
229 242
230 if (data->lseg && 243static int nfs_do_read(struct nfs_read_data *data,
231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) 244 const struct rpc_call_ops *call_ops)
232 return 0; 245{
246 struct inode *inode = data->args.context->dentry->d_inode;
233 247
234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); 248 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
235} 249}
236 250
251static int
252nfs_do_multiple_reads(struct list_head *head,
253 const struct rpc_call_ops *call_ops)
254{
255 struct nfs_read_data *data;
256 int ret = 0;
257
258 while (!list_empty(head)) {
259 int ret2;
260
261 data = list_entry(head->next, struct nfs_read_data, list);
262 list_del_init(&data->list);
263
264 ret2 = nfs_do_read(data, call_ops);
265 if (ret == 0)
266 ret = ret2;
267 }
268 return ret;
269}
270
237static void 271static void
238nfs_async_read_error(struct list_head *head) 272nfs_async_read_error(struct list_head *head)
239{ 273{
@@ -260,20 +294,19 @@ nfs_async_read_error(struct list_head *head)
260 * won't see the new data until our attribute cache is updated. This is more 294 * won't see the new data until our attribute cache is updated. This is more
261 * or less conventional NFS client behavior. 295 * or less conventional NFS client behavior.
262 */ 296 */
263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) 297static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
264{ 298{
265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 299 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
266 struct page *page = req->wb_page; 300 struct page *page = req->wb_page;
267 struct nfs_read_data *data; 301 struct nfs_read_data *data;
268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; 302 size_t rsize = desc->pg_bsize, nbytes;
269 unsigned int offset; 303 unsigned int offset;
270 int requests = 0; 304 int requests = 0;
271 int ret = 0; 305 int ret = 0;
272 struct pnfs_layout_segment *lseg;
273 LIST_HEAD(list);
274 306
275 nfs_list_remove_request(req); 307 nfs_list_remove_request(req);
276 308
309 offset = 0;
277 nbytes = desc->pg_count; 310 nbytes = desc->pg_count;
278 do { 311 do {
279 size_t len = min(nbytes,rsize); 312 size_t len = min(nbytes,rsize);
@@ -281,45 +314,21 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
281 data = nfs_readdata_alloc(1); 314 data = nfs_readdata_alloc(1);
282 if (!data) 315 if (!data)
283 goto out_bad; 316 goto out_bad;
284 list_add(&data->pages, &list); 317 data->pagevec[0] = page;
318 nfs_read_rpcsetup(req, data, len, offset);
319 list_add(&data->list, res);
285 requests++; 320 requests++;
286 nbytes -= len; 321 nbytes -= len;
322 offset += len;
287 } while(nbytes != 0); 323 } while(nbytes != 0);
288 atomic_set(&req->wb_complete, requests); 324 atomic_set(&req->wb_complete, requests);
289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
292 req_offset(req), desc->pg_count,
293 IOMODE_READ, GFP_KERNEL);
294 ClearPageError(page); 325 ClearPageError(page);
295 offset = 0; 326 desc->pg_rpc_callops = &nfs_read_partial_ops;
296 nbytes = desc->pg_count;
297 do {
298 int ret2;
299
300 data = list_entry(list.next, struct nfs_read_data, pages);
301 list_del_init(&data->pages);
302
303 data->pagevec[0] = page;
304
305 if (nbytes < rsize)
306 rsize = nbytes;
307 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
308 rsize, offset, lseg);
309 if (ret == 0)
310 ret = ret2;
311 offset += rsize;
312 nbytes -= rsize;
313 } while (nbytes != 0);
314 put_lseg(lseg);
315 desc->pg_lseg = NULL;
316
317 return ret; 327 return ret;
318
319out_bad: 328out_bad:
320 while (!list_empty(&list)) { 329 while (!list_empty(res)) {
321 data = list_entry(list.next, struct nfs_read_data, pages); 330 data = list_entry(res->next, struct nfs_read_data, list);
322 list_del(&data->pages); 331 list_del(&data->list);
323 nfs_readdata_free(data); 332 nfs_readdata_free(data);
324 } 333 }
325 SetPageError(page); 334 SetPageError(page);
@@ -327,19 +336,19 @@ out_bad:
327 return -ENOMEM; 336 return -ENOMEM;
328} 337}
329 338
330static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) 339static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
331{ 340{
332 struct nfs_page *req; 341 struct nfs_page *req;
333 struct page **pages; 342 struct page **pages;
334 struct nfs_read_data *data; 343 struct nfs_read_data *data;
335 struct list_head *head = &desc->pg_list; 344 struct list_head *head = &desc->pg_list;
336 struct pnfs_layout_segment *lseg = desc->pg_lseg; 345 int ret = 0;
337 int ret = -ENOMEM;
338 346
339 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, 347 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
340 desc->pg_count)); 348 desc->pg_count));
341 if (!data) { 349 if (!data) {
342 nfs_async_read_error(head); 350 nfs_async_read_error(head);
351 ret = -ENOMEM;
343 goto out; 352 goto out;
344 } 353 }
345 354
@@ -352,19 +361,37 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
352 *pages++ = req->wb_page; 361 *pages++ = req->wb_page;
353 } 362 }
354 req = nfs_list_entry(data->pages.next); 363 req = nfs_list_entry(data->pages.next);
355 if ((!lseg) && list_is_singular(&data->pages))
356 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
357 req_offset(req), desc->pg_count,
358 IOMODE_READ, GFP_KERNEL);
359 364
360 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 365 nfs_read_rpcsetup(req, data, desc->pg_count, 0);
361 0, lseg); 366 list_add(&data->list, res);
367 desc->pg_rpc_callops = &nfs_read_full_ops;
362out: 368out:
363 put_lseg(lseg);
364 desc->pg_lseg = NULL;
365 return ret; 369 return ret;
366} 370}
367 371
372int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head)
373{
374 if (desc->pg_bsize < PAGE_CACHE_SIZE)
375 return nfs_pagein_multi(desc, head);
376 return nfs_pagein_one(desc, head);
377}
378
379static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
380{
381 LIST_HEAD(head);
382 int ret;
383
384 ret = nfs_generic_pagein(desc, &head);
385 if (ret == 0)
386 ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops);
387 return ret;
388}
389
390static const struct nfs_pageio_ops nfs_pageio_read_ops = {
391 .pg_test = nfs_generic_pg_test,
392 .pg_doio = nfs_generic_pg_readpages,
393};
394
368/* 395/*
369 * This is the callback from RPC telling us whether a reply was 396 * This is the callback from RPC telling us whether a reply was
370 * received or some error occurred (timeout or socket shutdown). 397 * received or some error occurred (timeout or socket shutdown).
@@ -514,13 +541,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
514static void nfs_readpage_release_full(void *calldata) 541static void nfs_readpage_release_full(void *calldata)
515{ 542{
516 struct nfs_read_data *data = calldata; 543 struct nfs_read_data *data = calldata;
544 struct nfs_pageio_descriptor pgio;
517 545
546 if (data->pnfs_error) {
547 nfs_pageio_init_read_mds(&pgio, data->inode);
548 pgio.pg_recoalesce = 1;
549 }
518 while (!list_empty(&data->pages)) { 550 while (!list_empty(&data->pages)) {
519 struct nfs_page *req = nfs_list_entry(data->pages.next); 551 struct nfs_page *req = nfs_list_entry(data->pages.next);
520 552
521 nfs_list_remove_request(req); 553 nfs_list_remove_request(req);
522 nfs_readpage_release(req); 554 if (!data->pnfs_error)
555 nfs_readpage_release(req);
556 else
557 nfs_pageio_add_request(&pgio, req);
523 } 558 }
559 if (data->pnfs_error)
560 nfs_pageio_complete(&pgio);
524 nfs_readdata_release(calldata); 561 nfs_readdata_release(calldata);
525} 562}
526 563
@@ -635,8 +672,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
635 .pgio = &pgio, 672 .pgio = &pgio,
636 }; 673 };
637 struct inode *inode = mapping->host; 674 struct inode *inode = mapping->host;
638 struct nfs_server *server = NFS_SERVER(inode);
639 size_t rsize = server->rsize;
640 unsigned long npages; 675 unsigned long npages;
641 int ret = -ESTALE; 676 int ret = -ESTALE;
642 677
@@ -664,10 +699,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
664 if (ret == 0) 699 if (ret == 0)
665 goto read_complete; /* all pages were read */ 700 goto read_complete; /* all pages were read */
666 701
667 if (rsize < PAGE_CACHE_SIZE) 702 nfs_pageio_init_read(&pgio, inode);
668 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
669 else
670 nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);
671 703
672 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 704 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
673 705
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce40e5c568b..c4daf4eaad9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -904,10 +904,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
904 data->auth_flavor_len = 1; 904 data->auth_flavor_len = 1;
905 data->version = version; 905 data->version = version;
906 data->minorversion = 0; 906 data->minorversion = 0;
907 security_init_mnt_opts(&data->lsm_opts);
907 } 908 }
908 return data; 909 return data;
909} 910}
910 911
912static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data)
913{
914 if (data) {
915 kfree(data->client_address);
916 kfree(data->mount_server.hostname);
917 kfree(data->nfs_server.export_path);
918 kfree(data->nfs_server.hostname);
919 kfree(data->fscache_uniq);
920 security_free_mnt_opts(&data->lsm_opts);
921 kfree(data);
922 }
923}
924
911/* 925/*
912 * Sanity-check a server address provided by the mount command. 926 * Sanity-check a server address provided by the mount command.
913 * 927 *
@@ -2035,9 +2049,6 @@ static inline void nfs_initialise_sb(struct super_block *sb)
2035 sb->s_blocksize = nfs_block_bits(server->wsize, 2049 sb->s_blocksize = nfs_block_bits(server->wsize,
2036 &sb->s_blocksize_bits); 2050 &sb->s_blocksize_bits);
2037 2051
2038 if (server->flags & NFS_MOUNT_NOAC)
2039 sb->s_flags |= MS_SYNCHRONOUS;
2040
2041 sb->s_bdi = &server->backing_dev_info; 2052 sb->s_bdi = &server->backing_dev_info;
2042 2053
2043 nfs_super_set_maxbytes(sb, server->maxfilesize); 2054 nfs_super_set_maxbytes(sb, server->maxfilesize);
@@ -2218,9 +2229,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2218 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2229 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2219 mntfh = nfs_alloc_fhandle(); 2230 mntfh = nfs_alloc_fhandle();
2220 if (data == NULL || mntfh == NULL) 2231 if (data == NULL || mntfh == NULL)
2221 goto out_free_fh; 2232 goto out;
2222
2223 security_init_mnt_opts(&data->lsm_opts);
2224 2233
2225 /* Validate the mount data */ 2234 /* Validate the mount data */
2226 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2235 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
@@ -2232,8 +2241,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2232#ifdef CONFIG_NFS_V4 2241#ifdef CONFIG_NFS_V4
2233 if (data->version == 4) { 2242 if (data->version == 4) {
2234 mntroot = nfs4_try_mount(flags, dev_name, data); 2243 mntroot = nfs4_try_mount(flags, dev_name, data);
2235 kfree(data->client_address);
2236 kfree(data->nfs_server.export_path);
2237 goto out; 2244 goto out;
2238 } 2245 }
2239#endif /* CONFIG_NFS_V4 */ 2246#endif /* CONFIG_NFS_V4 */
@@ -2249,6 +2256,10 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2249 if (server->flags & NFS_MOUNT_UNSHARED) 2256 if (server->flags & NFS_MOUNT_UNSHARED)
2250 compare_super = NULL; 2257 compare_super = NULL;
2251 2258
2259 /* -o noac implies -o sync */
2260 if (server->flags & NFS_MOUNT_NOAC)
2261 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2262
2252 /* Get a superblock - note that we may end up sharing one that already exists */ 2263 /* Get a superblock - note that we may end up sharing one that already exists */
2253 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2264 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2254 if (IS_ERR(s)) { 2265 if (IS_ERR(s)) {
@@ -2284,13 +2295,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2284 s->s_flags |= MS_ACTIVE; 2295 s->s_flags |= MS_ACTIVE;
2285 2296
2286out: 2297out:
2287 kfree(data->nfs_server.hostname); 2298 nfs_free_parsed_mount_data(data);
2288 kfree(data->mount_server.hostname);
2289 kfree(data->fscache_uniq);
2290 security_free_mnt_opts(&data->lsm_opts);
2291out_free_fh:
2292 nfs_free_fhandle(mntfh); 2299 nfs_free_fhandle(mntfh);
2293 kfree(data);
2294 return mntroot; 2300 return mntroot;
2295 2301
2296out_err_nosb: 2302out_err_nosb:
@@ -2361,6 +2367,10 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2361 if (server->flags & NFS_MOUNT_UNSHARED) 2367 if (server->flags & NFS_MOUNT_UNSHARED)
2362 compare_super = NULL; 2368 compare_super = NULL;
2363 2369
2370 /* -o noac implies -o sync */
2371 if (server->flags & NFS_MOUNT_NOAC)
2372 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2373
2364 /* Get a superblock - note that we may end up sharing one that already exists */ 2374 /* Get a superblock - note that we may end up sharing one that already exists */
2365 s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2375 s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2366 if (IS_ERR(s)) { 2376 if (IS_ERR(s)) {
@@ -2613,9 +2623,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2613 2623
2614 mntfh = nfs_alloc_fhandle(); 2624 mntfh = nfs_alloc_fhandle();
2615 if (data == NULL || mntfh == NULL) 2625 if (data == NULL || mntfh == NULL)
2616 goto out_free_fh; 2626 goto out;
2617
2618 security_init_mnt_opts(&data->lsm_opts);
2619 2627
2620 /* Get a volume representation */ 2628 /* Get a volume representation */
2621 server = nfs4_create_server(data, mntfh); 2629 server = nfs4_create_server(data, mntfh);
@@ -2628,6 +2636,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2628 if (server->flags & NFS4_MOUNT_UNSHARED) 2636 if (server->flags & NFS4_MOUNT_UNSHARED)
2629 compare_super = NULL; 2637 compare_super = NULL;
2630 2638
2639 /* -o noac implies -o sync */
2640 if (server->flags & NFS_MOUNT_NOAC)
2641 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2642
2631 /* Get a superblock - note that we may end up sharing one that already exists */ 2643 /* Get a superblock - note that we may end up sharing one that already exists */
2632 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2644 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2633 if (IS_ERR(s)) { 2645 if (IS_ERR(s)) {
@@ -2663,13 +2675,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2663 2675
2664 s->s_flags |= MS_ACTIVE; 2676 s->s_flags |= MS_ACTIVE;
2665 2677
2666 security_free_mnt_opts(&data->lsm_opts);
2667 nfs_free_fhandle(mntfh); 2678 nfs_free_fhandle(mntfh);
2668 return mntroot; 2679 return mntroot;
2669 2680
2670out: 2681out:
2671 security_free_mnt_opts(&data->lsm_opts);
2672out_free_fh:
2673 nfs_free_fhandle(mntfh); 2682 nfs_free_fhandle(mntfh);
2674 return ERR_PTR(error); 2683 return ERR_PTR(error);
2675 2684
@@ -2773,16 +2782,12 @@ static void nfs_referral_loop_unprotect(void)
2773static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, 2782static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2774 const char *export_path) 2783 const char *export_path)
2775{ 2784{
2776 struct nameidata *nd = NULL;
2777 struct mnt_namespace *ns_private; 2785 struct mnt_namespace *ns_private;
2778 struct super_block *s; 2786 struct super_block *s;
2779 struct dentry *dentry; 2787 struct dentry *dentry;
2788 struct path path;
2780 int ret; 2789 int ret;
2781 2790
2782 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2783 if (nd == NULL)
2784 return ERR_PTR(-ENOMEM);
2785
2786 ns_private = create_mnt_ns(root_mnt); 2791 ns_private = create_mnt_ns(root_mnt);
2787 ret = PTR_ERR(ns_private); 2792 ret = PTR_ERR(ns_private);
2788 if (IS_ERR(ns_private)) 2793 if (IS_ERR(ns_private))
@@ -2793,7 +2798,7 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2793 goto out_put_mnt_ns; 2798 goto out_put_mnt_ns;
2794 2799
2795 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2800 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
2796 export_path, LOOKUP_FOLLOW, nd); 2801 export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2797 2802
2798 nfs_referral_loop_unprotect(); 2803 nfs_referral_loop_unprotect();
2799 put_mnt_ns(ns_private); 2804 put_mnt_ns(ns_private);
@@ -2801,12 +2806,11 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2801 if (ret != 0) 2806 if (ret != 0)
2802 goto out_err; 2807 goto out_err;
2803 2808
2804 s = nd->path.mnt->mnt_sb; 2809 s = path.mnt->mnt_sb;
2805 atomic_inc(&s->s_active); 2810 atomic_inc(&s->s_active);
2806 dentry = dget(nd->path.dentry); 2811 dentry = dget(path.dentry);
2807 2812
2808 path_put(&nd->path); 2813 path_put(&path);
2809 kfree(nd);
2810 down_write(&s->s_umount); 2814 down_write(&s->s_umount);
2811 return dentry; 2815 return dentry;
2812out_put_mnt_ns: 2816out_put_mnt_ns:
@@ -2814,7 +2818,6 @@ out_put_mnt_ns:
2814out_mntput: 2818out_mntput:
2815 mntput(root_mnt); 2819 mntput(root_mnt);
2816out_err: 2820out_err:
2817 kfree(nd);
2818 return ERR_PTR(ret); 2821 return ERR_PTR(ret);
2819} 2822}
2820 2823
@@ -2855,7 +2858,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2855 2858
2856 data = nfs_alloc_parsed_mount_data(4); 2859 data = nfs_alloc_parsed_mount_data(4);
2857 if (data == NULL) 2860 if (data == NULL)
2858 goto out_free_data; 2861 goto out;
2859 2862
2860 /* Validate the mount data */ 2863 /* Validate the mount data */
2861 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2864 error = nfs4_validate_mount_data(raw_data, data, dev_name);
@@ -2869,12 +2872,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2869 error = PTR_ERR(res); 2872 error = PTR_ERR(res);
2870 2873
2871out: 2874out:
2872 kfree(data->client_address); 2875 nfs_free_parsed_mount_data(data);
2873 kfree(data->nfs_server.export_path);
2874 kfree(data->nfs_server.hostname);
2875 kfree(data->fscache_uniq);
2876out_free_data:
2877 kfree(data);
2878 dprintk("<-- nfs4_mount() = %d%s\n", error, 2876 dprintk("<-- nfs4_mount() = %d%s\n", error,
2879 error != 0 ? " [error]" : ""); 2877 error != 0 ? " [error]" : "");
2880 return res; 2878 return res;
@@ -2922,6 +2920,10 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
2922 if (server->flags & NFS4_MOUNT_UNSHARED) 2920 if (server->flags & NFS4_MOUNT_UNSHARED)
2923 compare_super = NULL; 2921 compare_super = NULL;
2924 2922
2923 /* -o noac implies -o sync */
2924 if (server->flags & NFS_MOUNT_NOAC)
2925 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2926
2925 /* Get a superblock - note that we may end up sharing one that already exists */ 2927 /* Get a superblock - note that we may end up sharing one that already exists */
2926 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2928 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2927 if (IS_ERR(s)) { 2929 if (IS_ERR(s)) {
@@ -3009,6 +3011,10 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3009 if (server->flags & NFS4_MOUNT_UNSHARED) 3011 if (server->flags & NFS4_MOUNT_UNSHARED)
3010 compare_super = NULL; 3012 compare_super = NULL;
3011 3013
3014 /* -o noac implies -o sync */
3015 if (server->flags & NFS_MOUNT_NOAC)
3016 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
3017
3012 /* Get a superblock - note that we may end up sharing one that already exists */ 3018 /* Get a superblock - note that we may end up sharing one that already exists */
3013 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 3019 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
3014 if (IS_ERR(s)) { 3020 if (IS_ERR(s)) {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8d6864c2a5f..b2fbbde58e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -147,7 +147,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
147 147
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret;
151 void *devname_garbage = NULL; 151 void *devname_garbage = NULL;
152 152
153 /* 153 /*
@@ -155,14 +155,16 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
155 * the sillyrename information to the aliased dentry. 155 * the sillyrename information to the aliased dentry.
156 */ 156 */
157 nfs_free_dname(data); 157 nfs_free_dname(data);
158 ret = nfs_copy_dname(alias, data);
158 spin_lock(&alias->d_lock); 159 spin_lock(&alias->d_lock);
159 if (alias->d_inode != NULL && 160 if (ret == 0 && alias->d_inode != NULL &&
160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 161 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata; 162 devname_garbage = alias->d_fsdata;
162 alias->d_fsdata = data; 163 alias->d_fsdata = data;
163 alias->d_flags |= DCACHE_NFSFS_RENAMED; 164 alias->d_flags |= DCACHE_NFSFS_RENAMED;
164 ret = 1; 165 ret = 1;
165 } 166 } else
167 ret = 0;
166 spin_unlock(&alias->d_lock); 168 spin_unlock(&alias->d_lock);
167 nfs_dec_sillycount(dir); 169 nfs_dec_sillycount(dir);
168 dput(alias); 170 dput(alias);
@@ -171,8 +173,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
171 * point dentry is definitely not a root, so we won't need 173 * point dentry is definitely not a root, so we won't need
172 * that anymore. 174 * that anymore.
173 */ 175 */
174 if (devname_garbage) 176 kfree(devname_garbage);
175 kfree(devname_garbage);
176 return ret; 177 return ret;
177 } 178 }
178 data->dir = igrab(dir); 179 data->dir = igrab(dir);
@@ -204,8 +205,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
204 if (parent == NULL) 205 if (parent == NULL)
205 goto out_free; 206 goto out_free;
206 dir = parent->d_inode; 207 dir = parent->d_inode;
207 if (nfs_copy_dname(dentry, data) != 0)
208 goto out_dput;
209 /* Non-exclusive lock protects against concurrent lookup() calls */ 208 /* Non-exclusive lock protects against concurrent lookup() calls */
210 spin_lock(&dir->i_lock); 209 spin_lock(&dir->i_lock);
211 if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { 210 if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
@@ -366,6 +365,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
366 struct nfs_renamedata *data = calldata; 365 struct nfs_renamedata *data = calldata;
367 struct inode *old_dir = data->old_dir; 366 struct inode *old_dir = data->old_dir;
368 struct inode *new_dir = data->new_dir; 367 struct inode *new_dir = data->new_dir;
368 struct dentry *old_dentry = data->old_dentry;
369 struct dentry *new_dentry = data->new_dentry;
369 370
370 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { 371 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
371 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); 372 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
@@ -373,12 +374,12 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
373 } 374 }
374 375
375 if (task->tk_status != 0) { 376 if (task->tk_status != 0) {
376 nfs_cancel_async_unlink(data->old_dentry); 377 nfs_cancel_async_unlink(old_dentry);
377 return; 378 return;
378 } 379 }
379 380
380 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); 381 d_drop(old_dentry);
381 d_move(data->old_dentry, data->new_dentry); 382 d_drop(new_dentry);
382} 383}
383 384
384/** 385/**
@@ -501,6 +502,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
501 * and only performs the unlink once the last reference to it is put. 502 * and only performs the unlink once the last reference to it is put.
502 * 503 *
503 * The final cleanup is done during dentry_iput. 504 * The final cleanup is done during dentry_iput.
505 *
506 * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server
507 * could take responsibility for keeping open files referenced. The server
508 * would also need to ensure that opened-but-deleted files were kept over
509 * reboots. However, we may not assume a server does so. (RFC 5661
510 * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can
511 * use to advertise that it does this; some day we may take advantage of
512 * it.))
504 */ 513 */
505int 514int
506nfs_sillyrename(struct inode *dir, struct dentry *dentry) 515nfs_sillyrename(struct inode *dir, struct dentry *dentry)
@@ -560,6 +569,14 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
560 if (error) 569 if (error)
561 goto out_dput; 570 goto out_dput;
562 571
572 /* populate unlinkdata with the right dname */
573 error = nfs_copy_dname(sdentry,
574 (struct nfs_unlinkdata *)dentry->d_fsdata);
575 if (error) {
576 nfs_cancel_async_unlink(dentry);
577 goto out_dput;
578 }
579
563 /* run the rename task, undo unlink if it fails */ 580 /* run the rename task, undo unlink if it fails */
564 task = nfs_async_rename(dir, dir, dentry, sdentry); 581 task = nfs_async_rename(dir, dir, dentry, sdentry);
565 if (IS_ERR(task)) { 582 if (IS_ERR(task)) {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 72716805968..106fd0634ab 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -97,7 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
97 mempool_free(p, nfs_wdata_mempool); 97 mempool_free(p, nfs_wdata_mempool);
98} 98}
99 99
100static void nfs_writedata_release(struct nfs_write_data *wdata) 100void nfs_writedata_release(struct nfs_write_data *wdata)
101{ 101{
102 put_lseg(wdata->lseg); 102 put_lseg(wdata->lseg);
103 put_nfs_open_context(wdata->args.context); 103 put_nfs_open_context(wdata->args.context);
@@ -409,7 +409,7 @@ out:
409 */ 409 */
410static void nfs_inode_remove_request(struct nfs_page *req) 410static void nfs_inode_remove_request(struct nfs_page *req)
411{ 411{
412 struct inode *inode = req->wb_context->path.dentry->d_inode; 412 struct inode *inode = req->wb_context->dentry->d_inode;
413 struct nfs_inode *nfsi = NFS_I(inode); 413 struct nfs_inode *nfsi = NFS_I(inode);
414 414
415 BUG_ON (!NFS_WBACK_BUSY(req)); 415 BUG_ON (!NFS_WBACK_BUSY(req));
@@ -428,7 +428,6 @@ static void
428nfs_mark_request_dirty(struct nfs_page *req) 428nfs_mark_request_dirty(struct nfs_page *req)
429{ 429{
430 __set_page_dirty_nobuffers(req->wb_page); 430 __set_page_dirty_nobuffers(req->wb_page);
431 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
432} 431}
433 432
434#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 433#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -438,7 +437,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
438static void 437static void
439nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) 438nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
440{ 439{
441 struct inode *inode = req->wb_context->path.dentry->d_inode; 440 struct inode *inode = req->wb_context->dentry->d_inode;
442 struct nfs_inode *nfsi = NFS_I(inode); 441 struct nfs_inode *nfsi = NFS_I(inode);
443 442
444 spin_lock(&inode->i_lock); 443 spin_lock(&inode->i_lock);
@@ -762,6 +761,8 @@ int nfs_updatepage(struct file *file, struct page *page,
762 status = nfs_writepage_setup(ctx, page, offset, count); 761 status = nfs_writepage_setup(ctx, page, offset, count);
763 if (status < 0) 762 if (status < 0)
764 nfs_set_pageerror(page); 763 nfs_set_pageerror(page);
764 else
765 __set_page_dirty_nobuffers(page);
765 766
766 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 767 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
767 status, (long long)i_size_read(inode)); 768 status, (long long)i_size_read(inode));
@@ -845,22 +846,19 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write);
845/* 846/*
846 * Set up the argument/result storage required for the RPC call. 847 * Set up the argument/result storage required for the RPC call.
847 */ 848 */
848static int nfs_write_rpcsetup(struct nfs_page *req, 849static void nfs_write_rpcsetup(struct nfs_page *req,
849 struct nfs_write_data *data, 850 struct nfs_write_data *data,
850 const struct rpc_call_ops *call_ops,
851 unsigned int count, unsigned int offset, 851 unsigned int count, unsigned int offset,
852 struct pnfs_layout_segment *lseg,
853 int how) 852 int how)
854{ 853{
855 struct inode *inode = req->wb_context->path.dentry->d_inode; 854 struct inode *inode = req->wb_context->dentry->d_inode;
856 855
857 /* Set up the RPC argument and reply structs 856 /* Set up the RPC argument and reply structs
858 * NB: take care not to mess about with data->commit et al. */ 857 * NB: take care not to mess about with data->commit et al. */
859 858
860 data->req = req; 859 data->req = req;
861 data->inode = inode = req->wb_context->path.dentry->d_inode; 860 data->inode = inode = req->wb_context->dentry->d_inode;
862 data->cred = req->wb_context->cred; 861 data->cred = req->wb_context->cred;
863 data->lseg = get_lseg(lseg);
864 862
865 data->args.fh = NFS_FH(inode); 863 data->args.fh = NFS_FH(inode);
866 data->args.offset = req_offset(req) + offset; 864 data->args.offset = req_offset(req) + offset;
@@ -872,24 +870,51 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
872 data->args.context = get_nfs_open_context(req->wb_context); 870 data->args.context = get_nfs_open_context(req->wb_context);
873 data->args.lock_context = req->wb_lock_context; 871 data->args.lock_context = req->wb_lock_context;
874 data->args.stable = NFS_UNSTABLE; 872 data->args.stable = NFS_UNSTABLE;
875 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 873 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
876 data->args.stable = NFS_DATA_SYNC; 874 case 0:
877 if (!nfs_need_commit(NFS_I(inode))) 875 break;
878 data->args.stable = NFS_FILE_SYNC; 876 case FLUSH_COND_STABLE:
877 if (nfs_need_commit(NFS_I(inode)))
878 break;
879 default:
880 data->args.stable = NFS_FILE_SYNC;
879 } 881 }
880 882
881 data->res.fattr = &data->fattr; 883 data->res.fattr = &data->fattr;
882 data->res.count = count; 884 data->res.count = count;
883 data->res.verf = &data->verf; 885 data->res.verf = &data->verf;
884 nfs_fattr_init(&data->fattr); 886 nfs_fattr_init(&data->fattr);
887}
885 888
886 if (data->lseg && 889static int nfs_do_write(struct nfs_write_data *data,
887 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) 890 const struct rpc_call_ops *call_ops,
888 return 0; 891 int how)
892{
893 struct inode *inode = data->args.context->dentry->d_inode;
889 894
890 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); 895 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
891} 896}
892 897
898static int nfs_do_multiple_writes(struct list_head *head,
899 const struct rpc_call_ops *call_ops,
900 int how)
901{
902 struct nfs_write_data *data;
903 int ret = 0;
904
905 while (!list_empty(head)) {
906 int ret2;
907
908 data = list_entry(head->next, struct nfs_write_data, list);
909 list_del_init(&data->list);
910
911 ret2 = nfs_do_write(data, call_ops, how);
912 if (ret == 0)
913 ret = ret2;
914 }
915 return ret;
916}
917
893/* If a nfs_flush_* function fails, it should remove reqs from @head and 918/* If a nfs_flush_* function fails, it should remove reqs from @head and
894 * call this on each, which will prepare them to be retried on next 919 * call this on each, which will prepare them to be retried on next
895 * writeback using standard nfs. 920 * writeback using standard nfs.
@@ -907,17 +932,15 @@ static void nfs_redirty_request(struct nfs_page *req)
907 * Generate multiple small requests to write out a single 932 * Generate multiple small requests to write out a single
908 * contiguous dirty area on one page. 933 * contiguous dirty area on one page.
909 */ 934 */
910static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) 935static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
911{ 936{
912 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 937 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
913 struct page *page = req->wb_page; 938 struct page *page = req->wb_page;
914 struct nfs_write_data *data; 939 struct nfs_write_data *data;
915 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; 940 size_t wsize = desc->pg_bsize, nbytes;
916 unsigned int offset; 941 unsigned int offset;
917 int requests = 0; 942 int requests = 0;
918 int ret = 0; 943 int ret = 0;
919 struct pnfs_layout_segment *lseg;
920 LIST_HEAD(list);
921 944
922 nfs_list_remove_request(req); 945 nfs_list_remove_request(req);
923 946
@@ -927,6 +950,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
927 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 950 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
928 951
929 952
953 offset = 0;
930 nbytes = desc->pg_count; 954 nbytes = desc->pg_count;
931 do { 955 do {
932 size_t len = min(nbytes, wsize); 956 size_t len = min(nbytes, wsize);
@@ -934,45 +958,21 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
934 data = nfs_writedata_alloc(1); 958 data = nfs_writedata_alloc(1);
935 if (!data) 959 if (!data)
936 goto out_bad; 960 goto out_bad;
937 list_add(&data->pages, &list); 961 data->pagevec[0] = page;
962 nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
963 list_add(&data->list, res);
938 requests++; 964 requests++;
939 nbytes -= len; 965 nbytes -= len;
966 offset += len;
940 } while (nbytes != 0); 967 } while (nbytes != 0);
941 atomic_set(&req->wb_complete, requests); 968 atomic_set(&req->wb_complete, requests);
942 969 desc->pg_rpc_callops = &nfs_write_partial_ops;
943 BUG_ON(desc->pg_lseg);
944 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
945 req_offset(req), desc->pg_count,
946 IOMODE_RW, GFP_NOFS);
947 ClearPageError(page);
948 offset = 0;
949 nbytes = desc->pg_count;
950 do {
951 int ret2;
952
953 data = list_entry(list.next, struct nfs_write_data, pages);
954 list_del_init(&data->pages);
955
956 data->pagevec[0] = page;
957
958 if (nbytes < wsize)
959 wsize = nbytes;
960 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
961 wsize, offset, lseg, desc->pg_ioflags);
962 if (ret == 0)
963 ret = ret2;
964 offset += wsize;
965 nbytes -= wsize;
966 } while (nbytes != 0);
967
968 put_lseg(lseg);
969 desc->pg_lseg = NULL;
970 return ret; 970 return ret;
971 971
972out_bad: 972out_bad:
973 while (!list_empty(&list)) { 973 while (!list_empty(res)) {
974 data = list_entry(list.next, struct nfs_write_data, pages); 974 data = list_entry(res->next, struct nfs_write_data, list);
975 list_del(&data->pages); 975 list_del(&data->list);
976 nfs_writedata_free(data); 976 nfs_writedata_free(data);
977 } 977 }
978 nfs_redirty_request(req); 978 nfs_redirty_request(req);
@@ -987,14 +987,13 @@ out_bad:
987 * This is the case if nfs_updatepage detects a conflicting request 987 * This is the case if nfs_updatepage detects a conflicting request
988 * that has been written but not committed. 988 * that has been written but not committed.
989 */ 989 */
990static int nfs_flush_one(struct nfs_pageio_descriptor *desc) 990static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
991{ 991{
992 struct nfs_page *req; 992 struct nfs_page *req;
993 struct page **pages; 993 struct page **pages;
994 struct nfs_write_data *data; 994 struct nfs_write_data *data;
995 struct list_head *head = &desc->pg_list; 995 struct list_head *head = &desc->pg_list;
996 struct pnfs_layout_segment *lseg = desc->pg_lseg; 996 int ret = 0;
997 int ret;
998 997
999 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, 998 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
1000 desc->pg_count)); 999 desc->pg_count));
@@ -1016,32 +1015,62 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
1016 *pages++ = req->wb_page; 1015 *pages++ = req->wb_page;
1017 } 1016 }
1018 req = nfs_list_entry(data->pages.next); 1017 req = nfs_list_entry(data->pages.next);
1019 if ((!lseg) && list_is_singular(&data->pages))
1020 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1021 req_offset(req), desc->pg_count,
1022 IOMODE_RW, GFP_NOFS);
1023 1018
1024 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1019 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1025 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1020 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
1026 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 1021 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
1027 1022
1028 /* Set up the argument struct */ 1023 /* Set up the argument struct */
1029 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); 1024 nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags);
1025 list_add(&data->list, res);
1026 desc->pg_rpc_callops = &nfs_write_full_ops;
1030out: 1027out:
1031 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
1032 desc->pg_lseg = NULL;
1033 return ret; 1028 return ret;
1034} 1029}
1035 1030
1036static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1031int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head)
1032{
1033 if (desc->pg_bsize < PAGE_CACHE_SIZE)
1034 return nfs_flush_multi(desc, head);
1035 return nfs_flush_one(desc, head);
1036}
1037
1038static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1039{
1040 LIST_HEAD(head);
1041 int ret;
1042
1043 ret = nfs_generic_flush(desc, &head);
1044 if (ret == 0)
1045 ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops,
1046 desc->pg_ioflags);
1047 return ret;
1048}
1049
1050static const struct nfs_pageio_ops nfs_pageio_write_ops = {
1051 .pg_test = nfs_generic_pg_test,
1052 .pg_doio = nfs_generic_pg_writepages,
1053};
1054
1055static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
1037 struct inode *inode, int ioflags) 1056 struct inode *inode, int ioflags)
1038{ 1057{
1039 size_t wsize = NFS_SERVER(inode)->wsize; 1058 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
1059 NFS_SERVER(inode)->wsize, ioflags);
1060}
1040 1061
1041 if (wsize < PAGE_CACHE_SIZE) 1062void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
1042 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1063{
1043 else 1064 pgio->pg_ops = &nfs_pageio_write_ops;
1044 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); 1065 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
1066}
1067EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1068
1069static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1070 struct inode *inode, int ioflags)
1071{
1072 if (!pnfs_pageio_init_write(pgio, inode, ioflags))
1073 nfs_pageio_init_write_mds(pgio, inode, ioflags);
1045} 1074}
1046 1075
1047/* 1076/*
@@ -1053,9 +1082,9 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
1053 1082
1054 dprintk("NFS: %5u write(%s/%lld %d@%lld)", 1083 dprintk("NFS: %5u write(%s/%lld %d@%lld)",
1055 task->tk_pid, 1084 task->tk_pid,
1056 data->req->wb_context->path.dentry->d_inode->i_sb->s_id, 1085 data->req->wb_context->dentry->d_inode->i_sb->s_id,
1057 (long long) 1086 (long long)
1058 NFS_FILEID(data->req->wb_context->path.dentry->d_inode), 1087 NFS_FILEID(data->req->wb_context->dentry->d_inode),
1059 data->req->wb_bytes, (long long)req_offset(data->req)); 1088 data->req->wb_bytes, (long long)req_offset(data->req));
1060 1089
1061 nfs_writeback_done(task, data); 1090 nfs_writeback_done(task, data);
@@ -1137,7 +1166,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1137static void nfs_writeback_release_full(void *calldata) 1166static void nfs_writeback_release_full(void *calldata)
1138{ 1167{
1139 struct nfs_write_data *data = calldata; 1168 struct nfs_write_data *data = calldata;
1140 int status = data->task.tk_status; 1169 int ret, status = data->task.tk_status;
1170 struct nfs_pageio_descriptor pgio;
1171
1172 if (data->pnfs_error) {
1173 nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
1174 pgio.pg_recoalesce = 1;
1175 }
1141 1176
1142 /* Update attributes as result of writeback. */ 1177 /* Update attributes as result of writeback. */
1143 while (!list_empty(&data->pages)) { 1178 while (!list_empty(&data->pages)) {
@@ -1148,11 +1183,16 @@ static void nfs_writeback_release_full(void *calldata)
1148 1183
1149 dprintk("NFS: %5u write (%s/%lld %d@%lld)", 1184 dprintk("NFS: %5u write (%s/%lld %d@%lld)",
1150 data->task.tk_pid, 1185 data->task.tk_pid,
1151 req->wb_context->path.dentry->d_inode->i_sb->s_id, 1186 req->wb_context->dentry->d_inode->i_sb->s_id,
1152 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 1187 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
1153 req->wb_bytes, 1188 req->wb_bytes,
1154 (long long)req_offset(req)); 1189 (long long)req_offset(req));
1155 1190
1191 if (data->pnfs_error) {
1192 dprintk(", pnfs error = %d\n", data->pnfs_error);
1193 goto next;
1194 }
1195
1156 if (status < 0) { 1196 if (status < 0) {
1157 nfs_set_pageerror(page); 1197 nfs_set_pageerror(page);
1158 nfs_context_set_write_error(req->wb_context, status); 1198 nfs_context_set_write_error(req->wb_context, status);
@@ -1172,7 +1212,19 @@ remove_request:
1172 next: 1212 next:
1173 nfs_clear_page_tag_locked(req); 1213 nfs_clear_page_tag_locked(req);
1174 nfs_end_page_writeback(page); 1214 nfs_end_page_writeback(page);
1215 if (data->pnfs_error) {
1216 lock_page(page);
1217 nfs_pageio_cond_complete(&pgio, page->index);
1218 ret = nfs_page_async_flush(&pgio, page, 0);
1219 if (ret) {
1220 nfs_set_pageerror(page);
1221 dprintk("rewrite to MDS error = %d\n", ret);
1222 }
1223 unlock_page(page);
1224 }
1175 } 1225 }
1226 if (data->pnfs_error)
1227 nfs_pageio_complete(&pgio);
1176 nfs_writedata_release(calldata); 1228 nfs_writedata_release(calldata);
1177} 1229}
1178 1230
@@ -1347,7 +1399,7 @@ void nfs_init_commit(struct nfs_write_data *data,
1347 struct pnfs_layout_segment *lseg) 1399 struct pnfs_layout_segment *lseg)
1348{ 1400{
1349 struct nfs_page *first = nfs_list_entry(head->next); 1401 struct nfs_page *first = nfs_list_entry(head->next);
1350 struct inode *inode = first->wb_context->path.dentry->d_inode; 1402 struct inode *inode = first->wb_context->dentry->d_inode;
1351 1403
1352 /* Set up the RPC argument and reply structs 1404 /* Set up the RPC argument and reply structs
1353 * NB: take care not to mess about with data->commit et al. */ 1405 * NB: take care not to mess about with data->commit et al. */
@@ -1435,8 +1487,8 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
1435 nfs_clear_request_commit(req); 1487 nfs_clear_request_commit(req);
1436 1488
1437 dprintk("NFS: commit (%s/%lld %d@%lld)", 1489 dprintk("NFS: commit (%s/%lld %d@%lld)",
1438 req->wb_context->path.dentry->d_inode->i_sb->s_id, 1490 req->wb_context->dentry->d_sb->s_id,
1439 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 1491 (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
1440 req->wb_bytes, 1492 req->wb_bytes,
1441 (long long)req_offset(req)); 1493 (long long)req_offset(req));
1442 if (status < 0) { 1494 if (status < 0) {
@@ -1525,6 +1577,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1525 int flags = FLUSH_SYNC; 1577 int flags = FLUSH_SYNC;
1526 int ret = 0; 1578 int ret = 0;
1527 1579
1580 /* no commits means nothing needs to be done */
1581 if (!nfsi->ncommit)
1582 return ret;
1583
1528 if (wbc->sync_mode == WB_SYNC_NONE) { 1584 if (wbc->sync_mode == WB_SYNC_NONE) {
1529 /* Don't commit yet if this is a non-blocking flush and there 1585 /* Don't commit yet if this is a non-blocking flush and there
1530 * are a lot of outstanding writes for this mapping. 1586 * are a lot of outstanding writes for this mapping.
@@ -1566,8 +1622,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1566 int status; 1622 int status;
1567 bool sync = true; 1623 bool sync = true;
1568 1624
1569 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || 1625 if (wbc->sync_mode == WB_SYNC_NONE)
1570 wbc->for_background)
1571 sync = false; 1626 sync = false;
1572 1627
1573 status = pnfs_layoutcommit_inode(inode, sync); 1628 status = pnfs_layoutcommit_inode(inode, sync);
@@ -1659,34 +1714,20 @@ out_error:
1659int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1714int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1660 struct page *page) 1715 struct page *page)
1661{ 1716{
1662 struct nfs_page *req; 1717 /*
1663 int ret; 1718 * If PagePrivate is set, then the page is currently associated with
1719 * an in-progress read or write request. Don't try to migrate it.
1720 *
1721 * FIXME: we could do this in principle, but we'll need a way to ensure
1722 * that we can safely release the inode reference while holding
1723 * the page lock.
1724 */
1725 if (PagePrivate(page))
1726 return -EBUSY;
1664 1727
1665 nfs_fscache_release_page(page, GFP_KERNEL); 1728 nfs_fscache_release_page(page, GFP_KERNEL);
1666 1729
1667 req = nfs_find_and_lock_request(page, false); 1730 return migrate_page(mapping, newpage, page);
1668 ret = PTR_ERR(req);
1669 if (IS_ERR(req))
1670 goto out;
1671
1672 ret = migrate_page(mapping, newpage, page);
1673 if (!req)
1674 goto out;
1675 if (ret)
1676 goto out_unlock;
1677 page_cache_get(newpage);
1678 spin_lock(&mapping->host->i_lock);
1679 req->wb_page = newpage;
1680 SetPagePrivate(newpage);
1681 set_page_private(newpage, (unsigned long)req);
1682 ClearPagePrivate(page);
1683 set_page_private(page, 0);
1684 spin_unlock(&mapping->host->i_lock);
1685 page_cache_release(page);
1686out_unlock:
1687 nfs_clear_page_tag_locked(req);
1688out:
1689 return ret;
1690} 1731}
1691#endif 1732#endif
1692 1733