aboutsummaryrefslogtreecommitdiffstats
path: root/fs/nfs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-07-31 12:26:50 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-31 12:26:50 -0400
commit24c3047095fa3954f114bfff2e37b8fcbb216396 (patch)
treea2263a4425d511ae619ca8b055705261dab9ec12 /fs/nfs
parent6581058f44533f9d45548bcfe986c125376859e9 (diff)
parent71cdd40fd498f12679070def668f6a4719ddbd1c (diff)
Merge branch 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
* 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (28 commits) pnfsblock: write_pagelist handle zero invalid extents pnfsblock: note written INVAL areas for layoutcommit pnfsblock: bl_write_pagelist pnfsblock: bl_read_pagelist pnfsblock: cleanup_layoutcommit pnfsblock: encode_layoutcommit pnfsblock: merge rw extents pnfsblock: add extent manipulation functions pnfsblock: bl_find_get_extent pnfsblock: xdr decode pnfs_block_layout4 pnfsblock: call and parse getdevicelist pnfsblock: merge extents pnfsblock: lseg alloc and free pnfsblock: remove device operations pnfsblock: add device operations pnfsblock: basic extent code pnfsblock: use pageio_ops api pnfsblock: add blocklayout Kconfig option, Makefile, and stubs pnfs: cleanup_layoutcommit pnfs: ask for layout_blksize and save it in nfs_server ...
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/Kconfig8
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/blocklayout/Makefile5
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1019
-rw-r--r--fs/nfs/blocklayout/blocklayout.h207
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c410
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c111
-rw-r--r--fs/nfs/blocklayout/extents.c935
-rw-r--r--fs/nfs/client.c11
-rw-r--r--fs/nfs/dir.c57
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4filelayout.c2
-rw-r--r--fs/nfs/nfs4proc.c62
-rw-r--r--fs/nfs/nfs4xdr.c233
-rw-r--r--fs/nfs/pnfs.c86
-rw-r--r--fs/nfs/pnfs.h28
16 files changed, 3090 insertions, 87 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2cde5d954750..be020771c6b4 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -79,15 +79,21 @@ config NFS_V4_1
79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select SUNRPC_BACKCHANNEL 80 select SUNRPC_BACKCHANNEL
81 select PNFS_FILE_LAYOUT 81 select PNFS_FILE_LAYOUT
82 select PNFS_BLOCK
83 select MD
84 select BLK_DEV_DM
82 help 85 help
83 This option enables support for minor version 1 of the NFSv4 protocol 86 This option enables support for minor version 1 of the NFSv4 protocol
84 (RFC 5661) in the kernel's NFS client. 87 (RFC 5661 and RFC 5663) in the kernel's NFS client.
85 88
86 If unsure, say N. 89 If unsure, say N.
87 90
88config PNFS_FILE_LAYOUT 91config PNFS_FILE_LAYOUT
89 tristate 92 tristate
90 93
94config PNFS_BLOCK
95 tristate
96
91config PNFS_OBJLAYOUT 97config PNFS_OBJLAYOUT
92 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" 98 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
93 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD 99 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6a34f7dd0e6f..b58613d0abb3 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
24 24
25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
26obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 000000000000..d5815505c020
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS block layout driver kernel module
3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 000000000000..e56564d2ef95
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1019 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/module.h>
34#include <linux/init.h>
35#include <linux/mount.h>
36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39
40#include "blocklayout.h"
41
42#define NFSDBG_FACILITY NFSDBG_PNFS_LD
43
44MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
46MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
47
48struct dentry *bl_device_pipe;
49wait_queue_head_t bl_wq;
50
/*
 * Debugging aid: emit the address of @page followed by a selection of its
 * page flags through dprintk(), one flag per line.
 */
static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);

	dprintk(" PagePrivate %d\n", PagePrivate(page));
	dprintk(" PageUptodate %d\n", PageUptodate(page));
	dprintk(" PageError %d\n", PageError(page));
	dprintk(" PageDirty %d\n", PageDirty(page));
	dprintk(" PageReferenced %d\n", PageReferenced(page));
	dprintk(" PageLocked %d\n", PageLocked(page));
	dprintk(" PageWriteback %d\n", PageWriteback(page));
	dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));

	dprintk("\n");
}
64
65/* Given the be associated with isect, determine if page data needs to be
66 * initialized.
67 */
68static int is_hole(struct pnfs_block_extent *be, sector_t isect)
69{
70 if (be->be_state == PNFS_BLOCK_NONE_DATA)
71 return 1;
72 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
73 return 0;
74 else
75 return !bl_is_sector_init(be->be_inval, isect);
76}
77
78/* Given the be associated with isect, determine if page data can be
79 * written to disk.
80 */
81static int is_writable(struct pnfs_block_extent *be, sector_t isect)
82{
83 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
84 be->be_state == PNFS_BLOCK_INVALID_DATA);
85}
86
87/* The data we are handed might be spread across several bios. We need
88 * to track when the last one is finished.
89 */
90struct parallel_io {
91 struct kref refcnt;
92 struct rpc_call_ops call_ops;
93 void (*pnfs_callback) (void *data);
94 void *data;
95};
96
97static inline struct parallel_io *alloc_parallel(void *data)
98{
99 struct parallel_io *rv;
100
101 rv = kmalloc(sizeof(*rv), GFP_NOFS);
102 if (rv) {
103 rv->data = data;
104 kref_init(&rv->refcnt);
105 }
106 return rv;
107}
108
109static inline void get_parallel(struct parallel_io *p)
110{
111 kref_get(&p->refcnt);
112}
113
114static void destroy_parallel(struct kref *kref)
115{
116 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
117
118 dprintk("%s enter\n", __func__);
119 p->pnfs_callback(p->data);
120 kfree(p);
121}
122
123static inline void put_parallel(struct parallel_io *p)
124{
125 kref_put(&p->refcnt, destroy_parallel);
126}
127
128static struct bio *
129bl_submit_bio(int rw, struct bio *bio)
130{
131 if (bio) {
132 get_parallel(bio->bi_private);
133 dprintk("%s submitting %s bio %u@%llu\n", __func__,
134 rw == READ ? "read" : "write",
135 bio->bi_size, (unsigned long long)bio->bi_sector);
136 submit_bio(rw, bio);
137 }
138 return NULL;
139}
140
141static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
142 struct pnfs_block_extent *be,
143 void (*end_io)(struct bio *, int err),
144 struct parallel_io *par)
145{
146 struct bio *bio;
147
148 bio = bio_alloc(GFP_NOIO, npg);
149 if (!bio)
150 return NULL;
151
152 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
153 bio->bi_bdev = be->be_mdev;
154 bio->bi_end_io = end_io;
155 bio->bi_private = par;
156 return bio;
157}
158
159static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
160 sector_t isect, struct page *page,
161 struct pnfs_block_extent *be,
162 void (*end_io)(struct bio *, int err),
163 struct parallel_io *par)
164{
165retry:
166 if (!bio) {
167 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
168 if (!bio)
169 return ERR_PTR(-ENOMEM);
170 }
171 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
172 bio = bl_submit_bio(rw, bio);
173 goto retry;
174 }
175 return bio;
176}
177
178static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
179{
180 if (lseg->pls_range.iomode == IOMODE_RW) {
181 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
182 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
183 } else {
184 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
185 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
186 }
187}
188
189/* This is basically copied from mpage_end_io_read */
190static void bl_end_io_read(struct bio *bio, int err)
191{
192 struct parallel_io *par = bio->bi_private;
193 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
194 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
195 struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
196
197 do {
198 struct page *page = bvec->bv_page;
199
200 if (--bvec >= bio->bi_io_vec)
201 prefetchw(&bvec->bv_page->flags);
202 if (uptodate)
203 SetPageUptodate(page);
204 } while (bvec >= bio->bi_io_vec);
205 if (!uptodate) {
206 if (!rdata->pnfs_error)
207 rdata->pnfs_error = -EIO;
208 bl_set_lo_fail(rdata->lseg);
209 }
210 bio_put(bio);
211 put_parallel(par);
212}
213
214static void bl_read_cleanup(struct work_struct *work)
215{
216 struct rpc_task *task;
217 struct nfs_read_data *rdata;
218 dprintk("%s enter\n", __func__);
219 task = container_of(work, struct rpc_task, u.tk_work);
220 rdata = container_of(task, struct nfs_read_data, task);
221 pnfs_ld_read_done(rdata);
222}
223
224static void
225bl_end_par_io_read(void *data)
226{
227 struct nfs_read_data *rdata = data;
228
229 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
230 schedule_work(&rdata->task.u.tk_work);
231}
232
/*
 * Replacement for the regular .rpc_call_done callback, which must not run
 * for I/O handled by this driver.  Intentionally empty.
 */
static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
{
}
240
241static enum pnfs_try_status
242bl_read_pagelist(struct nfs_read_data *rdata)
243{
244 int i, hole;
245 struct bio *bio = NULL;
246 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
247 sector_t isect, extent_length = 0;
248 struct parallel_io *par;
249 loff_t f_offset = rdata->args.offset;
250 size_t count = rdata->args.count;
251 struct page **pages = rdata->args.pages;
252 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
253
254 dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
255 rdata->npages, f_offset, count);
256
257 par = alloc_parallel(rdata);
258 if (!par)
259 goto use_mds;
260 par->call_ops = *rdata->mds_ops;
261 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
262 par->pnfs_callback = bl_end_par_io_read;
263 /* At this point, we can no longer jump to use_mds */
264
265 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
266 /* Code assumes extents are page-aligned */
267 for (i = pg_index; i < rdata->npages; i++) {
268 if (!extent_length) {
269 /* We've used up the previous extent */
270 bl_put_extent(be);
271 bl_put_extent(cow_read);
272 bio = bl_submit_bio(READ, bio);
273 /* Get the next one */
274 be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
275 isect, &cow_read);
276 if (!be) {
277 rdata->pnfs_error = -EIO;
278 goto out;
279 }
280 extent_length = be->be_length -
281 (isect - be->be_f_offset);
282 if (cow_read) {
283 sector_t cow_length = cow_read->be_length -
284 (isect - cow_read->be_f_offset);
285 extent_length = min(extent_length, cow_length);
286 }
287 }
288 hole = is_hole(be, isect);
289 if (hole && !cow_read) {
290 bio = bl_submit_bio(READ, bio);
291 /* Fill hole w/ zeroes w/o accessing device */
292 dprintk("%s Zeroing page for hole\n", __func__);
293 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
294 print_page(pages[i]);
295 SetPageUptodate(pages[i]);
296 } else {
297 struct pnfs_block_extent *be_read;
298
299 be_read = (hole && cow_read) ? cow_read : be;
300 bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
301 isect, pages[i], be_read,
302 bl_end_io_read, par);
303 if (IS_ERR(bio)) {
304 rdata->pnfs_error = PTR_ERR(bio);
305 goto out;
306 }
307 }
308 isect += PAGE_CACHE_SECTORS;
309 extent_length -= PAGE_CACHE_SECTORS;
310 }
311 if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
312 rdata->res.eof = 1;
313 rdata->res.count = rdata->inode->i_size - f_offset;
314 } else {
315 rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
316 }
317out:
318 bl_put_extent(be);
319 bl_put_extent(cow_read);
320 bl_submit_bio(READ, bio);
321 put_parallel(par);
322 return PNFS_ATTEMPTED;
323
324 use_mds:
325 dprintk("Giving up and using normal NFS\n");
326 return PNFS_NOT_ATTEMPTED;
327}
328
329static void mark_extents_written(struct pnfs_block_layout *bl,
330 __u64 offset, __u32 count)
331{
332 sector_t isect, end;
333 struct pnfs_block_extent *be;
334
335 dprintk("%s(%llu, %u)\n", __func__, offset, count);
336 if (count == 0)
337 return;
338 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
339 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
340 end >>= SECTOR_SHIFT;
341 while (isect < end) {
342 sector_t len;
343 be = bl_find_get_extent(bl, isect, NULL);
344 BUG_ON(!be); /* FIXME */
345 len = min(end, be->be_f_offset + be->be_length) - isect;
346 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
347 bl_mark_for_commit(be, isect, len); /* What if fails? */
348 isect += len;
349 bl_put_extent(be);
350 }
351}
352
353static void bl_end_io_write_zero(struct bio *bio, int err)
354{
355 struct parallel_io *par = bio->bi_private;
356 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
357 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
358 struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
359
360 do {
361 struct page *page = bvec->bv_page;
362
363 if (--bvec >= bio->bi_io_vec)
364 prefetchw(&bvec->bv_page->flags);
365 /* This is the zeroing page we added */
366 end_page_writeback(page);
367 page_cache_release(page);
368 } while (bvec >= bio->bi_io_vec);
369 if (!uptodate) {
370 if (!wdata->pnfs_error)
371 wdata->pnfs_error = -EIO;
372 bl_set_lo_fail(wdata->lseg);
373 }
374 bio_put(bio);
375 put_parallel(par);
376}
377
378/* This is basically copied from mpage_end_io_read */
379static void bl_end_io_write(struct bio *bio, int err)
380{
381 struct parallel_io *par = bio->bi_private;
382 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
383 struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
384
385 if (!uptodate) {
386 if (!wdata->pnfs_error)
387 wdata->pnfs_error = -EIO;
388 bl_set_lo_fail(wdata->lseg);
389 }
390 bio_put(bio);
391 put_parallel(par);
392}
393
394/* Function scheduled for call during bl_end_par_io_write,
395 * it marks sectors as written and extends the commitlist.
396 */
397static void bl_write_cleanup(struct work_struct *work)
398{
399 struct rpc_task *task;
400 struct nfs_write_data *wdata;
401 dprintk("%s enter\n", __func__);
402 task = container_of(work, struct rpc_task, u.tk_work);
403 wdata = container_of(task, struct nfs_write_data, task);
404 if (!wdata->pnfs_error) {
405 /* Marks for LAYOUTCOMMIT */
406 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
407 wdata->args.offset, wdata->args.count);
408 }
409 pnfs_ld_write_done(wdata);
410}
411
412/* Called when last of bios associated with a bl_write_pagelist call finishes */
413static void bl_end_par_io_write(void *data)
414{
415 struct nfs_write_data *wdata = data;
416
417 wdata->task.tk_status = 0;
418 wdata->verf.committed = NFS_FILE_SYNC;
419 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
420 schedule_work(&wdata->task.u.tk_work);
421}
422
/*
 * FIXME STUB - should mark the intersection of layout and page as bad so
 * that it is not used again.  Currently does nothing.
 */
static void mark_bad_read(void)
{
}
430
431/*
432 * map_block: map a requested I/0 block (isect) into an offset in the LVM
433 * block_device
434 */
435static void
436map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
437{
438 dprintk("%s enter be=%p\n", __func__, be);
439
440 set_buffer_mapped(bh);
441 bh->b_bdev = be->be_mdev;
442 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
443 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
444
445 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
446 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
447 bh->b_size);
448 return;
449}
450
451/* Given an unmapped page, zero it or read in page for COW, page is locked
452 * by caller.
453 */
454static int
455init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
456{
457 struct buffer_head *bh = NULL;
458 int ret = 0;
459 sector_t isect;
460
461 dprintk("%s enter, %p\n", __func__, page);
462 BUG_ON(PageUptodate(page));
463 if (!cow_read) {
464 zero_user_segment(page, 0, PAGE_SIZE);
465 SetPageUptodate(page);
466 goto cleanup;
467 }
468
469 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
470 if (!bh) {
471 ret = -ENOMEM;
472 goto cleanup;
473 }
474
475 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
476 map_block(bh, isect, cow_read);
477 if (!bh_uptodate_or_lock(bh))
478 ret = bh_submit_read(bh);
479 if (ret)
480 goto cleanup;
481 SetPageUptodate(page);
482
483cleanup:
484 bl_put_extent(cow_read);
485 if (bh)
486 free_buffer_head(bh);
487 if (ret) {
488 /* Need to mark layout with bad read...should now
489 * just use nfs4 for reads and writes.
490 */
491 mark_bad_read();
492 }
493 return ret;
494}
495
496static enum pnfs_try_status
497bl_write_pagelist(struct nfs_write_data *wdata, int sync)
498{
499 int i, ret, npg_zero, pg_index, last = 0;
500 struct bio *bio = NULL;
501 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
502 sector_t isect, last_isect = 0, extent_length = 0;
503 struct parallel_io *par;
504 loff_t offset = wdata->args.offset;
505 size_t count = wdata->args.count;
506 struct page **pages = wdata->args.pages;
507 struct page *page;
508 pgoff_t index;
509 u64 temp;
510 int npg_per_block =
511 NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
512
513 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
514 /* At this point, wdata->pages is a (sequential) list of nfs_pages.
515 * We want to write each, and if there is an error set pnfs_error
516 * to have it redone using nfs.
517 */
518 par = alloc_parallel(wdata);
519 if (!par)
520 return PNFS_NOT_ATTEMPTED;
521 par->call_ops = *wdata->mds_ops;
522 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
523 par->pnfs_callback = bl_end_par_io_write;
524 /* At this point, have to be more careful with error handling */
525
526 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
527 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
528 if (!be || !is_writable(be, isect)) {
529 dprintk("%s no matching extents!\n", __func__);
530 wdata->pnfs_error = -EINVAL;
531 goto out;
532 }
533
534 /* First page inside INVALID extent */
535 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
536 temp = offset >> PAGE_CACHE_SHIFT;
537 npg_zero = do_div(temp, npg_per_block);
538 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
539 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
540 extent_length = be->be_length - (isect - be->be_f_offset);
541
542fill_invalid_ext:
543 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
544 for (;npg_zero > 0; npg_zero--) {
545 /* page ref released in bl_end_io_write_zero */
546 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
547 dprintk("%s zero %dth page: index %lu isect %llu\n",
548 __func__, npg_zero, index,
549 (unsigned long long)isect);
550 page =
551 find_or_create_page(wdata->inode->i_mapping, index,
552 GFP_NOFS);
553 if (!page) {
554 dprintk("%s oom\n", __func__);
555 wdata->pnfs_error = -ENOMEM;
556 goto out;
557 }
558
559 /* PageDirty: Other will write this out
560 * PageWriteback: Other is writing this out
561 * PageUptodate: It was read before
562 * sector_initialized: already written out
563 */
564 if (PageDirty(page) || PageWriteback(page) ||
565 bl_is_sector_init(be->be_inval, isect)) {
566 print_page(page);
567 unlock_page(page);
568 page_cache_release(page);
569 goto next_page;
570 }
571 if (!PageUptodate(page)) {
572 /* New page, readin or zero it */
573 init_page_for_write(page, cow_read);
574 }
575 set_page_writeback(page);
576 unlock_page(page);
577
578 ret = bl_mark_sectors_init(be->be_inval, isect,
579 PAGE_CACHE_SECTORS,
580 NULL);
581 if (unlikely(ret)) {
582 dprintk("%s bl_mark_sectors_init fail %d\n",
583 __func__, ret);
584 end_page_writeback(page);
585 page_cache_release(page);
586 wdata->pnfs_error = ret;
587 goto out;
588 }
589 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
590 isect, page, be,
591 bl_end_io_write_zero, par);
592 if (IS_ERR(bio)) {
593 wdata->pnfs_error = PTR_ERR(bio);
594 goto out;
595 }
596 /* FIXME: This should be done in bi_end_io */
597 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
598 page->index << PAGE_CACHE_SHIFT,
599 PAGE_CACHE_SIZE);
600next_page:
601 isect += PAGE_CACHE_SECTORS;
602 extent_length -= PAGE_CACHE_SECTORS;
603 }
604 if (last)
605 goto write_done;
606 }
607 bio = bl_submit_bio(WRITE, bio);
608
609 /* Middle pages */
610 pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
611 for (i = pg_index; i < wdata->npages; i++) {
612 if (!extent_length) {
613 /* We've used up the previous extent */
614 bl_put_extent(be);
615 bio = bl_submit_bio(WRITE, bio);
616 /* Get the next one */
617 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
618 isect, NULL);
619 if (!be || !is_writable(be, isect)) {
620 wdata->pnfs_error = -EINVAL;
621 goto out;
622 }
623 extent_length = be->be_length -
624 (isect - be->be_f_offset);
625 }
626 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
627 ret = bl_mark_sectors_init(be->be_inval, isect,
628 PAGE_CACHE_SECTORS,
629 NULL);
630 if (unlikely(ret)) {
631 dprintk("%s bl_mark_sectors_init fail %d\n",
632 __func__, ret);
633 wdata->pnfs_error = ret;
634 goto out;
635 }
636 }
637 bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
638 isect, pages[i], be,
639 bl_end_io_write, par);
640 if (IS_ERR(bio)) {
641 wdata->pnfs_error = PTR_ERR(bio);
642 goto out;
643 }
644 isect += PAGE_CACHE_SECTORS;
645 last_isect = isect;
646 extent_length -= PAGE_CACHE_SECTORS;
647 }
648
649 /* Last page inside INVALID extent */
650 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
651 bio = bl_submit_bio(WRITE, bio);
652 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
653 npg_zero = npg_per_block - do_div(temp, npg_per_block);
654 if (npg_zero < npg_per_block) {
655 last = 1;
656 goto fill_invalid_ext;
657 }
658 }
659
660write_done:
661 wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
662 if (count < wdata->res.count) {
663 wdata->res.count = count;
664 }
665out:
666 bl_put_extent(be);
667 bl_submit_bio(WRITE, bio);
668 put_parallel(par);
669 return PNFS_ATTEMPTED;
670}
671
672/* FIXME - range ignored */
673static void
674release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
675{
676 int i;
677 struct pnfs_block_extent *be;
678
679 spin_lock(&bl->bl_ext_lock);
680 for (i = 0; i < EXTENT_LISTS; i++) {
681 while (!list_empty(&bl->bl_extents[i])) {
682 be = list_first_entry(&bl->bl_extents[i],
683 struct pnfs_block_extent,
684 be_node);
685 list_del(&be->be_node);
686 bl_put_extent(be);
687 }
688 }
689 spin_unlock(&bl->bl_ext_lock);
690}
691
692static void
693release_inval_marks(struct pnfs_inval_markings *marks)
694{
695 struct pnfs_inval_tracking *pos, *temp;
696
697 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
698 list_del(&pos->it_link);
699 kfree(pos);
700 }
701 return;
702}
703
704static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
705{
706 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
707
708 dprintk("%s enter\n", __func__);
709 release_extents(bl, NULL);
710 release_inval_marks(&bl->bl_inval);
711 kfree(bl);
712}
713
714static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
715 gfp_t gfp_flags)
716{
717 struct pnfs_block_layout *bl;
718
719 dprintk("%s enter\n", __func__);
720 bl = kzalloc(sizeof(*bl), gfp_flags);
721 if (!bl)
722 return NULL;
723 spin_lock_init(&bl->bl_ext_lock);
724 INIT_LIST_HEAD(&bl->bl_extents[0]);
725 INIT_LIST_HEAD(&bl->bl_extents[1]);
726 INIT_LIST_HEAD(&bl->bl_commit);
727 INIT_LIST_HEAD(&bl->bl_committing);
728 bl->bl_count = 0;
729 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
730 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
731 return &bl->bl_layout;
732}
733
/* Layout driver hook: free a layout segment allocated by bl_alloc_lseg(). */
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);

	kfree(lseg);
}
739
740/* We pretty much ignore lseg, and store all data layout wide, so we
741 * can correctly merge.
742 */
743static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
744 struct nfs4_layoutget_res *lgr,
745 gfp_t gfp_flags)
746{
747 struct pnfs_layout_segment *lseg;
748 int status;
749
750 dprintk("%s enter\n", __func__);
751 lseg = kzalloc(sizeof(*lseg), gfp_flags);
752 if (!lseg)
753 return ERR_PTR(-ENOMEM);
754 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
755 if (status) {
756 /* We don't want to call the full-blown bl_free_lseg,
757 * since on error extents were not touched.
758 */
759 kfree(lseg);
760 return ERR_PTR(status);
761 }
762 return lseg;
763}
764
/*
 * Layout driver hook: encode the block layout update for LAYOUTCOMMIT by
 * delegating to encode_pnfs_block_layoutupdate().
 */
static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *blk_lo = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(blk_lo, xdr, arg);
}
772
773static void
774bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
775{
776 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
777
778 dprintk("%s enter\n", __func__);
779 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
780}
781
782static void free_blk_mountid(struct block_mount_id *mid)
783{
784 if (mid) {
785 struct pnfs_block_dev *dev;
786 spin_lock(&mid->bm_lock);
787 while (!list_empty(&mid->bm_devlist)) {
788 dev = list_first_entry(&mid->bm_devlist,
789 struct pnfs_block_dev,
790 bm_node);
791 list_del(&dev->bm_node);
792 bl_free_block_dev(dev);
793 }
794 spin_unlock(&mid->bm_lock);
795 kfree(mid);
796 }
797}
798
799/* This is mostly copied from the filelayout's get_device_info function.
800 * It seems much of this should be at the generic pnfs level.
801 */
802static struct pnfs_block_dev *
803nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
804 struct nfs4_deviceid *d_id)
805{
806 struct pnfs_device *dev;
807 struct pnfs_block_dev *rv = NULL;
808 u32 max_resp_sz;
809 int max_pages;
810 struct page **pages = NULL;
811 int i, rc;
812
813 /*
814 * Use the session max response size as the basis for setting
815 * GETDEVICEINFO's maxcount
816 */
817 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
818 max_pages = max_resp_sz >> PAGE_SHIFT;
819 dprintk("%s max_resp_sz %u max_pages %d\n",
820 __func__, max_resp_sz, max_pages);
821
822 dev = kmalloc(sizeof(*dev), GFP_NOFS);
823 if (!dev) {
824 dprintk("%s kmalloc failed\n", __func__);
825 return NULL;
826 }
827
828 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
829 if (pages == NULL) {
830 kfree(dev);
831 return NULL;
832 }
833 for (i = 0; i < max_pages; i++) {
834 pages[i] = alloc_page(GFP_NOFS);
835 if (!pages[i])
836 goto out_free;
837 }
838
839 memcpy(&dev->dev_id, d_id, sizeof(*d_id));
840 dev->layout_type = LAYOUT_BLOCK_VOLUME;
841 dev->pages = pages;
842 dev->pgbase = 0;
843 dev->pglen = PAGE_SIZE * max_pages;
844 dev->mincount = 0;
845
846 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
847 rc = nfs4_proc_getdeviceinfo(server, dev);
848 dprintk("%s getdevice info returns %d\n", __func__, rc);
849 if (rc)
850 goto out_free;
851
852 rv = nfs4_blk_decode_device(server, dev);
853 out_free:
854 for (i = 0; i < max_pages; i++)
855 __free_page(pages[i]);
856 kfree(pages);
857 kfree(dev);
858 return rv;
859}
860
861static int
862bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
863{
864 struct block_mount_id *b_mt_id = NULL;
865 struct pnfs_devicelist *dlist = NULL;
866 struct pnfs_block_dev *bdev;
867 LIST_HEAD(block_disklist);
868 int status = 0, i;
869
870 dprintk("%s enter\n", __func__);
871
872 if (server->pnfs_blksize == 0) {
873 dprintk("%s Server did not return blksize\n", __func__);
874 return -EINVAL;
875 }
876 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
877 if (!b_mt_id) {
878 status = -ENOMEM;
879 goto out_error;
880 }
881 /* Initialize nfs4 block layout mount id */
882 spin_lock_init(&b_mt_id->bm_lock);
883 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
884
885 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
886 if (!dlist) {
887 status = -ENOMEM;
888 goto out_error;
889 }
890 dlist->eof = 0;
891 while (!dlist->eof) {
892 status = nfs4_proc_getdevicelist(server, fh, dlist);
893 if (status)
894 goto out_error;
895 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
896 __func__, dlist->num_devs, dlist->eof);
897 for (i = 0; i < dlist->num_devs; i++) {
898 bdev = nfs4_blk_get_deviceinfo(server, fh,
899 &dlist->dev_id[i]);
900 if (!bdev) {
901 status = -ENODEV;
902 goto out_error;
903 }
904 spin_lock(&b_mt_id->bm_lock);
905 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
906 spin_unlock(&b_mt_id->bm_lock);
907 }
908 }
909 dprintk("%s SUCCESS\n", __func__);
910 server->pnfs_ld_data = b_mt_id;
911
912 out_return:
913 kfree(dlist);
914 return status;
915
916 out_error:
917 free_blk_mountid(b_mt_id);
918 goto out_return;
919}
920
921static int
922bl_clear_layoutdriver(struct nfs_server *server)
923{
924 struct block_mount_id *b_mt_id = server->pnfs_ld_data;
925
926 dprintk("%s enter\n", __func__);
927 free_blk_mountid(b_mt_id);
928 dprintk("%s RETURNS\n", __func__);
929 return 0;
930}
931
932static const struct nfs_pageio_ops bl_pg_read_ops = {
933 .pg_init = pnfs_generic_pg_init_read,
934 .pg_test = pnfs_generic_pg_test,
935 .pg_doio = pnfs_generic_pg_readpages,
936};
937
938static const struct nfs_pageio_ops bl_pg_write_ops = {
939 .pg_init = pnfs_generic_pg_init_write,
940 .pg_test = pnfs_generic_pg_test,
941 .pg_doio = pnfs_generic_pg_writepages,
942};
943
944static struct pnfs_layoutdriver_type blocklayout_type = {
945 .id = LAYOUT_BLOCK_VOLUME,
946 .name = "LAYOUT_BLOCK_VOLUME",
947 .read_pagelist = bl_read_pagelist,
948 .write_pagelist = bl_write_pagelist,
949 .alloc_layout_hdr = bl_alloc_layout_hdr,
950 .free_layout_hdr = bl_free_layout_hdr,
951 .alloc_lseg = bl_alloc_lseg,
952 .free_lseg = bl_free_lseg,
953 .encode_layoutcommit = bl_encode_layoutcommit,
954 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
955 .set_layoutdriver = bl_set_layoutdriver,
956 .clear_layoutdriver = bl_clear_layoutdriver,
957 .pg_read_ops = &bl_pg_read_ops,
958 .pg_write_ops = &bl_pg_write_ops,
959};
960
961static const struct rpc_pipe_ops bl_upcall_ops = {
962 .upcall = bl_pipe_upcall,
963 .downcall = bl_pipe_downcall,
964 .destroy_msg = bl_pipe_destroy_msg,
965};
966
967static int __init nfs4blocklayout_init(void)
968{
969 struct vfsmount *mnt;
970 struct path path;
971 int ret;
972
973 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
974
975 ret = pnfs_register_layoutdriver(&blocklayout_type);
976 if (ret)
977 goto out;
978
979 init_waitqueue_head(&bl_wq);
980
981 mnt = rpc_get_mount();
982 if (IS_ERR(mnt)) {
983 ret = PTR_ERR(mnt);
984 goto out_remove;
985 }
986
987 ret = vfs_path_lookup(mnt->mnt_root,
988 mnt,
989 NFS_PIPE_DIRNAME, 0, &path);
990 if (ret)
991 goto out_remove;
992
993 bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
994 &bl_upcall_ops, 0);
995 if (IS_ERR(bl_device_pipe)) {
996 ret = PTR_ERR(bl_device_pipe);
997 goto out_remove;
998 }
999out:
1000 return ret;
1001
1002out_remove:
1003 pnfs_unregister_layoutdriver(&blocklayout_type);
1004 return ret;
1005}
1006
1007static void __exit nfs4blocklayout_exit(void)
1008{
1009 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1010 __func__);
1011
1012 pnfs_unregister_layoutdriver(&blocklayout_type);
1013 rpc_unlink(bl_device_pipe);
1014}
1015
1016MODULE_ALIAS("nfs-layouttype4-3");
1017
1018module_init(nfs4blocklayout_init);
1019module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 000000000000..f27d827960a3
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,207 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.h
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
33#define FS_NFS_NFS4BLOCKLAYOUT_H
34
35#include <linux/device-mapper.h>
36#include <linux/nfs_fs.h>
37#include <linux/sunrpc/rpc_pipe_fs.h>
38
39#include "../pnfs.h"
40
/* Sectors per page-cache page, and the shift converting pages to sectors. */
#define PAGE_CACHE_SECTORS	(PAGE_CACHE_SIZE >> SECTOR_SHIFT)
#define PAGE_CACHE_SECTOR_SHIFT	(PAGE_CACHE_SHIFT - SECTOR_SHIFT)
43
44struct block_mount_id {
45 spinlock_t bm_lock; /* protects list */
46 struct list_head bm_devlist; /* holds pnfs_block_dev */
47};
48
49struct pnfs_block_dev {
50 struct list_head bm_node;
51 struct nfs4_deviceid bm_mdevid; /* associated devid */
52 struct block_device *bm_mdev; /* meta device itself */
53};
54
/* Extent states; the values are decoded directly from the layout XDR. */
enum exstate4 {
	PNFS_BLOCK_READWRITE_DATA	= 0,
	PNFS_BLOCK_READ_DATA		= 1,
	PNFS_BLOCK_INVALID_DATA		= 2,	/* mapped, but data is invalid */
	PNFS_BLOCK_NONE_DATA		= 3	/* unmapped, it's a hole */
};
61
/* Upper bound on tag bit numbers: every tag bit used must be below this. */
#define MY_MAX_TAGS (15)
63
64struct my_tree {
65 sector_t mtt_step_size; /* Internal sector alignment */
66 struct list_head mtt_stub; /* Should be a radix tree */
67};
68
69struct pnfs_inval_markings {
70 spinlock_t im_lock;
71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 sector_t im_block_size; /* Server blocksize in sectors */
73};
74
75struct pnfs_inval_tracking {
76 struct list_head it_link;
77 int it_sector;
78 int it_tags;
79};
80
81/* sector_t fields are all in 512-byte sectors */
82struct pnfs_block_extent {
83 struct kref be_refcnt;
84 struct list_head be_node; /* link into lseg list */
85 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
86 struct block_device *be_mdev;
87 sector_t be_f_offset; /* the starting offset in the file */
88 sector_t be_length; /* the size of the extent */
89 sector_t be_v_offset; /* the starting offset in the volume */
90 enum exstate4 be_state; /* the state of this extent */
91 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
92};
93
94/* Shortened extent used by LAYOUTCOMMIT */
95struct pnfs_block_short_extent {
96 struct list_head bse_node;
97 struct nfs4_deviceid bse_devid;
98 struct block_device *bse_mdev;
99 sector_t bse_f_offset; /* the starting offset in the file */
100 sector_t bse_length; /* the size of the extent */
101};
102
103static inline void
104BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
105{
106 spin_lock_init(&marks->im_lock);
107 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
108 marks->im_block_size = blocksize;
109 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 blocksize);
111}
112
/* Index into pnfs_block_layout.bl_extents[]; see bl_choose_list(). */
enum extentclass4 {
	RW_EXTENT	= 0,	/* READWRITE and INVAL */
	RO_EXTENT	= 1,	/* READ and NONE */
	EXTENT_LISTS	= 2,	/* number of lists */
};
118
119static inline int bl_choose_list(enum exstate4 state)
120{
121 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
122 return RO_EXTENT;
123 else
124 return RW_EXTENT;
125}
126
127struct pnfs_block_layout {
128 struct pnfs_layout_hdr bl_layout;
129 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
130 spinlock_t bl_ext_lock; /* Protects list manipulation */
131 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
132 struct list_head bl_commit; /* Needs layout commit */
133 struct list_head bl_committing; /* Layout committing */
134 unsigned int bl_count; /* entries in bl_commit */
135 sector_t bl_blocksize; /* Server blocksize in sectors */
136};
137
/*
 * Per-server block layout data (struct block_mount_id) for the server that
 * owns layout header @lo.  The argument is parenthesized so that any
 * pointer expression can be passed safely.
 */
#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER((lo)->plh_inode)->pnfs_ld_data))
139
140static inline struct pnfs_block_layout *
141BLK_LO2EXT(struct pnfs_layout_hdr *lo)
142{
143 return container_of(lo, struct pnfs_block_layout, bl_layout);
144}
145
146static inline struct pnfs_block_layout *
147BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
148{
149 return BLK_LO2EXT(lseg->pls_layout);
150}
151
/*
 * Device message exchanged over the blocklayout pipe: written by the
 * daemon as the reply to a mount request and sent by the kernel as the
 * payload of an unmount request.
 */
struct bl_dev_msg {
	int		status;	/* one of BL_DEVICE_REQUEST_* */
	uint32_t	major;	/* device major number */
	uint32_t	minor;	/* device minor number */
};
156
157struct bl_msg_hdr {
158 u8 type;
159 u16 totallen; /* length of entire message, including hdr itself */
160};
161
162extern struct dentry *bl_device_pipe;
163extern wait_queue_head_t bl_wq;
164
/* Message types carried in bl_msg_hdr.type */
#define BL_DEVICE_UMOUNT	0x0	/* Umount--delete devices */
#define BL_DEVICE_MOUNT		0x1	/* Mount--create devices */

/* Status values carried in bl_dev_msg.status */
#define BL_DEVICE_REQUEST_INIT	0x0	/* Start request */
#define BL_DEVICE_REQUEST_PROC	0x1	/* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR	0x2	/* User level process fails */
170
171/* blocklayoutdev.c */
172ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
173 char __user *, size_t);
174ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
175void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
176struct block_device *nfs4_blkdev_get(dev_t dev);
177int nfs4_blkdev_put(struct block_device *bdev);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
179 struct pnfs_device *dev);
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
182
183/* blocklayoutdm.c */
184void bl_free_block_dev(struct pnfs_block_dev *bdev);
185
186/* extents.c */
187struct pnfs_block_extent *
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
189 struct pnfs_block_extent **cow_read);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
191 sector_t offset, sector_t length,
192 sector_t **pages);
193void bl_put_extent(struct pnfs_block_extent *be);
194struct pnfs_block_extent *bl_alloc_extent(void);
195int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
196int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
197 struct xdr_stream *xdr,
198 const struct nfs4_layoutcommit_args *arg);
199void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
200 const struct nfs4_layoutcommit_args *arg,
201 int status);
202int bl_add_merge_extent(struct pnfs_block_layout *bl,
203 struct pnfs_block_extent *new);
204int bl_mark_for_commit(struct pnfs_block_extent *be,
205 sector_t offset, sector_t length);
206
207#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644
index 000000000000..a83b393fb01c
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -0,0 +1,410 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "%s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/* Open a block_device by device number. */
57struct block_device *nfs4_blkdev_get(dev_t dev)
58{
59 struct block_device *bd;
60
61 dprintk("%s enter\n", __func__);
62 bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
63 if (IS_ERR(bd))
64 goto fail;
65 return bd;
66fail:
67 dprintk("%s failed to open device : %ld\n",
68 __func__, PTR_ERR(bd));
69 return NULL;
70}
71
72/*
73 * Release the block device
74 */
75int nfs4_blkdev_put(struct block_device *bdev)
76{
77 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
78 MINOR(bdev->bd_dev));
79 return blkdev_put(bdev, FMODE_READ);
80}
81
82/*
83 * Shouldn't there be a rpc_generic_upcall() to do this for us?
84 */
85ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
86 char __user *dst, size_t buflen)
87{
88 char *data = (char *)msg->data + msg->copied;
89 size_t mlen = min(msg->len - msg->copied, buflen);
90 unsigned long left;
91
92 left = copy_to_user(dst, data, mlen);
93 if (left == mlen) {
94 msg->errno = -EFAULT;
95 return -EFAULT;
96 }
97
98 mlen -= left;
99 msg->copied += mlen;
100 msg->errno = 0;
101 return mlen;
102}
103
104static struct bl_dev_msg bl_mount_reply;
105
106ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
107 size_t mlen)
108{
109 if (mlen != sizeof (struct bl_dev_msg))
110 return -EINVAL;
111
112 if (copy_from_user(&bl_mount_reply, src, mlen) != 0)
113 return -EFAULT;
114
115 wake_up(&bl_wq);
116
117 return mlen;
118}
119
120void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
121{
122 if (msg->errno >= 0)
123 return;
124 wake_up(&bl_wq);
125}
126
127/*
128 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
129 */
130struct pnfs_block_dev *
131nfs4_blk_decode_device(struct nfs_server *server,
132 struct pnfs_device *dev)
133{
134 struct pnfs_block_dev *rv = NULL;
135 struct block_device *bd = NULL;
136 struct rpc_pipe_msg msg;
137 struct bl_msg_hdr bl_msg = {
138 .type = BL_DEVICE_MOUNT,
139 .totallen = dev->mincount,
140 };
141 uint8_t *dataptr;
142 DECLARE_WAITQUEUE(wq, current);
143 struct bl_dev_msg *reply = &bl_mount_reply;
144 int offset, len, i;
145
146 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
147 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
148 dev->mincount);
149
150 memset(&msg, 0, sizeof(msg));
151 msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
152 if (!msg.data) {
153 rv = ERR_PTR(-ENOMEM);
154 goto out;
155 }
156
157 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
158 dataptr = (uint8_t *) msg.data;
159 len = dev->mincount;
160 offset = sizeof(bl_msg);
161 for (i = 0; len > 0; i++) {
162 memcpy(&dataptr[offset], page_address(dev->pages[i]),
163 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
164 len -= PAGE_CACHE_SIZE;
165 offset += PAGE_CACHE_SIZE;
166 }
167 msg.len = sizeof(bl_msg) + dev->mincount;
168
169 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
170 add_wait_queue(&bl_wq, &wq);
171 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
172 remove_wait_queue(&bl_wq, &wq);
173 goto out;
174 }
175
176 set_current_state(TASK_UNINTERRUPTIBLE);
177 schedule();
178 __set_current_state(TASK_RUNNING);
179 remove_wait_queue(&bl_wq, &wq);
180
181 if (reply->status != BL_DEVICE_REQUEST_PROC) {
182 dprintk("%s failed to open device: %d\n",
183 __func__, reply->status);
184 rv = ERR_PTR(-EINVAL);
185 goto out;
186 }
187
188 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
189 if (IS_ERR(bd)) {
190 dprintk("%s failed to open device : %ld\n",
191 __func__, PTR_ERR(bd));
192 goto out;
193 }
194
195 rv = kzalloc(sizeof(*rv), GFP_NOFS);
196 if (!rv) {
197 rv = ERR_PTR(-ENOMEM);
198 goto out;
199 }
200
201 rv->bm_mdev = bd;
202 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
203 dprintk("%s Created device %s with bd_block_size %u\n",
204 __func__,
205 bd->bd_disk->disk_name,
206 bd->bd_block_size);
207
208out:
209 kfree(msg.data);
210 return rv;
211}
212
213/* Map deviceid returned by the server to constructed block_device */
214static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
215 struct nfs4_deviceid *id)
216{
217 struct block_device *rv = NULL;
218 struct block_mount_id *mid;
219 struct pnfs_block_dev *dev;
220
221 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
222 mid = BLK_ID(lo);
223 spin_lock(&mid->bm_lock);
224 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
225 if (memcmp(id->data, dev->bm_mdevid.data,
226 NFS4_DEVICEID4_SIZE) == 0) {
227 rv = dev->bm_mdev;
228 goto out;
229 }
230 }
231 out:
232 spin_unlock(&mid->bm_lock);
233 dprintk("%s returning %p\n", __func__, rv);
234 return rv;
235}
236
237/* Tracks info needed to ensure extents in layout obey constraints of spec */
238struct layout_verification {
239 u32 mode; /* R or RW */
240 u64 start; /* Expected start of next non-COW extent */
241 u64 inval; /* Start of INVAL coverage */
242 u64 cowread; /* End of COW read coverage */
243};
244
245/* Verify the extent meets the layout requirements of the pnfs-block draft,
246 * section 2.3.1.
247 */
248static int verify_extent(struct pnfs_block_extent *be,
249 struct layout_verification *lv)
250{
251 if (lv->mode == IOMODE_READ) {
252 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
253 be->be_state == PNFS_BLOCK_INVALID_DATA)
254 return -EIO;
255 if (be->be_f_offset != lv->start)
256 return -EIO;
257 lv->start += be->be_length;
258 return 0;
259 }
260 /* lv->mode == IOMODE_RW */
261 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
262 if (be->be_f_offset != lv->start)
263 return -EIO;
264 if (lv->cowread > lv->start)
265 return -EIO;
266 lv->start += be->be_length;
267 lv->inval = lv->start;
268 return 0;
269 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
270 if (be->be_f_offset != lv->start)
271 return -EIO;
272 lv->start += be->be_length;
273 return 0;
274 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
275 if (be->be_f_offset > lv->start)
276 return -EIO;
277 if (be->be_f_offset < lv->inval)
278 return -EIO;
279 if (be->be_f_offset < lv->cowread)
280 return -EIO;
281 /* It looks like you might want to min this with lv->start,
282 * but you really don't.
283 */
284 lv->inval = lv->inval + be->be_length;
285 lv->cowread = be->be_f_offset + be->be_length;
286 return 0;
287 } else
288 return -EIO;
289}
290
291/* XDR decode pnfs_block_layout4 structure */
292int
293nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
294 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
295{
296 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
297 int i, status = -EIO;
298 uint32_t count;
299 struct pnfs_block_extent *be = NULL, *save;
300 struct xdr_stream stream;
301 struct xdr_buf buf;
302 struct page *scratch;
303 __be32 *p;
304 struct layout_verification lv = {
305 .mode = lgr->range.iomode,
306 .start = lgr->range.offset >> SECTOR_SHIFT,
307 .inval = lgr->range.offset >> SECTOR_SHIFT,
308 .cowread = lgr->range.offset >> SECTOR_SHIFT,
309 };
310 LIST_HEAD(extents);
311
312 dprintk("---> %s\n", __func__);
313
314 scratch = alloc_page(gfp_flags);
315 if (!scratch)
316 return -ENOMEM;
317
318 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
319 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
320
321 p = xdr_inline_decode(&stream, 4);
322 if (unlikely(!p))
323 goto out_err;
324
325 count = be32_to_cpup(p++);
326
327 dprintk("%s enter, number of extents %i\n", __func__, count);
328 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
329 if (unlikely(!p))
330 goto out_err;
331
332 /* Decode individual extents, putting them in temporary
333 * staging area until whole layout is decoded to make error
334 * recovery easier.
335 */
336 for (i = 0; i < count; i++) {
337 be = bl_alloc_extent();
338 if (!be) {
339 status = -ENOMEM;
340 goto out_err;
341 }
342 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
343 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
344 be->be_mdev = translate_devid(lo, &be->be_devid);
345 if (!be->be_mdev)
346 goto out_err;
347
348 /* The next three values are read in as bytes,
349 * but stored as 512-byte sector lengths
350 */
351 if (decode_sector_number(&p, &be->be_f_offset) < 0)
352 goto out_err;
353 if (decode_sector_number(&p, &be->be_length) < 0)
354 goto out_err;
355 if (decode_sector_number(&p, &be->be_v_offset) < 0)
356 goto out_err;
357 be->be_state = be32_to_cpup(p++);
358 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
359 be->be_inval = &bl->bl_inval;
360 if (verify_extent(be, &lv)) {
361 dprintk("%s verify failed\n", __func__);
362 goto out_err;
363 }
364 list_add_tail(&be->be_node, &extents);
365 }
366 if (lgr->range.offset + lgr->range.length !=
367 lv.start << SECTOR_SHIFT) {
368 dprintk("%s Final length mismatch\n", __func__);
369 be = NULL;
370 goto out_err;
371 }
372 if (lv.start < lv.cowread) {
373 dprintk("%s Final uncovered COW extent\n", __func__);
374 be = NULL;
375 goto out_err;
376 }
377 /* Extents decoded properly, now try to merge them in to
378 * existing layout extents.
379 */
380 spin_lock(&bl->bl_ext_lock);
381 list_for_each_entry_safe(be, save, &extents, be_node) {
382 list_del(&be->be_node);
383 status = bl_add_merge_extent(bl, be);
384 if (status) {
385 spin_unlock(&bl->bl_ext_lock);
386 /* This is a fairly catastrophic error, as the
387 * entire layout extent lists are now corrupted.
388 * We should have some way to distinguish this.
389 */
390 be = NULL;
391 goto out_err;
392 }
393 }
394 spin_unlock(&bl->bl_ext_lock);
395 status = 0;
396 out:
397 __free_page(scratch);
398 dprintk("%s returns %i\n", __func__, status);
399 return status;
400
401 out_err:
402 bl_put_extent(be);
403 while (!list_empty(&extents)) {
404 be = list_first_entry(&extents, struct pnfs_block_extent,
405 be_node);
406 list_del(&be->be_node);
407 bl_put_extent(be);
408 }
409 goto out;
410}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644
index 000000000000..d055c7558073
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -0,0 +1,111 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(dev_t dev)
42{
43 struct rpc_pipe_msg msg;
44 struct bl_dev_msg bl_umount_request;
45 struct bl_msg_hdr bl_msg = {
46 .type = BL_DEVICE_UMOUNT,
47 .totallen = sizeof(bl_umount_request),
48 };
49 uint8_t *dataptr;
50 DECLARE_WAITQUEUE(wq, current);
51
52 dprintk("Entering %s\n", __func__);
53
54 memset(&msg, 0, sizeof(msg));
55 msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
56 if (!msg.data)
57 goto out;
58
59 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
60 bl_umount_request.major = MAJOR(dev);
61 bl_umount_request.minor = MINOR(dev);
62
63 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
64 dataptr = (uint8_t *) msg.data;
65 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
66 msg.len = sizeof(bl_msg) + bl_msg.totallen;
67
68 add_wait_queue(&bl_wq, &wq);
69 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
70 remove_wait_queue(&bl_wq, &wq);
71 goto out;
72 }
73
74 set_current_state(TASK_UNINTERRUPTIBLE);
75 schedule();
76 __set_current_state(TASK_RUNNING);
77 remove_wait_queue(&bl_wq, &wq);
78
79out:
80 kfree(msg.data);
81}
82
83/*
84 * Release meta device
85 */
86static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
87{
88 int rv;
89
90 dprintk("%s Releasing\n", __func__);
91 rv = nfs4_blkdev_put(bdev->bm_mdev);
92 if (rv)
93 printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
94 __func__, rv);
95
96 dev_remove(bdev->bm_mdev->bd_dev);
97}
98
99void bl_free_block_dev(struct pnfs_block_dev *bdev)
100{
101 if (bdev) {
102 if (bdev->bm_mdev) {
103 dprintk("%s Removing DM device: %d:%d\n",
104 __func__,
105 MAJOR(bdev->bm_mdev->bd_dev),
106 MINOR(bdev->bm_mdev->bd_dev));
107 nfs4_blk_metadev_release(bdev);
108 }
109 kfree(bdev);
110 }
111}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 000000000000..19fa7b0b8c00
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,935 @@
1/*
2 * linux/fs/nfs/blocklayout/extents.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
/* Tag bit numbers stored in pnfs_inval_tracking.it_tags */
#define EXTENT_INITIALIZED	0
#define EXTENT_WRITTEN		1
#define EXTENT_IN_COMMIT	2
/* Bookkeeping bit set by _preload_range(); hidden from tag lookups. */
#define INTERNAL_EXISTS		MY_MAX_TAGS
/* All bits below INTERNAL_EXISTS, i.e. the externally visible tags. */
#define INTERNAL_MASK		((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - do_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
55/* Complete stub implementation using a list until the wanted API is determined */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 if (storage)
114 new = storage;
115 else {
116 new = kmalloc(sizeof(*new), GFP_NOFS);
117 if (!new)
118 return -ENOMEM;
119 }
120 new->it_sector = s;
121 new->it_tags = (1 << tag);
122 list_add(&new->it_link, &pos->it_link);
123 return 1;
124 }
125}
126
127/* XXXX Really want option to not create */
128/* Over range, unions tag with existing entries, else creates entry with tag */
129static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
130{
131 u64 i;
132
133 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
134 for (i = normalize(s, tree->mtt_step_size); i < s + length;
135 i += tree->mtt_step_size)
136 if (_add_entry(tree, i, tag, NULL))
137 return -ENOMEM;
138 return 0;
139}
140
141/* Ensure that future operations on given range of tree will not malloc */
142static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
143{
144 u64 start, end, s;
145 int count, i, used = 0, status = -ENOMEM;
146 struct pnfs_inval_tracking **storage;
147
148 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
149 start = normalize(offset, tree->mtt_step_size);
150 end = normalize_up(offset + length, tree->mtt_step_size);
151 count = (int)(end - start) / (int)tree->mtt_step_size;
152
153 /* Pre-malloc what memory we might need */
154 storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
155 if (!storage)
156 return -ENOMEM;
157 for (i = 0; i < count; i++) {
158 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
159 GFP_NOFS);
160 if (!storage[i])
161 goto out_cleanup;
162 }
163
164 /* Now need lock - HOW??? */
165
166 for (s = start; s < end; s += tree->mtt_step_size)
167 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
168
169 /* Unlock - HOW??? */
170 status = 0;
171
172 out_cleanup:
173 for (i = used; i < count; i++) {
174 if (!storage[i])
175 break;
176 kfree(storage[i]);
177 }
178 kfree(storage);
179 return status;
180}
181
182static void set_needs_init(sector_t *array, sector_t offset)
183{
184 sector_t *p = array;
185
186 dprintk("%s enter\n", __func__);
187 if (!p)
188 return;
189 while (*p < offset)
190 p++;
191 if (*p == offset)
192 return;
193 else if (*p == ~0) {
194 *p++ = offset;
195 *p = ~0;
196 return;
197 } else {
198 sector_t *save = p;
199 dprintk("%s Adding %llu\n", __func__, (u64)offset);
200 while (*p != ~0)
201 p++;
202 p++;
203 memmove(save + 1, save, (char *)p - (char *)save);
204 *save = offset;
205 return;
206 }
207}
208
209/* We are relying on page lock to serialize this */
210int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
211{
212 int rv;
213
214 spin_lock(&marks->im_lock);
215 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
216 spin_unlock(&marks->im_lock);
217 return rv;
218}
219
220/* Assume start, end already sector aligned */
221static int
222_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
223{
224 struct pnfs_inval_tracking *pos;
225 u64 expect = 0;
226
227 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
228 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
229 if (pos->it_sector >= end)
230 continue;
231 if (!expect) {
232 if ((pos->it_sector == end - tree->mtt_step_size) &&
233 (pos->it_tags & (1 << tag))) {
234 expect = pos->it_sector - tree->mtt_step_size;
235 if (pos->it_sector < tree->mtt_step_size || expect < start)
236 return 1;
237 continue;
238 } else {
239 return 0;
240 }
241 }
242 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
243 return 0;
244 expect -= tree->mtt_step_size;
245 if (expect < start)
246 return 1;
247 }
248 return 0;
249}
250
251static int is_range_written(struct pnfs_inval_markings *marks,
252 sector_t start, sector_t end)
253{
254 int rv;
255
256 spin_lock(&marks->im_lock);
257 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
258 spin_unlock(&marks->im_lock);
259 return rv;
260}
261
262/* Marks sectors in [offest, offset_length) as having been initialized.
263 * All lengths are step-aligned, where step is min(pagesize, blocksize).
264 * Notes where partial block is initialized, and helps prepare it for
265 * complete initialization later.
266 */
267/* Currently assumes offset is page-aligned */
268int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
269 sector_t offset, sector_t length,
270 sector_t **pages)
271{
272 sector_t s, start, end;
273 sector_t *array = NULL; /* Pages to mark */
274
275 dprintk("%s(offset=%llu,len=%llu) enter\n",
276 __func__, (u64)offset, (u64)length);
277 s = max((sector_t) 3,
278 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
279 dprintk("%s set max=%llu\n", __func__, (u64)s);
280 if (pages) {
281 array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
282 if (!array)
283 goto outerr;
284 array[0] = ~0;
285 }
286
287 start = normalize(offset, marks->im_block_size);
288 end = normalize_up(offset + length, marks->im_block_size);
289 if (_preload_range(&marks->im_tree, start, end - start))
290 goto outerr;
291
292 spin_lock(&marks->im_lock);
293
294 for (s = normalize_up(start, PAGE_CACHE_SECTORS);
295 s < offset; s += PAGE_CACHE_SECTORS) {
296 dprintk("%s pre-area pages\n", __func__);
297 /* Portion of used block is not initialized */
298 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
299 set_needs_init(array, s);
300 }
301 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
302 goto out_unlock;
303 for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
304 s < end; s += PAGE_CACHE_SECTORS) {
305 dprintk("%s post-area pages\n", __func__);
306 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
307 set_needs_init(array, s);
308 }
309
310 spin_unlock(&marks->im_lock);
311
312 if (pages) {
313 if (array[0] == ~0) {
314 kfree(array);
315 *pages = NULL;
316 } else
317 *pages = array;
318 }
319 return 0;
320
321 out_unlock:
322 spin_unlock(&marks->im_lock);
323 outerr:
324 if (pages) {
325 kfree(array);
326 *pages = NULL;
327 }
328 return -ENOMEM;
329}
330
331/* Marks sectors in [offest, offset+length) as having been written to disk.
332 * All lengths should be block aligned.
333 */
334static int mark_written_sectors(struct pnfs_inval_markings *marks,
335 sector_t offset, sector_t length)
336{
337 int status;
338
339 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
340 (u64)offset, (u64)length);
341 spin_lock(&marks->im_lock);
342 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
343 spin_unlock(&marks->im_lock);
344 return status;
345}
346
347static void print_short_extent(struct pnfs_block_short_extent *be)
348{
349 dprintk("PRINT SHORT EXTENT extent %p\n", be);
350 if (be) {
351 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
352 dprintk(" be_length %llu\n", (u64)be->bse_length);
353 }
354}
355
356static void print_clist(struct list_head *list, unsigned int count)
357{
358 struct pnfs_block_short_extent *be;
359 unsigned int i = 0;
360
361 ifdebug(FACILITY) {
362 printk(KERN_DEBUG "****************\n");
363 printk(KERN_DEBUG "Extent list looks like:\n");
364 list_for_each_entry(be, list, bse_node) {
365 i++;
366 print_short_extent(be);
367 }
368 if (i != count)
369 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
370 printk(KERN_DEBUG "****************\n");
371 }
372}
373
374/* Note: In theory, we should do more checking that devid's match between
375 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
376 */
377/* Note this is very similar to bl_add_merge_extent */
378static void add_to_commitlist(struct pnfs_block_layout *bl,
379 struct pnfs_block_short_extent *new)
380{
381 struct list_head *clist = &bl->bl_commit;
382 struct pnfs_block_short_extent *old, *save;
383 sector_t end = new->bse_f_offset + new->bse_length;
384
385 dprintk("%s enter\n", __func__);
386 print_short_extent(new);
387 print_clist(clist, bl->bl_count);
388 bl->bl_count++;
389 /* Scan for proper place to insert, extending new to the left
390 * as much as possible.
391 */
392 list_for_each_entry_safe(old, save, clist, bse_node) {
393 if (new->bse_f_offset < old->bse_f_offset)
394 break;
395 if (end <= old->bse_f_offset + old->bse_length) {
396 /* Range is already in list */
397 bl->bl_count--;
398 kfree(new);
399 return;
400 } else if (new->bse_f_offset <=
401 old->bse_f_offset + old->bse_length) {
402 /* new overlaps or abuts existing be */
403 if (new->bse_mdev == old->bse_mdev) {
404 /* extend new to fully replace old */
405 new->bse_length += new->bse_f_offset -
406 old->bse_f_offset;
407 new->bse_f_offset = old->bse_f_offset;
408 list_del(&old->bse_node);
409 bl->bl_count--;
410 kfree(old);
411 }
412 }
413 }
414 /* Note that if we never hit the above break, old will not point to a
415 * valid extent. However, in that case &old->bse_node==list.
416 */
417 list_add_tail(&new->bse_node, &old->bse_node);
418 /* Scan forward for overlaps. If we find any, extend new and
419 * remove the overlapped extent.
420 */
421 old = list_prepare_entry(new, clist, bse_node);
422 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
423 if (end < old->bse_f_offset)
424 break;
425 /* new overlaps or abuts old */
426 if (new->bse_mdev == old->bse_mdev) {
427 if (end < old->bse_f_offset + old->bse_length) {
428 /* extend new to fully cover old */
429 end = old->bse_f_offset + old->bse_length;
430 new->bse_length = end - new->bse_f_offset;
431 }
432 list_del(&old->bse_node);
433 bl->bl_count--;
434 kfree(old);
435 }
436 }
437 dprintk("%s: after merging\n", __func__);
438 print_clist(clist, bl->bl_count);
439}
440
441/* Note the range described by offset, length is guaranteed to be contained
442 * within be.
443 */
444int bl_mark_for_commit(struct pnfs_block_extent *be,
445 sector_t offset, sector_t length)
446{
447 sector_t new_end, end = offset + length;
448 struct pnfs_block_short_extent *new;
449 struct pnfs_block_layout *bl = container_of(be->be_inval,
450 struct pnfs_block_layout,
451 bl_inval);
452
453 new = kmalloc(sizeof(*new), GFP_NOFS);
454 if (!new)
455 return -ENOMEM;
456
457 mark_written_sectors(be->be_inval, offset, length);
458 /* We want to add the range to commit list, but it must be
459 * block-normalized, and verified that the normalized range has
460 * been entirely written to disk.
461 */
462 new->bse_f_offset = offset;
463 offset = normalize(offset, bl->bl_blocksize);
464 if (offset < new->bse_f_offset) {
465 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
466 new->bse_f_offset = offset;
467 else
468 new->bse_f_offset = offset + bl->bl_blocksize;
469 }
470 new_end = normalize_up(end, bl->bl_blocksize);
471 if (end < new_end) {
472 if (is_range_written(be->be_inval, end, new_end))
473 end = new_end;
474 else
475 end = new_end - bl->bl_blocksize;
476 }
477 if (end <= new->bse_f_offset) {
478 kfree(new);
479 return 0;
480 }
481 new->bse_length = end - new->bse_f_offset;
482 new->bse_devid = be->be_devid;
483 new->bse_mdev = be->be_mdev;
484
485 spin_lock(&bl->bl_ext_lock);
486 /* new will be freed, either by add_to_commitlist if it decides not
487 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
488 */
489 add_to_commitlist(bl, new);
490 spin_unlock(&bl->bl_ext_lock);
491 return 0;
492}
493
494static void print_bl_extent(struct pnfs_block_extent *be)
495{
496 dprintk("PRINT EXTENT extent %p\n", be);
497 if (be) {
498 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
499 dprintk(" be_length %llu\n", (u64)be->be_length);
500 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
501 dprintk(" be_state %d\n", be->be_state);
502 }
503}
504
505static void
506destroy_extent(struct kref *kref)
507{
508 struct pnfs_block_extent *be;
509
510 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
511 dprintk("%s be=%p\n", __func__, be);
512 kfree(be);
513}
514
515void
516bl_put_extent(struct pnfs_block_extent *be)
517{
518 if (be) {
519 dprintk("%s enter %p (%i)\n", __func__, be,
520 atomic_read(&be->be_refcnt.refcount));
521 kref_put(&be->be_refcnt, destroy_extent);
522 }
523}
524
525struct pnfs_block_extent *bl_alloc_extent(void)
526{
527 struct pnfs_block_extent *be;
528
529 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
530 if (!be)
531 return NULL;
532 INIT_LIST_HEAD(&be->be_node);
533 kref_init(&be->be_refcnt);
534 be->be_inval = NULL;
535 return be;
536}
537
538static void print_elist(struct list_head *list)
539{
540 struct pnfs_block_extent *be;
541 dprintk("****************\n");
542 dprintk("Extent list looks like:\n");
543 list_for_each_entry(be, list, be_node) {
544 print_bl_extent(be);
545 }
546 dprintk("****************\n");
547}
548
549static inline int
550extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
551{
552 /* Note this assumes new->be_f_offset >= old->be_f_offset */
553 return (new->be_state == old->be_state) &&
554 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
555 ((new->be_v_offset - old->be_v_offset ==
556 new->be_f_offset - old->be_f_offset) &&
557 new->be_mdev == old->be_mdev));
558}
559
560/* Adds new to appropriate list in bl, modifying new and removing existing
561 * extents as appropriate to deal with overlaps.
562 *
563 * See bl_find_get_extent for list constraints.
564 *
565 * Refcount on new is already set. If end up not using it, or error out,
566 * need to put the reference.
567 *
568 * bl->bl_ext_lock is held by caller.
569 */
570int
571bl_add_merge_extent(struct pnfs_block_layout *bl,
572 struct pnfs_block_extent *new)
573{
574 struct pnfs_block_extent *be, *tmp;
575 sector_t end = new->be_f_offset + new->be_length;
576 struct list_head *list;
577
578 dprintk("%s enter with be=%p\n", __func__, new);
579 print_bl_extent(new);
580 list = &bl->bl_extents[bl_choose_list(new->be_state)];
581 print_elist(list);
582
583 /* Scan for proper place to insert, extending new to the left
584 * as much as possible.
585 */
586 list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
587 if (new->be_f_offset >= be->be_f_offset + be->be_length)
588 break;
589 if (new->be_f_offset >= be->be_f_offset) {
590 if (end <= be->be_f_offset + be->be_length) {
591 /* new is a subset of existing be*/
592 if (extents_consistent(be, new)) {
593 dprintk("%s: new is subset, ignoring\n",
594 __func__);
595 bl_put_extent(new);
596 return 0;
597 } else {
598 goto out_err;
599 }
600 } else {
601 /* |<-- be -->|
602 * |<-- new -->| */
603 if (extents_consistent(be, new)) {
604 /* extend new to fully replace be */
605 new->be_length += new->be_f_offset -
606 be->be_f_offset;
607 new->be_f_offset = be->be_f_offset;
608 new->be_v_offset = be->be_v_offset;
609 dprintk("%s: removing %p\n", __func__, be);
610 list_del(&be->be_node);
611 bl_put_extent(be);
612 } else {
613 goto out_err;
614 }
615 }
616 } else if (end >= be->be_f_offset + be->be_length) {
617 /* new extent overlap existing be */
618 if (extents_consistent(be, new)) {
619 /* extend new to fully replace be */
620 dprintk("%s: removing %p\n", __func__, be);
621 list_del(&be->be_node);
622 bl_put_extent(be);
623 } else {
624 goto out_err;
625 }
626 } else if (end > be->be_f_offset) {
627 /* |<-- be -->|
628 *|<-- new -->| */
629 if (extents_consistent(new, be)) {
630 /* extend new to fully replace be */
631 new->be_length += be->be_f_offset + be->be_length -
632 new->be_f_offset - new->be_length;
633 dprintk("%s: removing %p\n", __func__, be);
634 list_del(&be->be_node);
635 bl_put_extent(be);
636 } else {
637 goto out_err;
638 }
639 }
640 }
641 /* Note that if we never hit the above break, be will not point to a
642 * valid extent. However, in that case &be->be_node==list.
643 */
644 list_add(&new->be_node, &be->be_node);
645 dprintk("%s: inserting new\n", __func__);
646 print_elist(list);
647 /* FIXME - The per-list consistency checks have all been done,
648 * should now check cross-list consistency.
649 */
650 return 0;
651
652 out_err:
653 bl_put_extent(new);
654 return -EIO;
655}
656
657/* Returns extent, or NULL. If a second READ extent exists, it is returned
658 * in cow_read, if given.
659 *
660 * The extents are kept in two seperate ordered lists, one for READ and NONE,
661 * one for READWRITE and INVALID. Within each list, we assume:
662 * 1. Extents are ordered by file offset.
663 * 2. For any given isect, there is at most one extents that matches.
664 */
665struct pnfs_block_extent *
666bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
667 struct pnfs_block_extent **cow_read)
668{
669 struct pnfs_block_extent *be, *cow, *ret;
670 int i;
671
672 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
673 cow = ret = NULL;
674 spin_lock(&bl->bl_ext_lock);
675 for (i = 0; i < EXTENT_LISTS; i++) {
676 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
677 if (isect >= be->be_f_offset + be->be_length)
678 break;
679 if (isect >= be->be_f_offset) {
680 /* We have found an extent */
681 dprintk("%s Get %p (%i)\n", __func__, be,
682 atomic_read(&be->be_refcnt.refcount));
683 kref_get(&be->be_refcnt);
684 if (!ret)
685 ret = be;
686 else if (be->be_state != PNFS_BLOCK_READ_DATA)
687 bl_put_extent(be);
688 else
689 cow = be;
690 break;
691 }
692 }
693 if (ret &&
694 (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
695 break;
696 }
697 spin_unlock(&bl->bl_ext_lock);
698 if (cow_read)
699 *cow_read = cow;
700 print_bl_extent(ret);
701 return ret;
702}
703
704/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
705static struct pnfs_block_extent *
706bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
707{
708 struct pnfs_block_extent *be, *ret = NULL;
709 int i;
710
711 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
712 for (i = 0; i < EXTENT_LISTS; i++) {
713 if (ret)
714 break;
715 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
716 if (isect >= be->be_f_offset + be->be_length)
717 break;
718 if (isect >= be->be_f_offset) {
719 /* We have found an extent */
720 dprintk("%s Get %p (%i)\n", __func__, be,
721 atomic_read(&be->be_refcnt.refcount));
722 kref_get(&be->be_refcnt);
723 ret = be;
724 break;
725 }
726 }
727 }
728 print_bl_extent(ret);
729 return ret;
730}
731
732int
733encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
734 struct xdr_stream *xdr,
735 const struct nfs4_layoutcommit_args *arg)
736{
737 struct pnfs_block_short_extent *lce, *save;
738 unsigned int count = 0;
739 __be32 *p, *xdr_start;
740
741 dprintk("%s enter\n", __func__);
742 /* BUG - creation of bl_commit is buggy - need to wait for
743 * entire block to be marked WRITTEN before it can be added.
744 */
745 spin_lock(&bl->bl_ext_lock);
746 /* Want to adjust for possible truncate */
747 /* We now want to adjust argument range */
748
749 /* XDR encode the ranges found */
750 xdr_start = xdr_reserve_space(xdr, 8);
751 if (!xdr_start)
752 goto out;
753 list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
754 p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
755 if (!p)
756 break;
757 p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
758 p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
759 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
760 p = xdr_encode_hyper(p, 0LL);
761 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
762 list_del(&lce->bse_node);
763 list_add_tail(&lce->bse_node, &bl->bl_committing);
764 bl->bl_count--;
765 count++;
766 }
767 xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
768 xdr_start[1] = cpu_to_be32(count);
769out:
770 spin_unlock(&bl->bl_ext_lock);
771 dprintk("%s found %i ranges\n", __func__, count);
772 return 0;
773}
774
775/* Helper function to set_to_rw that initialize a new extent */
776static void
777_prep_new_extent(struct pnfs_block_extent *new,
778 struct pnfs_block_extent *orig,
779 sector_t offset, sector_t length, int state)
780{
781 kref_init(&new->be_refcnt);
782 /* don't need to INIT_LIST_HEAD(&new->be_node) */
783 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
784 new->be_mdev = orig->be_mdev;
785 new->be_f_offset = offset;
786 new->be_length = length;
787 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
788 new->be_state = state;
789 new->be_inval = orig->be_inval;
790}
791
792/* Tries to merge be with extent in front of it in list.
793 * Frees storage if not used.
794 */
795static struct pnfs_block_extent *
796_front_merge(struct pnfs_block_extent *be, struct list_head *head,
797 struct pnfs_block_extent *storage)
798{
799 struct pnfs_block_extent *prev;
800
801 if (!storage)
802 goto no_merge;
803 if (&be->be_node == head || be->be_node.prev == head)
804 goto no_merge;
805 prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
806 if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
807 !extents_consistent(prev, be))
808 goto no_merge;
809 _prep_new_extent(storage, prev, prev->be_f_offset,
810 prev->be_length + be->be_length, prev->be_state);
811 list_replace(&prev->be_node, &storage->be_node);
812 bl_put_extent(prev);
813 list_del(&be->be_node);
814 bl_put_extent(be);
815 return storage;
816
817 no_merge:
818 kfree(storage);
819 return be;
820}
821
822static u64
823set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
824{
825 u64 rv = offset + length;
826 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
827 struct pnfs_block_extent *children[3];
828 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
829 int i = 0, j;
830
831 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
832 /* Create storage for up to three new extents e1, e2, e3 */
833 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
834 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
835 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
836 /* BUG - we are ignoring any failure */
837 if (!e1 || !e2 || !e3)
838 goto out_nosplit;
839
840 spin_lock(&bl->bl_ext_lock);
841 be = bl_find_get_extent_locked(bl, offset);
842 rv = be->be_f_offset + be->be_length;
843 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
844 spin_unlock(&bl->bl_ext_lock);
845 goto out_nosplit;
846 }
847 /* Add e* to children, bumping e*'s krefs */
848 if (be->be_f_offset != offset) {
849 _prep_new_extent(e1, be, be->be_f_offset,
850 offset - be->be_f_offset,
851 PNFS_BLOCK_INVALID_DATA);
852 children[i++] = e1;
853 print_bl_extent(e1);
854 } else
855 merge1 = e1;
856 _prep_new_extent(e2, be, offset,
857 min(length, be->be_f_offset + be->be_length - offset),
858 PNFS_BLOCK_READWRITE_DATA);
859 children[i++] = e2;
860 print_bl_extent(e2);
861 if (offset + length < be->be_f_offset + be->be_length) {
862 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
863 be->be_f_offset + be->be_length -
864 offset - length,
865 PNFS_BLOCK_INVALID_DATA);
866 children[i++] = e3;
867 print_bl_extent(e3);
868 } else
869 merge2 = e3;
870
871 /* Remove be from list, and insert the e* */
872 /* We don't get refs on e*, since this list is the base reference
873 * set when init'ed.
874 */
875 if (i < 3)
876 children[i] = NULL;
877 new = children[0];
878 list_replace(&be->be_node, &new->be_node);
879 bl_put_extent(be);
880 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
881 for (j = 1; j < i; j++) {
882 old = new;
883 new = children[j];
884 list_add(&new->be_node, &old->be_node);
885 }
886 if (merge2) {
887 /* This is a HACK, should just create a _back_merge function */
888 new = list_entry(new->be_node.next,
889 struct pnfs_block_extent, be_node);
890 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
891 }
892 spin_unlock(&bl->bl_ext_lock);
893
894 /* Since we removed the base reference above, be is now scheduled for
895 * destruction.
896 */
897 bl_put_extent(be);
898 dprintk("%s returns %llu after split\n", __func__, rv);
899 return rv;
900
901 out_nosplit:
902 kfree(e1);
903 kfree(e2);
904 kfree(e3);
905 dprintk("%s returns %llu without splitting\n", __func__, rv);
906 return rv;
907}
908
909void
910clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
911 const struct nfs4_layoutcommit_args *arg,
912 int status)
913{
914 struct pnfs_block_short_extent *lce, *save;
915
916 dprintk("%s status %d\n", __func__, status);
917 list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
918 if (likely(!status)) {
919 u64 offset = lce->bse_f_offset;
920 u64 end = offset + lce->bse_length;
921
922 do {
923 offset = set_to_rw(bl, offset, end - offset);
924 } while (offset < end);
925 list_del(&lce->bse_node);
926
927 kfree(lce);
928 } else {
929 list_del(&lce->bse_node);
930 spin_lock(&bl->bl_ext_lock);
931 add_to_commitlist(bl, lce);
932 spin_unlock(&bl->bl_ext_lock);
933 }
934 }
935}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 19ea7d9c75e6..5833fbbf59b0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = {
105 .nrvers = ARRAY_SIZE(nfs_version), 105 .nrvers = ARRAY_SIZE(nfs_version),
106 .version = nfs_version, 106 .version = nfs_version,
107 .stats = &nfs_rpcstat, 107 .stats = &nfs_rpcstat,
108 .pipe_dir_name = "/nfs", 108 .pipe_dir_name = NFS_PIPE_DIRNAME,
109}; 109};
110 110
111struct rpc_stat nfs_rpcstat = { 111struct rpc_stat nfs_rpcstat = {
@@ -904,7 +904,9 @@ error:
904/* 904/*
905 * Load up the server record from information gained in an fsinfo record 905 * Load up the server record from information gained in an fsinfo record
906 */ 906 */
907static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) 907static void nfs_server_set_fsinfo(struct nfs_server *server,
908 struct nfs_fh *mntfh,
909 struct nfs_fsinfo *fsinfo)
908{ 910{
909 unsigned long max_rpc_payload; 911 unsigned long max_rpc_payload;
910 912
@@ -934,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
934 if (server->wsize > NFS_MAX_FILE_IO_SIZE) 936 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
935 server->wsize = NFS_MAX_FILE_IO_SIZE; 937 server->wsize = NFS_MAX_FILE_IO_SIZE;
936 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 938 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
937 set_pnfs_layoutdriver(server, fsinfo->layouttype); 939 server->pnfs_blksize = fsinfo->blksize;
940 set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
938 941
939 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); 942 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
940 943
@@ -980,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
980 if (error < 0) 983 if (error < 0)
981 goto out_error; 984 goto out_error;
982 985
983 nfs_server_set_fsinfo(server, &fsinfo); 986 nfs_server_set_fsinfo(server, mntfh, &fsinfo);
984 987
985 /* Get some general file system info */ 988 /* Get some general file system info */
986 if (server->namelen == 0) { 989 if (server->namelen == 0) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 57f578e2560a..b238d95ac48c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = {
134 134
135#endif /* CONFIG_NFS_V4 */ 135#endif /* CONFIG_NFS_V4 */
136 136
137static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) 137static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
138{ 138{
139 struct nfs_open_dir_context *ctx; 139 struct nfs_open_dir_context *ctx;
140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
141 if (ctx != NULL) { 141 if (ctx != NULL) {
142 ctx->duped = 0; 142 ctx->duped = 0;
143 ctx->attr_gencount = NFS_I(dir)->attr_gencount;
143 ctx->dir_cookie = 0; 144 ctx->dir_cookie = 0;
144 ctx->dup_cookie = 0; 145 ctx->dup_cookie = 0;
145 ctx->cred = get_rpccred(cred); 146 ctx->cred = get_rpccred(cred);
146 } else 147 return ctx;
147 ctx = ERR_PTR(-ENOMEM); 148 }
148 return ctx; 149 return ERR_PTR(-ENOMEM);
149} 150}
150 151
151static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) 152static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
@@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
173 cred = rpc_lookup_cred(); 174 cred = rpc_lookup_cred();
174 if (IS_ERR(cred)) 175 if (IS_ERR(cred))
175 return PTR_ERR(cred); 176 return PTR_ERR(cred);
176 ctx = alloc_nfs_open_dir_context(cred); 177 ctx = alloc_nfs_open_dir_context(inode, cred);
177 if (IS_ERR(ctx)) { 178 if (IS_ERR(ctx)) {
178 res = PTR_ERR(ctx); 179 res = PTR_ERR(ctx);
179 goto out; 180 goto out;
@@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
323{ 324{
324 loff_t diff = desc->file->f_pos - desc->current_index; 325 loff_t diff = desc->file->f_pos - desc->current_index;
325 unsigned int index; 326 unsigned int index;
326 struct nfs_open_dir_context *ctx = desc->file->private_data;
327 327
328 if (diff < 0) 328 if (diff < 0)
329 goto out_eof; 329 goto out_eof;
@@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
336 index = (unsigned int)diff; 336 index = (unsigned int)diff;
337 *desc->dir_cookie = array->array[index].cookie; 337 *desc->dir_cookie = array->array[index].cookie;
338 desc->cache_entry_index = index; 338 desc->cache_entry_index = index;
339 ctx->duped = 0;
340 return 0; 339 return 0;
341out_eof: 340out_eof:
342 desc->eof = 1; 341 desc->eof = 1;
@@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
349 int i; 348 int i;
350 loff_t new_pos; 349 loff_t new_pos;
351 int status = -EAGAIN; 350 int status = -EAGAIN;
352 struct nfs_open_dir_context *ctx = desc->file->private_data;
353 351
354 for (i = 0; i < array->size; i++) { 352 for (i = 0; i < array->size; i++) {
355 if (array->array[i].cookie == *desc->dir_cookie) { 353 if (array->array[i].cookie == *desc->dir_cookie) {
354 struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
355 struct nfs_open_dir_context *ctx = desc->file->private_data;
356
356 new_pos = desc->current_index + i; 357 new_pos = desc->current_index + i;
357 if (new_pos < desc->file->f_pos) { 358 if (ctx->attr_gencount != nfsi->attr_gencount
359 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
360 ctx->duped = 0;
361 ctx->attr_gencount = nfsi->attr_gencount;
362 } else if (new_pos < desc->file->f_pos) {
363 if (ctx->duped > 0
364 && ctx->dup_cookie == *desc->dir_cookie) {
365 if (printk_ratelimit()) {
366 pr_notice("NFS: directory %s/%s contains a readdir loop."
367 "Please contact your server vendor. "
368 "The file: %s has duplicate cookie %llu\n",
369 desc->file->f_dentry->d_parent->d_name.name,
370 desc->file->f_dentry->d_name.name,
371 array->array[i].string.name,
372 *desc->dir_cookie);
373 }
374 status = -ELOOP;
375 goto out;
376 }
358 ctx->dup_cookie = *desc->dir_cookie; 377 ctx->dup_cookie = *desc->dir_cookie;
359 ctx->duped = 1; 378 ctx->duped = -1;
360 } 379 }
361 desc->file->f_pos = new_pos; 380 desc->file->f_pos = new_pos;
362 desc->cache_entry_index = i; 381 desc->cache_entry_index = i;
@@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
368 if (*desc->dir_cookie == array->last_cookie) 387 if (*desc->dir_cookie == array->last_cookie)
369 desc->eof = 1; 388 desc->eof = 1;
370 } 389 }
390out:
371 return status; 391 return status;
372} 392}
373 393
@@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
740 struct nfs_cache_array *array = NULL; 760 struct nfs_cache_array *array = NULL;
741 struct nfs_open_dir_context *ctx = file->private_data; 761 struct nfs_open_dir_context *ctx = file->private_data;
742 762
743 if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
744 if (printk_ratelimit()) {
745 pr_notice("NFS: directory %s/%s contains a readdir loop. "
746 "Please contact your server vendor. "
747 "Offending cookie: %llu\n",
748 file->f_dentry->d_parent->d_name.name,
749 file->f_dentry->d_name.name,
750 *desc->dir_cookie);
751 }
752 res = -ELOOP;
753 goto out;
754 }
755
756 array = nfs_readdir_get_array(desc->page); 763 array = nfs_readdir_get_array(desc->page);
757 if (IS_ERR(array)) { 764 if (IS_ERR(array)) {
758 res = PTR_ERR(array); 765 res = PTR_ERR(array);
@@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
774 *desc->dir_cookie = array->array[i+1].cookie; 781 *desc->dir_cookie = array->array[i+1].cookie;
775 else 782 else
776 *desc->dir_cookie = array->last_cookie; 783 *desc->dir_cookie = array->last_cookie;
784 if (ctx->duped != 0)
785 ctx->duped = 1;
777 } 786 }
778 if (array->eof_index >= 0) 787 if (array->eof_index >= 0)
779 desc->eof = 1; 788 desc->eof = 1;
@@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
805 struct page *page = NULL; 814 struct page *page = NULL;
806 int status; 815 int status;
807 struct inode *inode = desc->file->f_path.dentry->d_inode; 816 struct inode *inode = desc->file->f_path.dentry->d_inode;
817 struct nfs_open_dir_context *ctx = desc->file->private_data;
808 818
809 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 819 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
810 (unsigned long long)*desc->dir_cookie); 820 (unsigned long long)*desc->dir_cookie);
@@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
818 desc->page_index = 0; 828 desc->page_index = 0;
819 desc->last_cookie = *desc->dir_cookie; 829 desc->last_cookie = *desc->dir_cookie;
820 desc->page = page; 830 desc->page = page;
831 ctx->duped = 0;
821 832
822 status = nfs_readdir_xdr_to_array(desc, page, inode); 833 status = nfs_readdir_xdr_to_array(desc, page, inode);
823 if (status < 0) 834 if (status < 0)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1909ee8be350..1ec1a85fa71c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -318,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
318extern const u32 nfs4_fattr_bitmap[2]; 318extern const u32 nfs4_fattr_bitmap[2];
319extern const u32 nfs4_statfs_bitmap[2]; 319extern const u32 nfs4_statfs_bitmap[2];
320extern const u32 nfs4_pathconf_bitmap[2]; 320extern const u32 nfs4_pathconf_bitmap[2];
321extern const u32 nfs4_fsinfo_bitmap[2]; 321extern const u32 nfs4_fsinfo_bitmap[3];
322extern const u32 nfs4_fs_locations_bitmap[2]; 322extern const u32 nfs4_fs_locations_bitmap[2];
323 323
324/* nfs4renewd.c */ 324/* nfs4renewd.c */
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be93a622872c..e8915d4840ad 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
170 170
171 pnfs_set_layoutcommit(wdata); 171 pnfs_set_layoutcommit(wdata);
172 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, 172 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
173 (unsigned long) wdata->lseg->pls_end_pos); 173 (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
174} 174}
175 175
176/* 176/*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 079614deca3f..8c77039e7a81 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -140,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
140 0 140 0
141}; 141};
142 142
143const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE 143const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
144 | FATTR4_WORD0_MAXREAD 144 | FATTR4_WORD0_MAXREAD
145 | FATTR4_WORD0_MAXWRITE 145 | FATTR4_WORD0_MAXWRITE
146 | FATTR4_WORD0_LEASE_TIME, 146 | FATTR4_WORD0_LEASE_TIME,
147 FATTR4_WORD1_TIME_DELTA 147 FATTR4_WORD1_TIME_DELTA
148 | FATTR4_WORD1_FS_LAYOUT_TYPES 148 | FATTR4_WORD1_FS_LAYOUT_TYPES,
149 FATTR4_WORD2_LAYOUT_BLKSIZE
149}; 150};
150 151
151const u32 nfs4_fs_locations_bitmap[2] = { 152const u32 nfs4_fs_locations_bitmap[2] = {
@@ -5834,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
5834 return status; 5835 return status;
5835} 5836}
5836 5837
5838/*
5839 * Retrieve the list of Data Server devices from the MDS.
5840 */
5841static int _nfs4_getdevicelist(struct nfs_server *server,
5842 const struct nfs_fh *fh,
5843 struct pnfs_devicelist *devlist)
5844{
5845 struct nfs4_getdevicelist_args args = {
5846 .fh = fh,
5847 .layoutclass = server->pnfs_curr_ld->id,
5848 };
5849 struct nfs4_getdevicelist_res res = {
5850 .devlist = devlist,
5851 };
5852 struct rpc_message msg = {
5853 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
5854 .rpc_argp = &args,
5855 .rpc_resp = &res,
5856 };
5857 int status;
5858
5859 dprintk("--> %s\n", __func__);
5860 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
5861 &res.seq_res, 0);
5862 dprintk("<-- %s status=%d\n", __func__, status);
5863 return status;
5864}
5865
5866int nfs4_proc_getdevicelist(struct nfs_server *server,
5867 const struct nfs_fh *fh,
5868 struct pnfs_devicelist *devlist)
5869{
5870 struct nfs4_exception exception = { };
5871 int err;
5872
5873 do {
5874 err = nfs4_handle_exception(server,
5875 _nfs4_getdevicelist(server, fh, devlist),
5876 &exception);
5877 } while (exception.retry);
5878
5879 dprintk("%s: err=%d, num_devs=%u\n", __func__,
5880 err, devlist->num_devs);
5881
5882 return err;
5883}
5884EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
5885
5837static int 5886static int
5838_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5887_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5839{ 5888{
@@ -5912,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5912static void nfs4_layoutcommit_release(void *calldata) 5961static void nfs4_layoutcommit_release(void *calldata)
5913{ 5962{
5914 struct nfs4_layoutcommit_data *data = calldata; 5963 struct nfs4_layoutcommit_data *data = calldata;
5964 struct pnfs_layout_segment *lseg, *tmp;
5915 5965
5966 pnfs_cleanup_layoutcommit(data);
5916 /* Matched by references in pnfs_set_layoutcommit */ 5967 /* Matched by references in pnfs_set_layoutcommit */
5917 put_lseg(data->lseg); 5968 list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
5969 list_del_init(&lseg->pls_lc_list);
5970 if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
5971 &lseg->pls_flags))
5972 put_lseg(lseg);
5973 }
5918 put_rpccred(data->cred); 5974 put_rpccred(data->cred);
5919 kfree(data); 5975 kfree(data);
5920} 5976}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c191a9baa422..1dce12f41a4f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
113#define encode_restorefh_maxsz (op_encode_hdr_maxsz) 113#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
114#define decode_restorefh_maxsz (op_decode_hdr_maxsz) 114#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
115#define encode_fsinfo_maxsz (encode_getattr_maxsz) 115#define encode_fsinfo_maxsz (encode_getattr_maxsz)
116#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) 116/* The 5 accounts for the PNFS attributes, and assumes that at most three
117 * layout types will be returned.
118 */
119#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
120 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
117#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 121#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
118#define decode_renew_maxsz (op_decode_hdr_maxsz) 122#define decode_renew_maxsz (op_decode_hdr_maxsz)
119#define encode_setclientid_maxsz \ 123#define encode_setclientid_maxsz \
@@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int);
314 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 318 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
315#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 319#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
316#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 320#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
321#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
322 encode_verifier_maxsz)
323#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
324 2 /* nfs_cookie4 gdlr_cookie */ + \
325 decode_verifier_maxsz \
326 /* verifier4 gdlr_verifier */ + \
327 1 /* gdlr_deviceid_list count */ + \
328 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
329 NFS4_DEVICEID4_SIZE) \
330 /* gdlr_deviceid_list */ + \
331 1 /* bool gdlr_eof */)
317#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ 332#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
318 XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) 333 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
319#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 334#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -748,6 +763,14 @@ static int nfs4_stat_to_errno(int);
748#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 763#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
749 decode_sequence_maxsz + \ 764 decode_sequence_maxsz + \
750 decode_reclaim_complete_maxsz) 765 decode_reclaim_complete_maxsz)
766#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
767 encode_sequence_maxsz + \
768 encode_putfh_maxsz + \
769 encode_getdevicelist_maxsz)
770#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
771 decode_sequence_maxsz + \
772 decode_putfh_maxsz + \
773 decode_getdevicelist_maxsz)
751#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 774#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
752 encode_sequence_maxsz +\ 775 encode_sequence_maxsz +\
753 encode_getdeviceinfo_maxsz) 776 encode_getdeviceinfo_maxsz)
@@ -1104,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
1104 hdr->replen += decode_getattr_maxsz; 1127 hdr->replen += decode_getattr_maxsz;
1105} 1128}
1106 1129
1130static void
1131encode_getattr_three(struct xdr_stream *xdr,
1132 uint32_t bm0, uint32_t bm1, uint32_t bm2,
1133 struct compound_hdr *hdr)
1134{
1135 __be32 *p;
1136
1137 p = reserve_space(xdr, 4);
1138 *p = cpu_to_be32(OP_GETATTR);
1139 if (bm2) {
1140 p = reserve_space(xdr, 16);
1141 *p++ = cpu_to_be32(3);
1142 *p++ = cpu_to_be32(bm0);
1143 *p++ = cpu_to_be32(bm1);
1144 *p = cpu_to_be32(bm2);
1145 } else if (bm1) {
1146 p = reserve_space(xdr, 12);
1147 *p++ = cpu_to_be32(2);
1148 *p++ = cpu_to_be32(bm0);
1149 *p = cpu_to_be32(bm1);
1150 } else {
1151 p = reserve_space(xdr, 8);
1152 *p++ = cpu_to_be32(1);
1153 *p = cpu_to_be32(bm0);
1154 }
1155 hdr->nops++;
1156 hdr->replen += decode_getattr_maxsz;
1157}
1158
1107static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1159static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1108{ 1160{
1109 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], 1161 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1112,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
1112 1164
1113static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1165static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1114{ 1166{
1115 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 1167 encode_getattr_three(xdr,
1116 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); 1168 bitmask[0] & nfs4_fsinfo_bitmap[0],
1169 bitmask[1] & nfs4_fsinfo_bitmap[1],
1170 bitmask[2] & nfs4_fsinfo_bitmap[2],
1171 hdr);
1117} 1172}
1118 1173
1119static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1174static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1855,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr,
1855 1910
1856#ifdef CONFIG_NFS_V4_1 1911#ifdef CONFIG_NFS_V4_1
1857static void 1912static void
1913encode_getdevicelist(struct xdr_stream *xdr,
1914 const struct nfs4_getdevicelist_args *args,
1915 struct compound_hdr *hdr)
1916{
1917 __be32 *p;
1918 nfs4_verifier dummy = {
1919 .data = "dummmmmy",
1920 };
1921
1922 p = reserve_space(xdr, 20);
1923 *p++ = cpu_to_be32(OP_GETDEVICELIST);
1924 *p++ = cpu_to_be32(args->layoutclass);
1925 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1926 xdr_encode_hyper(p, 0ULL); /* cookie */
1927 encode_nfs4_verifier(xdr, &dummy);
1928 hdr->nops++;
1929 hdr->replen += decode_getdevicelist_maxsz;
1930}
1931
1932static void
1858encode_getdeviceinfo(struct xdr_stream *xdr, 1933encode_getdeviceinfo(struct xdr_stream *xdr,
1859 const struct nfs4_getdeviceinfo_args *args, 1934 const struct nfs4_getdeviceinfo_args *args,
1860 struct compound_hdr *hdr) 1935 struct compound_hdr *hdr)
@@ -1916,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
1916 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1991 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1917 /* Only whole file layouts */ 1992 /* Only whole file layouts */
1918 p = xdr_encode_hyper(p, 0); /* offset */ 1993 p = xdr_encode_hyper(p, 0); /* offset */
1919 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ 1994 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
1920 *p++ = cpu_to_be32(0); /* reclaim */ 1995 *p++ = cpu_to_be32(0); /* reclaim */
1921 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); 1996 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
1922 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1997 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
@@ -2604,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
2604 struct compound_hdr hdr = { 2679 struct compound_hdr hdr = {
2605 .nops = 0, 2680 .nops = 0,
2606 }; 2681 };
2607 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2682 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2608 2683
2609 encode_compound_hdr(xdr, req, &hdr); 2684 encode_compound_hdr(xdr, req, &hdr);
2610 encode_setclientid_confirm(xdr, arg, &hdr); 2685 encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2748,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
2748 struct compound_hdr hdr = { 2823 struct compound_hdr hdr = {
2749 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2824 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
2750 }; 2825 };
2751 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2826 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2752 2827
2753 encode_compound_hdr(xdr, req, &hdr); 2828 encode_compound_hdr(xdr, req, &hdr);
2754 encode_sequence(xdr, &args->la_seq_args, &hdr); 2829 encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -2775,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2775} 2850}
2776 2851
2777/* 2852/*
2853 * Encode GETDEVICELIST request
2854 */
2855static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2856 struct xdr_stream *xdr,
2857 struct nfs4_getdevicelist_args *args)
2858{
2859 struct compound_hdr hdr = {
2860 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2861 };
2862
2863 encode_compound_hdr(xdr, req, &hdr);
2864 encode_sequence(xdr, &args->seq_args, &hdr);
2865 encode_putfh(xdr, args->fh, &hdr);
2866 encode_getdevicelist(xdr, args, &hdr);
2867 encode_nops(&hdr);
2868}
2869
2870/*
2778 * Encode GETDEVICEINFO request 2871 * Encode GETDEVICEINFO request
2779 */ 2872 */
2780static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2873static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -3011,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
3011 goto out_overflow; 3104 goto out_overflow;
3012 bmlen = be32_to_cpup(p); 3105 bmlen = be32_to_cpup(p);
3013 3106
3014 bitmap[0] = bitmap[1] = 0; 3107 bitmap[0] = bitmap[1] = bitmap[2] = 0;
3015 p = xdr_inline_decode(xdr, (bmlen << 2)); 3108 p = xdr_inline_decode(xdr, (bmlen << 2));
3016 if (unlikely(!p)) 3109 if (unlikely(!p))
3017 goto out_overflow; 3110 goto out_overflow;
3018 if (bmlen > 0) { 3111 if (bmlen > 0) {
3019 bitmap[0] = be32_to_cpup(p++); 3112 bitmap[0] = be32_to_cpup(p++);
3020 if (bmlen > 1) 3113 if (bmlen > 1) {
3021 bitmap[1] = be32_to_cpup(p); 3114 bitmap[1] = be32_to_cpup(p++);
3115 if (bmlen > 2)
3116 bitmap[2] = be32_to_cpup(p);
3117 }
3022 } 3118 }
3023 return 0; 3119 return 0;
3024out_overflow: 3120out_overflow:
@@ -3050,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
3050 return ret; 3146 return ret;
3051 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; 3147 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
3052 } else 3148 } else
3053 bitmask[0] = bitmask[1] = 0; 3149 bitmask[0] = bitmask[1] = bitmask[2] = 0;
3054 dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); 3150 dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
3151 bitmask[0], bitmask[1], bitmask[2]);
3055 return 0; 3152 return 0;
3056} 3153}
3057 3154
@@ -4105,7 +4202,7 @@ out_overflow:
4105static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 4202static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
4106{ 4203{
4107 __be32 *savep; 4204 __be32 *savep;
4108 uint32_t attrlen, bitmap[2] = {0}; 4205 uint32_t attrlen, bitmap[3] = {0};
4109 int status; 4206 int status;
4110 4207
4111 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4208 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4131,7 +4228,7 @@ xdr_error:
4131static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 4228static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
4132{ 4229{
4133 __be32 *savep; 4230 __be32 *savep;
4134 uint32_t attrlen, bitmap[2] = {0}; 4231 uint32_t attrlen, bitmap[3] = {0};
4135 int status; 4232 int status;
4136 4233
4137 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4234 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4163,7 +4260,7 @@ xdr_error:
4163static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 4260static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
4164{ 4261{
4165 __be32 *savep; 4262 __be32 *savep;
4166 uint32_t attrlen, bitmap[2] = {0}; 4263 uint32_t attrlen, bitmap[3] = {0};
4167 int status; 4264 int status;
4168 4265
4169 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4266 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4303,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4303{ 4400{
4304 __be32 *savep; 4401 __be32 *savep;
4305 uint32_t attrlen, 4402 uint32_t attrlen,
4306 bitmap[2] = {0}; 4403 bitmap[3] = {0};
4307 int status; 4404 int status;
4308 4405
4309 status = decode_op_hdr(xdr, OP_GETATTR); 4406 status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4389,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
4389 return status; 4486 return status;
4390} 4487}
4391 4488
4489/*
4490 * The preferred block size for layout directed io
4491 */
4492static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
4493 uint32_t *res)
4494{
4495 __be32 *p;
4496
4497 dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
4498 *res = 0;
4499 if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
4500 p = xdr_inline_decode(xdr, 4);
4501 if (unlikely(!p)) {
4502 print_overflow_msg(__func__, xdr);
4503 return -EIO;
4504 }
4505 *res = be32_to_cpup(p);
4506 bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
4507 }
4508 return 0;
4509}
4510
4392static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4511static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4393{ 4512{
4394 __be32 *savep; 4513 __be32 *savep;
4395 uint32_t attrlen, bitmap[2]; 4514 uint32_t attrlen, bitmap[3];
4396 int status; 4515 int status;
4397 4516
4398 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4517 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4420,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4420 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); 4539 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4421 if (status != 0) 4540 if (status != 0)
4422 goto xdr_error; 4541 goto xdr_error;
4542 status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
4543 if (status)
4544 goto xdr_error;
4423 4545
4424 status = verify_attr_len(xdr, savep, attrlen); 4546 status = verify_attr_len(xdr, savep, attrlen);
4425xdr_error: 4547xdr_error:
@@ -4839,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4839{ 4961{
4840 __be32 *savep; 4962 __be32 *savep;
4841 uint32_t attrlen, 4963 uint32_t attrlen,
4842 bitmap[2] = {0}; 4964 bitmap[3] = {0};
4843 struct kvec *iov = req->rq_rcv_buf.head; 4965 struct kvec *iov = req->rq_rcv_buf.head;
4844 int status; 4966 int status;
4845 4967
@@ -5268,6 +5390,53 @@ out_overflow:
5268} 5390}
5269 5391
5270#if defined(CONFIG_NFS_V4_1) 5392#if defined(CONFIG_NFS_V4_1)
5393/*
5394 * TODO: Need to handle case when EOF != true;
5395 */
5396static int decode_getdevicelist(struct xdr_stream *xdr,
5397 struct pnfs_devicelist *res)
5398{
5399 __be32 *p;
5400 int status, i;
5401 struct nfs_writeverf verftemp;
5402
5403 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5404 if (status)
5405 return status;
5406
5407 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5408 if (unlikely(!p))
5409 goto out_overflow;
5410
5411 /* TODO: Skip cookie for now */
5412 p += 2;
5413
5414 /* Read verifier */
5415 p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
5416
5417 res->num_devs = be32_to_cpup(p);
5418
5419 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5420
5421 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5422 printk(KERN_ERR "%s too many result dev_num %u\n",
5423 __func__, res->num_devs);
5424 return -EIO;
5425 }
5426
5427 p = xdr_inline_decode(xdr,
5428 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5429 if (unlikely(!p))
5430 goto out_overflow;
5431 for (i = 0; i < res->num_devs; i++)
5432 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5433 NFS4_DEVICEID4_SIZE);
5434 res->eof = be32_to_cpup(p);
5435 return 0;
5436out_overflow:
5437 print_overflow_msg(__func__, xdr);
5438 return -EIO;
5439}
5271 5440
5272static int decode_getdeviceinfo(struct xdr_stream *xdr, 5441static int decode_getdeviceinfo(struct xdr_stream *xdr,
5273 struct pnfs_device *pdev) 5442 struct pnfs_device *pdev)
@@ -5430,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
5430 int status; 5599 int status;
5431 5600
5432 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); 5601 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
5602 res->status = status;
5433 if (status) 5603 if (status)
5434 return status; 5604 return status;
5435 5605
@@ -6542,6 +6712,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6542} 6712}
6543 6713
6544/* 6714/*
6715 * Decode GETDEVICELIST response
6716 */
6717static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
6718 struct xdr_stream *xdr,
6719 struct nfs4_getdevicelist_res *res)
6720{
6721 struct compound_hdr hdr;
6722 int status;
6723
6724 dprintk("encoding getdevicelist!\n");
6725
6726 status = decode_compound_hdr(xdr, &hdr);
6727 if (status != 0)
6728 goto out;
6729 status = decode_sequence(xdr, &res->seq_res, rqstp);
6730 if (status != 0)
6731 goto out;
6732 status = decode_putfh(xdr);
6733 if (status != 0)
6734 goto out;
6735 status = decode_getdevicelist(xdr, res->devlist);
6736out:
6737 return status;
6738}
6739
6740/*
6545 * Decode GETDEVINFO response 6741 * Decode GETDEVINFO response
6546 */ 6742 */
6547static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, 6743static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -6722,7 +6918,7 @@ out:
6722int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6918int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6723 int plus) 6919 int plus)
6724{ 6920{
6725 uint32_t bitmap[2] = {0}; 6921 uint32_t bitmap[3] = {0};
6726 uint32_t len; 6922 uint32_t len;
6727 __be32 *p = xdr_inline_decode(xdr, 4); 6923 __be32 *p = xdr_inline_decode(xdr, 4);
6728 if (unlikely(!p)) 6924 if (unlikely(!p))
@@ -6908,6 +7104,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6908 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), 7104 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
6909 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), 7105 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
6910 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), 7106 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7107 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
6911#endif /* CONFIG_NFS_V4_1 */ 7108#endif /* CONFIG_NFS_V4_1 */
6912}; 7109};
6913 7110
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 38e5508555c6..e550e8836c37 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -76,8 +76,11 @@ find_pnfs_driver(u32 id)
76void 76void
77unset_pnfs_layoutdriver(struct nfs_server *nfss) 77unset_pnfs_layoutdriver(struct nfs_server *nfss)
78{ 78{
79 if (nfss->pnfs_curr_ld) 79 if (nfss->pnfs_curr_ld) {
80 if (nfss->pnfs_curr_ld->clear_layoutdriver)
81 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
80 module_put(nfss->pnfs_curr_ld->owner); 82 module_put(nfss->pnfs_curr_ld->owner);
83 }
81 nfss->pnfs_curr_ld = NULL; 84 nfss->pnfs_curr_ld = NULL;
82} 85}
83 86
@@ -88,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
88 * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 91 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
89 */ 92 */
90void 93void
91set_pnfs_layoutdriver(struct nfs_server *server, u32 id) 94set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
95 u32 id)
92{ 96{
93 struct pnfs_layoutdriver_type *ld_type = NULL; 97 struct pnfs_layoutdriver_type *ld_type = NULL;
94 98
@@ -115,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
115 goto out_no_driver; 119 goto out_no_driver;
116 } 120 }
117 server->pnfs_curr_ld = ld_type; 121 server->pnfs_curr_ld = ld_type;
122 if (ld_type->set_layoutdriver
123 && ld_type->set_layoutdriver(server, mntfh)) {
124 printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
125 __func__, id);
126 module_put(ld_type->owner);
127 goto out_no_driver;
128 }
118 129
119 dprintk("%s: pNFS module for %u set\n", __func__, id); 130 dprintk("%s: pNFS module for %u set\n", __func__, id);
120 return; 131 return;
@@ -190,6 +201,7 @@ static void
190pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 201pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
191{ 202{
192 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; 203 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
204 put_rpccred(lo->plh_lc_cred);
193 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); 205 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
194} 206}
195 207
@@ -224,6 +236,7 @@ static void
224init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 236init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
225{ 237{
226 INIT_LIST_HEAD(&lseg->pls_list); 238 INIT_LIST_HEAD(&lseg->pls_list);
239 INIT_LIST_HEAD(&lseg->pls_lc_list);
227 atomic_set(&lseg->pls_refcount, 1); 240 atomic_set(&lseg->pls_refcount, 1);
228 smp_mb(); 241 smp_mb();
229 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 242 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
@@ -816,7 +829,9 @@ out:
816} 829}
817 830
818static struct pnfs_layout_hdr * 831static struct pnfs_layout_hdr *
819alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) 832alloc_init_layout_hdr(struct inode *ino,
833 struct nfs_open_context *ctx,
834 gfp_t gfp_flags)
820{ 835{
821 struct pnfs_layout_hdr *lo; 836 struct pnfs_layout_hdr *lo;
822 837
@@ -828,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
828 INIT_LIST_HEAD(&lo->plh_segs); 843 INIT_LIST_HEAD(&lo->plh_segs);
829 INIT_LIST_HEAD(&lo->plh_bulk_recall); 844 INIT_LIST_HEAD(&lo->plh_bulk_recall);
830 lo->plh_inode = ino; 845 lo->plh_inode = ino;
846 lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
831 return lo; 847 return lo;
832} 848}
833 849
834static struct pnfs_layout_hdr * 850static struct pnfs_layout_hdr *
835pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) 851pnfs_find_alloc_layout(struct inode *ino,
852 struct nfs_open_context *ctx,
853 gfp_t gfp_flags)
836{ 854{
837 struct nfs_inode *nfsi = NFS_I(ino); 855 struct nfs_inode *nfsi = NFS_I(ino);
838 struct pnfs_layout_hdr *new = NULL; 856 struct pnfs_layout_hdr *new = NULL;
@@ -847,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
847 return nfsi->layout; 865 return nfsi->layout;
848 } 866 }
849 spin_unlock(&ino->i_lock); 867 spin_unlock(&ino->i_lock);
850 new = alloc_init_layout_hdr(ino, gfp_flags); 868 new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
851 spin_lock(&ino->i_lock); 869 spin_lock(&ino->i_lock);
852 870
853 if (likely(nfsi->layout == NULL)) /* Won the race? */ 871 if (likely(nfsi->layout == NULL)) /* Won the race? */
@@ -940,7 +958,7 @@ pnfs_update_layout(struct inode *ino,
940 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 958 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
941 return NULL; 959 return NULL;
942 spin_lock(&ino->i_lock); 960 spin_lock(&ino->i_lock);
943 lo = pnfs_find_alloc_layout(ino, gfp_flags); 961 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
944 if (lo == NULL) { 962 if (lo == NULL) {
945 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); 963 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
946 goto out_unlock; 964 goto out_unlock;
@@ -1350,16 +1368,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1350EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); 1368EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
1351 1369
1352/* 1370/*
1353 * Currently there is only one (whole file) write lseg. 1371 * There can be multiple RW segments.
1354 */ 1372 */
1355static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) 1373static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1356{ 1374{
1357 struct pnfs_layout_segment *lseg, *rv = NULL; 1375 struct pnfs_layout_segment *lseg;
1358 1376
1359 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) 1377 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1360 if (lseg->pls_range.iomode == IOMODE_RW) 1378 if (lseg->pls_range.iomode == IOMODE_RW &&
1361 rv = lseg; 1379 test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
1362 return rv; 1380 list_add(&lseg->pls_lc_list, listp);
1381 }
1363} 1382}
1364 1383
1365void 1384void
@@ -1371,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1371 1390
1372 spin_lock(&nfsi->vfs_inode.i_lock); 1391 spin_lock(&nfsi->vfs_inode.i_lock);
1373 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1392 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1374 /* references matched in nfs4_layoutcommit_release */
1375 get_lseg(wdata->lseg);
1376 wdata->lseg->pls_lc_cred =
1377 get_rpccred(wdata->args.context->state->owner->so_cred);
1378 mark_as_dirty = true; 1393 mark_as_dirty = true;
1379 dprintk("%s: Set layoutcommit for inode %lu ", 1394 dprintk("%s: Set layoutcommit for inode %lu ",
1380 __func__, wdata->inode->i_ino); 1395 __func__, wdata->inode->i_ino);
1381 } 1396 }
1382 if (end_pos > wdata->lseg->pls_end_pos) 1397 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
1383 wdata->lseg->pls_end_pos = end_pos; 1398 /* references matched in nfs4_layoutcommit_release */
1399 get_lseg(wdata->lseg);
1400 }
1401 if (end_pos > nfsi->layout->plh_lwb)
1402 nfsi->layout->plh_lwb = end_pos;
1384 spin_unlock(&nfsi->vfs_inode.i_lock); 1403 spin_unlock(&nfsi->vfs_inode.i_lock);
1404 dprintk("%s: lseg %p end_pos %llu\n",
1405 __func__, wdata->lseg, nfsi->layout->plh_lwb);
1385 1406
1386 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 1407 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1387 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 1408 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
@@ -1390,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1390} 1411}
1391EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1412EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1392 1413
1414void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1415{
1416 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1417
1418 if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1419 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
1420}
1421
1393/* 1422/*
1394 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 1423 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
1395 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 1424 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
@@ -1403,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1403{ 1432{
1404 struct nfs4_layoutcommit_data *data; 1433 struct nfs4_layoutcommit_data *data;
1405 struct nfs_inode *nfsi = NFS_I(inode); 1434 struct nfs_inode *nfsi = NFS_I(inode);
1406 struct pnfs_layout_segment *lseg;
1407 struct rpc_cred *cred;
1408 loff_t end_pos; 1435 loff_t end_pos;
1409 int status = 0; 1436 int status = 0;
1410 1437
@@ -1421,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1421 goto out; 1448 goto out;
1422 } 1449 }
1423 1450
1451 INIT_LIST_HEAD(&data->lseg_list);
1424 spin_lock(&inode->i_lock); 1452 spin_lock(&inode->i_lock);
1425 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1426 spin_unlock(&inode->i_lock); 1454 spin_unlock(&inode->i_lock);
1427 kfree(data); 1455 kfree(data);
1428 goto out; 1456 goto out;
1429 } 1457 }
1430 /*
1431 * Currently only one (whole file) write lseg which is referenced
1432 * in pnfs_set_layoutcommit and will be found.
1433 */
1434 lseg = pnfs_list_write_lseg(inode);
1435 1458
1436 end_pos = lseg->pls_end_pos; 1459 pnfs_list_write_lseg(inode, &data->lseg_list);
1437 cred = lseg->pls_lc_cred; 1460
1438 lseg->pls_end_pos = 0; 1461 end_pos = nfsi->layout->plh_lwb;
1439 lseg->pls_lc_cred = NULL; 1462 nfsi->layout->plh_lwb = 0;
1440 1463
1441 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, 1464 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
1442 sizeof(nfsi->layout->plh_stateid.data)); 1465 sizeof(nfsi->layout->plh_stateid.data));
1443 spin_unlock(&inode->i_lock); 1466 spin_unlock(&inode->i_lock);
1444 1467
1445 data->args.inode = inode; 1468 data->args.inode = inode;
1446 data->lseg = lseg; 1469 data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
1447 data->cred = cred;
1448 nfs_fattr_init(&data->fattr); 1470 nfs_fattr_init(&data->fattr);
1449 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 1471 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1450 data->res.fattr = &data->fattr; 1472 data->res.fattr = &data->fattr;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 078670dfbe04..e0b5d80a43f6 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -36,16 +36,16 @@
36enum { 36enum {
37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
38 NFS_LSEG_ROC, /* roc bit received from server */ 38 NFS_LSEG_ROC, /* roc bit received from server */
39 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
39}; 40};
40 41
41struct pnfs_layout_segment { 42struct pnfs_layout_segment {
42 struct list_head pls_list; 43 struct list_head pls_list;
44 struct list_head pls_lc_list;
43 struct pnfs_layout_range pls_range; 45 struct pnfs_layout_range pls_range;
44 atomic_t pls_refcount; 46 atomic_t pls_refcount;
45 unsigned long pls_flags; 47 unsigned long pls_flags;
46 struct pnfs_layout_hdr *pls_layout; 48 struct pnfs_layout_hdr *pls_layout;
47 struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
48 loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
49}; 49};
50 50
51enum pnfs_try_status { 51enum pnfs_try_status {
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {
80 struct module *owner; 80 struct module *owner;
81 unsigned flags; 81 unsigned flags;
82 82
83 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
84 int (*clear_layoutdriver) (struct nfs_server *);
85
83 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); 86 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 void (*free_layout_hdr) (struct pnfs_layout_hdr *); 87 void (*free_layout_hdr) (struct pnfs_layout_hdr *);
85 88
@@ -110,6 +113,8 @@ struct pnfs_layoutdriver_type {
110 struct xdr_stream *xdr, 113 struct xdr_stream *xdr,
111 const struct nfs4_layoutreturn_args *args); 114 const struct nfs4_layoutreturn_args *args);
112 115
116 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
117
113 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, 118 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
114 struct xdr_stream *xdr, 119 struct xdr_stream *xdr,
115 const struct nfs4_layoutcommit_args *args); 120 const struct nfs4_layoutcommit_args *args);
@@ -125,6 +130,8 @@ struct pnfs_layout_hdr {
125 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ 130 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
126 u32 plh_barrier; /* ignore lower seqids */ 131 u32 plh_barrier; /* ignore lower seqids */
127 unsigned long plh_flags; 132 unsigned long plh_flags;
133 loff_t plh_lwb; /* last write byte for layoutcommit */
134 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
128 struct inode *plh_inode; 135 struct inode *plh_inode;
129}; 136};
130 137
@@ -137,10 +144,21 @@ struct pnfs_device {
137 unsigned int pglen; 144 unsigned int pglen;
138}; 145};
139 146
/* Everything in this block is in the right (post-merge) column only. */
147#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 /* number of slots in pnfs_devicelist.dev_id[] */
148
/*
 * Fixed-capacity list of device ids; it is the output argument type of
 * nfs4_proc_getdevicelist() declared further down in this header.
 */
149struct pnfs_devicelist {
150 unsigned int eof; /* presumably the server's end-of-list indication -- TODO confirm */
151 unsigned int num_devs; /* presumably how many dev_id[] entries are filled -- TODO confirm */
152 struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
153};
154
/* Layout driver registration/unregistration: identical in both diff columns. */
140extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 155extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
141extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 156extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
142 157
143/* nfs4proc.c */ 158/* nfs4proc.c */
/*
 * nfs4_proc_getdevicelist() is present in the right (post-merge) column only.
 * It takes the server, a file handle and a caller-supplied pnfs_devicelist.
 */
159extern int nfs4_proc_getdevicelist(struct nfs_server *server,
160 const struct nfs_fh *fh,
161 struct pnfs_devicelist *devlist);
144extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 162extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
145 struct pnfs_device *dev); 163 struct pnfs_device *dev);
146extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 164extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
@@ -153,7 +171,7 @@ void put_lseg(struct pnfs_layout_segment *lseg);
/* Page I/O descriptor initialisers: identical in both diff columns. */
153bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); 171bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
154bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); 172bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
155 173
/*
 * set_pnfs_layoutdriver() changes signature in this merge: the right column
 * inserts a const struct nfs_fh * argument between the server and the u32
 * (the u32 is named "id" in the left column).
 */
156void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 174void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
157void unset_pnfs_layoutdriver(struct nfs_server *); 175void unset_pnfs_layoutdriver(struct nfs_server *);
158void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); 176void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
159int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); 177int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -179,6 +197,7 @@ void pnfs_roc_release(struct inode *ino);
/* Return-on-close and layoutcommit entry points. */
179void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 197void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
180bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 198bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
181void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 199void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
/* pnfs_cleanup_layoutcommit() is present in the right (post-merge) column only. */
200void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
182int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 201int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
183int _pnfs_return_layout(struct inode *); 202int _pnfs_return_layout(struct inode *);
184int pnfs_ld_write_done(struct nfs_write_data *); 203int pnfs_ld_write_done(struct nfs_write_data *);
@@ -360,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)
360 return false; 379 return false;
361} 380}
362 381
363static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 382static inline void set_pnfs_layoutdriver(struct nfs_server *s,
383 const struct nfs_fh *mntfh, u32 id);
364{ 384{
365} 385}
366 386