Merge branch 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

* 'nfs-for-3.1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (28 commits) pnfsblock: write_pagelist handle zero invalid extents pnfsblock: note written INVAL areas for layoutcommit pnfsblock: bl_write_pagelist pnfsblock: bl_read_pagelist pnfsblock: cleanup_layoutcommit pnfsblock: encode_layoutcommit pnfsblock: merge rw extents pnfsblock: add extent manipulation functions pnfsblock: bl_find_get_extent pnfsblock: xdr decode pnfs_block_layout4 pnfsblock: call and parse getdevicelist pnfsblock: merge extents pnfsblock: lseg alloc and free pnfsblock: remove device operations pnfsblock: add device operations pnfsblock: basic extent code pnfsblock: use pageio_ops api pnfsblock: add blocklayout Kconfig option, Makefile, and stubs pnfs: cleanup_layoutcommit pnfs: ask for layout_blksize and save it in nfs_server ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2011-07-31 12:26:50 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-07-31 12:26:50 -0400
commit: 24c3047095fa3954f114bfff2e37b8fcbb216396 (patch)
tree: a2263a4425d511ae619ca8b055705261dab9ec12 /fs
parent: 6581058f44533f9d45548bcfe986c125376859e9 (diff)
parent: 71cdd40fd498f12679070def668f6a4719ddbd1c (diff)
16 files changed, 3090 insertions, 87 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2cde5d95475..be020771c6b 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -79,15 +79,21 @@ config NFS_V4_1
        depends on NFS_FS && NFS_V4 && EXPERIMENTAL
        select SUNRPC_BACKCHANNEL
        select PNFS_FILE_LAYOUT
+        select PNFS_BLOCK
+        select MD
+        select BLK_DEV_DM
        help
          This option enables support for minor version 1 of the NFSv4 protocol
-          (RFC 5661) in the kernel's NFS client.
+          (RFC 5661 and RFC 5663) in the kernel's NFS client.
          If unsure, say N.
 config PNFS_FILE_LAYOUT
        tristate
+config PNFS_BLOCK
+        tristate
 config PNFS_OBJLAYOUT
        tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
        depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6a34f7dd0e6..b58613d0abb 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
 obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 00000000000..d5815505c02
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 00000000000..e56564d2ef9
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1019 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/bio.h>          /* struct bio */
+#include <linux/buffer_head.h>  /* various write calls */
+#include "blocklayout.h"
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+struct dentry *bl_device_pipe;
+wait_queue_head_t bl_wq;
+/* Debug aid: log the address of @page and the current value of a fixed set
+ * of its page flags via dprintk(), one flag per line.
+ */
+static void print_page(struct page *page)
+{
+        dprintk("PRINTPAGE page %p\n", page);
+        dprintk("       PagePrivate %d\n", PagePrivate(page));
+        dprintk("       PageUptodate %d\n", PageUptodate(page));
+        dprintk("       PageError %d\n", PageError(page));
+        dprintk("       PageDirty %d\n", PageDirty(page));
+        dprintk("       PageReferenced %d\n", PageReferenced(page));
+        dprintk("       PageLocked %d\n", PageLocked(page));
+        dprintk("       PageWriteback %d\n", PageWriteback(page));
+        dprintk("       PageMappedToDisk %d\n", PageMappedToDisk(page));
+        dprintk("\n");
+}
+/* Decide whether the page at isect, which lies inside extent be, has no
+ * usable on-disk data and must therefore be initialized by the caller.
+ * Returns nonzero for a hole and 0 otherwise.
+ */
+static int is_hole(struct pnfs_block_extent *be, sector_t isect)
+{
+        switch (be->be_state) {
+        case PNFS_BLOCK_NONE_DATA:
+                /* The extent carries no data at all */
+                return 1;
+        case PNFS_BLOCK_INVALID_DATA:
+                /* Still a hole unless the sector was marked initialized */
+                return !bl_is_sector_init(be->be_inval, isect);
+        default:
+                return 0;
+        }
+}
+/* Report whether page data may be written to disk through extent be.
+ * Only the extent state is consulted; isect is accepted but not used.
+ * Returns nonzero for READWRITE_DATA and INVALID_DATA extents.
+ */
+static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+{
+        if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
+                return 1;
+        return be->be_state == PNFS_BLOCK_INVALID_DATA;
+}
+/* The data we are handed might be spread across several bios.  We need
+ * to track when the last one is finished.  One reference is held per
+ * submitted bio plus one by the submitter; destroy_parallel() runs when
+ * the last reference is dropped.
+ */
+struct parallel_io {
+        struct kref refcnt;     /* released through destroy_parallel() */
+        struct rpc_call_ops call_ops;   /* copy of mds_ops, rpc_call_done stubbed */
+        void (*pnfs_callback) (void *data);     /* called by destroy_parallel() */
+        void *data;     /* argument passed to pnfs_callback */
+};
+/* Allocate a parallel_io tracker holding one initial reference and
+ * remember data as the completion callback argument.  Returns NULL on
+ * allocation failure.  call_ops and pnfs_callback are left unset and
+ * must be filled in by the caller.
+ */
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+        struct parallel_io *par = kmalloc(sizeof(*par), GFP_NOFS);
+        if (!par)
+                return NULL;
+        par->data = data;
+        kref_init(&par->refcnt);
+        return par;
+}
+/* Take an additional reference on p. */
+static inline void get_parallel(struct parallel_io *p)
+{
+        kref_get(&p->refcnt);
+}
+/* kref release function for struct parallel_io: run the completion
+ * callback with the stored data pointer, then free the tracker.
+ */
+static void destroy_parallel(struct kref *kref)
+{
+        struct parallel_io *par;
+        dprintk("%s enter\n", __func__);
+        par = container_of(kref, struct parallel_io, refcnt);
+        par->pnfs_callback(par->data);
+        kfree(par);
+}
+/* Drop a reference on p; the last put runs destroy_parallel(). */
+static inline void put_parallel(struct parallel_io *p)
+{
+        kref_put(&p->refcnt, destroy_parallel);
+}
+/* Submit bio (if there is one) for rw I/O, taking a parallel_io reference
+ * on behalf of the bio first.  Always returns NULL so that callers can
+ * write "bio = bl_submit_bio(rw, bio)" to reset their pointer.
+ *
+ * The pagelist functions call this unconditionally on their exit paths,
+ * where bio may still hold the ERR_PTR returned by a failed
+ * bl_add_page_to_bio().  Treat an error pointer like NULL instead of
+ * dereferencing it.
+ */
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+        if (IS_ERR_OR_NULL(bio))
+                return NULL;
+        get_parallel(bio->bi_private);
+        dprintk("%s submitting %s bio %u@%llu\n", __func__,
+                rw == READ ? "read" : "write",
+                bio->bi_size, (unsigned long long)bio->bi_sector);
+        submit_bio(rw, bio);
+        return NULL;
+}
+/* Allocate a bio with room for up to npg pages and aim it at the device
+ * sector that backs file sector isect within extent be.  end_io and par
+ * become the bio's completion handler and private data.  Returns NULL if
+ * the allocation fails.
+ */
+static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
+                                     struct pnfs_block_extent *be,
+                                     void (*end_io)(struct bio *, int err),
+                                     struct parallel_io *par)
+{
+        struct bio *bio;
+        /* A single bio cannot hold more than BIO_MAX_PAGES pages; asking
+         * for more makes bio_alloc() fail.  bl_add_page_to_bio() submits
+         * a full bio and starts a new one, so clamping is sufficient.
+         */
+        npg = min(npg, BIO_MAX_PAGES);
+        bio = bio_alloc(GFP_NOIO, npg);
+        if (!bio)
+                return NULL;
+        /* Translate the file sector into a sector on the backing device */
+        bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+        bio->bi_bdev = be->be_mdev;
+        bio->bi_end_io = end_io;
+        bio->bi_private = par;
+        return bio;
+}
+/* Append page to bio, allocating a bio first when none is passed in.
+ * If the current bio cannot take a whole page it is submitted and a new
+ * one is started.  Returns the bio now holding the page, or
+ * ERR_PTR(-ENOMEM) when a bio cannot be allocated.
+ */
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+                                      sector_t isect, struct page *page,
+                                      struct pnfs_block_extent *be,
+                                      void (*end_io)(struct bio *, int err),
+                                      struct parallel_io *par)
+{
+        for (;;) {
+                if (!bio) {
+                        bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+                        if (!bio)
+                                return ERR_PTR(-ENOMEM);
+                }
+                if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) >=
+                    PAGE_CACHE_SIZE)
+                        return bio;
+                /* No room for the page: send this bio off and try again */
+                bio = bl_submit_bio(rw, bio);
+        }
+}
+/* Record an I/O failure on the layout owning lseg by setting the fail
+ * bit matching the segment's iomode: IOMODE_RW for read/write segments,
+ * IOMODE_READ for everything else.
+ */
+static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+        if (lseg->pls_range.iomode != IOMODE_RW) {
+                dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+                set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+                return;
+        }
+        dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+        set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+}
+/* Completion handler for read bios (modelled on mpage_end_io_read).
+ * Walks the bio's pages from last to first and marks each one uptodate
+ * when the bio succeeded.  On failure it records -EIO in the read data
+ * (unless an error is already set) and flags the layout as failed.
+ * Finally drops the bio and the bio's parallel_io reference.
+ */
+static void bl_end_io_read(struct bio *bio, int err)
+{
+        struct parallel_io *par = bio->bi_private;
+        struct nfs_read_data *rdata = par->data;
+        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+        struct bio_vec *first = bio->bi_io_vec;
+        struct bio_vec *cur = first + bio->bi_vcnt - 1;
+        for (;;) {
+                struct page *page = cur->bv_page;
+                cur--;
+                /* Warm the cache for the page handled next, if any */
+                if (cur >= first)
+                        prefetchw(&cur->bv_page->flags);
+                if (uptodate)
+                        SetPageUptodate(page);
+                if (cur < first)
+                        break;
+        }
+        if (!uptodate) {
+                if (!rdata->pnfs_error)
+                        rdata->pnfs_error = -EIO;
+                bl_set_lo_fail(rdata->lseg);
+        }
+        bio_put(bio);
+        put_parallel(par);
+}
+/* Work item run after all bios of a read have completed: recover the
+ * nfs_read_data embedding the work struct and hand it to
+ * pnfs_ld_read_done().
+ */
+static void bl_read_cleanup(struct work_struct *work)
+{
+        struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+        struct nfs_read_data *rdata =
+                container_of(task, struct nfs_read_data, task);
+        dprintk("%s enter\n", __func__);
+        pnfs_ld_read_done(rdata);
+}
+/* parallel_io callback for reads: called when the last reference is
+ * dropped; defers the final processing to bl_read_cleanup() on a
+ * workqueue.
+ */
+static void
+bl_end_par_io_read(void *data)
+{
+        struct nfs_read_data *rdata = data;
+        struct work_struct *cleanup;
+        INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
+        cleanup = &rdata->task.u.tk_work;
+        schedule_work(cleanup);
+}
+/* Stand-in for the normal .rpc_call_done callback, which must not run
+ * for block layout I/O.  Deliberately empty.
+ */
+static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
+{
+}
+/* Read the pages described by rdata directly from the block device(s).
+ *
+ * Each page is looked up in the layout's extents: holes without a
+ * copy-on-write source are zero-filled in memory, everything else is
+ * queued on bios that complete through bl_end_io_read().
+ *
+ * Returns PNFS_NOT_ATTEMPTED only when the parallel_io tracker cannot be
+ * allocated.  Otherwise returns PNFS_ATTEMPTED; failures are reported
+ * through rdata->pnfs_error and completion is signalled when the last
+ * parallel_io reference is dropped.
+ */
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_read_data *rdata)
+{
+        int i, hole;
+        struct bio *bio = NULL;
+        struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+        sector_t isect, extent_length = 0;
+        struct parallel_io *par;
+        loff_t f_offset = rdata->args.offset;
+        size_t count = rdata->args.count;
+        struct page **pages = rdata->args.pages;
+        int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+        dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
+               rdata->npages, f_offset, count);
+        par = alloc_parallel(rdata);
+        if (!par)
+                goto use_mds;
+        par->call_ops = *rdata->mds_ops;
+        par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+        par->pnfs_callback = bl_end_par_io_read;
+        /* At this point, we can no longer jump to use_mds */
+        isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+        /* Code assumes extents are page-aligned */
+        for (i = pg_index; i < rdata->npages; i++) {
+                if (!extent_length) {
+                        /* We've used up the previous extent */
+                        bl_put_extent(be);
+                        bl_put_extent(cow_read);
+                        bio = bl_submit_bio(READ, bio);
+                        /* Get the next one */
+                        /* NOTE(review): assumes bl_find_get_extent()
+                         * always rewrites cow_read, otherwise the stale
+                         * pointer is put again at out: - confirm.
+                         */
+                        be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
+                                             isect, &cow_read);
+                        if (!be) {
+                                rdata->pnfs_error = -EIO;
+                                goto out;
+                        }
+                        extent_length = be->be_length -
+                                (isect - be->be_f_offset);
+                        if (cow_read) {
+                                sector_t cow_length = cow_read->be_length -
+                                        (isect - cow_read->be_f_offset);
+                                extent_length = min(extent_length, cow_length);
+                        }
+                }
+                hole = is_hole(be, isect);
+                if (hole && !cow_read) {
+                        bio = bl_submit_bio(READ, bio);
+                        /* Fill hole w/ zeroes w/o accessing device */
+                        dprintk("%s Zeroing page for hole\n", __func__);
+                        zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+                        print_page(pages[i]);
+                        SetPageUptodate(pages[i]);
+                } else {
+                        struct pnfs_block_extent *be_read;
+                        /* A hole with a COW source is read from that source */
+                        be_read = (hole && cow_read) ? cow_read : be;
+                        bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
+                                                 isect, pages[i], be_read,
+                                                 bl_end_io_read, par);
+                        if (IS_ERR(bio)) {
+                                rdata->pnfs_error = PTR_ERR(bio);
+                                /* out: submits bio unconditionally; do not
+                                 * hand it an error pointer.
+                                 */
+                                bio = NULL;
+                                goto out;
+                        }
+                }
+                isect += PAGE_CACHE_SECTORS;
+                extent_length -= PAGE_CACHE_SECTORS;
+        }
+        if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
+                rdata->res.eof = 1;
+                rdata->res.count = rdata->inode->i_size - f_offset;
+        } else {
+                rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+        }
+out:
+        bl_put_extent(be);
+        bl_put_extent(cow_read);
+        bl_submit_bio(READ, bio);
+        /* Drop the submitter's reference; completion fires on the last put */
+        put_parallel(par);
+        return PNFS_ATTEMPTED;
+ use_mds:
+        dprintk("Giving up and using normal NFS\n");
+        return PNFS_NOT_ATTEMPTED;
+}
+/* Walk the extents covering the page-aligned byte range that contains
+ * [offset, offset + count) and mark the parts lying in INVALID_DATA
+ * extents for inclusion in the next LAYOUTCOMMIT.  Does nothing when
+ * count is 0.
+ */
+static void mark_extents_written(struct pnfs_block_layout *bl,
+                                 __u64 offset, __u32 count)
+{
+        sector_t isect, end;
+        u64 end_bytes;
+        struct pnfs_block_extent *be;
+        dprintk("%s(%llu, %u)\n", __func__, offset, count);
+        if (count == 0)
+                return;
+        isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
+        /* Round up to a page boundary in 64 bits and convert to sectors
+         * before narrowing to sector_t, as is done for isect above.
+         * Storing the byte offset in a 32-bit sector_t first would
+         * truncate it.
+         */
+        end_bytes = (offset + count + PAGE_CACHE_SIZE - 1) &
+                (long)(PAGE_CACHE_MASK);
+        end = end_bytes >> SECTOR_SHIFT;
+        while (isect < end) {
+                sector_t len;
+                be = bl_find_get_extent(bl, isect, NULL);
+                BUG_ON(!be); /* FIXME */
+                /* Stop at the end of this extent or of the range */
+                len = min(end, be->be_f_offset + be->be_length) - isect;
+                if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+                        bl_mark_for_commit(be, isect, len); /* What if fails? */
+                isect += len;
+                bl_put_extent(be);
+        }
+}
+/* Completion handler for bios that write out the zero/COW pages added by
+ * bl_write_pagelist.  Ends writeback on every page of the bio and drops
+ * the page reference taken when the page was added.  On failure records
+ * -EIO in the write data (unless an error is already set) and flags the
+ * layout as failed.  Finally drops the bio and its parallel_io reference.
+ */
+static void bl_end_io_write_zero(struct bio *bio, int err)
+{
+        struct parallel_io *par = bio->bi_private;
+        struct nfs_write_data *wdata = par->data;
+        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+        struct bio_vec *first = bio->bi_io_vec;
+        struct bio_vec *cur = first + bio->bi_vcnt - 1;
+        for (;;) {
+                struct page *page = cur->bv_page;
+                cur--;
+                /* Warm the cache for the page handled next, if any */
+                if (cur >= first)
+                        prefetchw(&cur->bv_page->flags);
+                /* This is the zeroing page we added */
+                end_page_writeback(page);
+                page_cache_release(page);
+                if (cur < first)
+                        break;
+        }
+        if (!uptodate) {
+                if (!wdata->pnfs_error)
+                        wdata->pnfs_error = -EIO;
+                bl_set_lo_fail(wdata->lseg);
+        }
+        bio_put(bio);
+        put_parallel(par);
+}
+/* Completion handler for bios carrying the caller's data pages.  On
+ * failure records -EIO in the write data (unless an error is already
+ * set) and flags the layout as failed.  Drops the bio and its
+ * parallel_io reference; the pages themselves are not touched here.
+ */
+static void bl_end_io_write(struct bio *bio, int err)
+{
+        struct parallel_io *par = bio->bi_private;
+        struct nfs_write_data *wdata = par->data;
+        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+                if (!wdata->pnfs_error)
+                        wdata->pnfs_error = -EIO;
+                bl_set_lo_fail(wdata->lseg);
+        }
+        bio_put(bio);
+        put_parallel(par);
+}
+/* Work item scheduled by bl_end_par_io_write.  When the write finished
+ * without error it marks the written range for LAYOUTCOMMIT, then
+ * reports completion through pnfs_ld_write_done().
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+        struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+        struct nfs_write_data *wdata =
+                container_of(task, struct nfs_write_data, task);
+        dprintk("%s enter\n", __func__);
+        /* Marks for LAYOUTCOMMIT */
+        if (!wdata->pnfs_error)
+                mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+                                     wdata->args.offset, wdata->args.count);
+        pnfs_ld_write_done(wdata);
+}
+/* parallel_io callback for writes: runs once the last bio of a
+ * bl_write_pagelist call has finished.  Sets the task status to 0 and
+ * the verifier to NFS_FILE_SYNC, then defers the remaining work to
+ * bl_write_cleanup() on a workqueue.
+ */
+static void bl_end_par_io_write(void *data)
+{
+        struct nfs_write_data *wdata = data;
+        struct work_struct *cleanup;
+        wdata->verf.committed = NFS_FILE_SYNC;
+        wdata->task.tk_status = 0;
+        INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
+        cleanup = &wdata->task.u.tk_work;
+        schedule_work(cleanup);
+}
+/* FIXME STUB - should mark the intersection of layout and page as bad so
+ * that it is not used again.  Currently does nothing.
+ */
+static void mark_bad_read(void)
+{
+}
+/*
+ * map_block:  map a requested I/O block (isect) into an offset in the LVM
+ * block_device.  Marks bh as mapped, points it at the extent's device and
+ * converts the device sector into units of that device's block size.
+ */
+static void
+map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
+{
+        sector_t dev_sect;
+        dprintk("%s enter be=%p\n", __func__, be);
+        set_buffer_mapped(bh);
+        bh->b_bdev = be->be_mdev;
+        /* Sector on the backing device for this file sector */
+        dev_sect = isect - be->be_f_offset + be->be_v_offset;
+        bh->b_blocknr = dev_sect >>
+            (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
+        dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
+                __func__, (unsigned long long)isect, (long)bh->b_blocknr,
+                bh->b_size);
+}
+/* Prepare a not-yet-uptodate page for writing; the page is locked by the
+ * caller.  Without a COW source the page is zeroed; with one, its content
+ * is read from cow_read through a temporary buffer head.
+ *
+ * Drops one reference on cow_read on every path.  Returns 0 on success,
+ * -ENOMEM if buffers cannot be allocated, or the error from
+ * bh_submit_read().
+ */
+static int
+init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
+{
+        struct buffer_head *bh = NULL;
+        sector_t isect;
+        int ret = 0;
+        dprintk("%s enter, %p\n", __func__, page);
+        BUG_ON(PageUptodate(page));
+        if (cow_read) {
+                bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+                if (!bh) {
+                        ret = -ENOMEM;
+                } else {
+                        isect = (sector_t) page->index <<
+                                PAGE_CACHE_SECTOR_SHIFT;
+                        map_block(bh, isect, cow_read);
+                        if (!bh_uptodate_or_lock(bh))
+                                ret = bh_submit_read(bh);
+                        if (!ret)
+                                SetPageUptodate(page);
+                }
+        } else {
+                /* Nothing to copy from: start with an all-zero page */
+                zero_user_segment(page, 0, PAGE_SIZE);
+                SetPageUptodate(page);
+        }
+        bl_put_extent(cow_read);
+        if (bh)
+                free_buffer_head(bh);
+        if (ret) {
+                /* Need to mark layout with bad read...should now
+                 * just use nfs4 for reads and writes.
+                 */
+                mark_bad_read();
+        }
+        return ret;
+}
+/* Write the pages described by wdata directly to the block device(s).
+ *
+ * Three phases:
+ *   1. If the first page lies in an INVALID_DATA extent, write out the
+ *      pages between the start of its block and the request
+ *      (zeroed or read from the COW source).
+ *   2. Queue the caller's pages, switching extents as they are used up.
+ *   3. If the last page lies in an INVALID_DATA extent, fill the rest of
+ *      its block by re-entering the phase 1 loop (fill_invalid_ext).
+ *
+ * Returns PNFS_NOT_ATTEMPTED only when the parallel_io tracker cannot be
+ * allocated.  Otherwise returns PNFS_ATTEMPTED; failures are reported
+ * through wdata->pnfs_error and completion is signalled when the last
+ * parallel_io reference is dropped.  The sync argument is not used.
+ */
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_write_data *wdata, int sync)
+{
+        int i, ret, npg_zero, pg_index, last = 0;
+        struct bio *bio = NULL;
+        struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+        sector_t isect, last_isect = 0, extent_length = 0;
+        struct parallel_io *par;
+        loff_t offset = wdata->args.offset;
+        size_t count = wdata->args.count;
+        struct page **pages = wdata->args.pages;
+        struct page *page;
+        pgoff_t index;
+        u64 temp;
+        int npg_per_block =
+            NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+        dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+        /* At this point, wdata->pages is a (sequential) list of nfs_pages.
+         * We want to write each, and if there is an error set pnfs_error
+         * to have it redone using nfs.
+         */
+        par = alloc_parallel(wdata);
+        if (!par)
+                return PNFS_NOT_ATTEMPTED;
+        par->call_ops = *wdata->mds_ops;
+        par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+        par->pnfs_callback = bl_end_par_io_write;
+        /* At this point, have to be more careful with error handling */
+        isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+        be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
+        if (!be || !is_writable(be, isect)) {
+                dprintk("%s no matching extents!\n", __func__);
+                wdata->pnfs_error = -EINVAL;
+                goto out;
+        }
+        /* First page inside INVALID extent */
+        if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+                /* Pages between the start of the block and the request */
+                temp = offset >> PAGE_CACHE_SHIFT;
+                npg_zero = do_div(temp, npg_per_block);
+                isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
+                                     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+                extent_length = be->be_length - (isect - be->be_f_offset);
+fill_invalid_ext:
+                dprintk("%s need to zero %d pages\n", __func__, npg_zero);
+                for (;npg_zero > 0; npg_zero--) {
+                        /* page ref released in bl_end_io_write_zero */
+                        index = isect >> PAGE_CACHE_SECTOR_SHIFT;
+                        dprintk("%s zero %dth page: index %lu isect %llu\n",
+                                __func__, npg_zero, index,
+                                (unsigned long long)isect);
+                        page =
+                            find_or_create_page(wdata->inode->i_mapping, index,
+                                                GFP_NOFS);
+                        if (!page) {
+                                dprintk("%s oom\n", __func__);
+                                wdata->pnfs_error = -ENOMEM;
+                                goto out;
+                        }
+                        /* PageDirty: Other will write this out
+                         * PageWriteback: Other is writing this out
+                         * PageUptodate: It was read before
+                         * sector_initialized: already written out
+                         */
+                        if (PageDirty(page) || PageWriteback(page) ||
+                            bl_is_sector_init(be->be_inval, isect)) {
+                                print_page(page);
+                                unlock_page(page);
+                                page_cache_release(page);
+                                goto next_page;
+                        }
+                        if (!PageUptodate(page)) {
+                                /* New page, readin or zero it */
+                                /* NOTE(review): the return value is
+                                 * ignored, and init_page_for_write() puts
+                                 * cow_read on every call - confirm the
+                                 * reference counting when several pages
+                                 * are initialized.
+                                 */
+                                init_page_for_write(page, cow_read);
+                        }
+                        set_page_writeback(page);
+                        unlock_page(page);
+                        ret = bl_mark_sectors_init(be->be_inval, isect,
+                                                       PAGE_CACHE_SECTORS,
+                                                       NULL);
+                        if (unlikely(ret)) {
+                                dprintk("%s bl_mark_sectors_init fail %d\n",
+                                        __func__, ret);
+                                end_page_writeback(page);
+                                page_cache_release(page);
+                                wdata->pnfs_error = ret;
+                                goto out;
+                        }
+                        bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
+                                                 isect, page, be,
+                                                 bl_end_io_write_zero, par);
+                        if (IS_ERR(bio)) {
+                                wdata->pnfs_error = PTR_ERR(bio);
+                                /* The page never made it into a bio, so
+                                 * bl_end_io_write_zero will not run for
+                                 * it: end writeback and drop our
+                                 * reference, as the bl_mark_sectors_init
+                                 * failure path above does.
+                                 */
+                                end_page_writeback(page);
+                                page_cache_release(page);
+                                /* out: submits bio unconditionally; do not
+                                 * hand it an error pointer.
+                                 */
+                                bio = NULL;
+                                goto out;
+                        }
+                        /* FIXME: This should be done in bi_end_io */
+                        mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+                                             page->index << PAGE_CACHE_SHIFT,
+                                             PAGE_CACHE_SIZE);
+next_page:
+                        isect += PAGE_CACHE_SECTORS;
+                        extent_length -= PAGE_CACHE_SECTORS;
+                }
+                if (last)
+                        goto write_done;
+        }
+        bio = bl_submit_bio(WRITE, bio);
+        /* Middle pages */
+        pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+        for (i = pg_index; i < wdata->npages; i++) {
+                if (!extent_length) {
+                        /* We've used up the previous extent */
+                        bl_put_extent(be);
+                        bio = bl_submit_bio(WRITE, bio);
+                        /* Get the next one */
+                        be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
+                                             isect, NULL);
+                        if (!be || !is_writable(be, isect)) {
+                                wdata->pnfs_error = -EINVAL;
+                                goto out;
+                        }
+                        extent_length = be->be_length -
+                            (isect - be->be_f_offset);
+                }
+                if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+                        ret = bl_mark_sectors_init(be->be_inval, isect,
+                                                       PAGE_CACHE_SECTORS,
+                                                       NULL);
+                        if (unlikely(ret)) {
+                                dprintk("%s bl_mark_sectors_init fail %d\n",
+                                        __func__, ret);
+                                wdata->pnfs_error = ret;
+                                goto out;
+                        }
+                }
+                bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
+                                         isect, pages[i], be,
+                                         bl_end_io_write, par);
+                if (IS_ERR(bio)) {
+                        wdata->pnfs_error = PTR_ERR(bio);
+                        /* out: submits bio unconditionally; do not hand
+                         * it an error pointer.
+                         */
+                        bio = NULL;
+                        goto out;
+                }
+                isect += PAGE_CACHE_SECTORS;
+                last_isect = isect;
+                extent_length -= PAGE_CACHE_SECTORS;
+        }
+        /* Last page inside INVALID extent */
+        if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+                bio = bl_submit_bio(WRITE, bio);
+                /* Pages left between the end of the request and the end
+                 * of its block.
+                 */
+                temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
+                npg_zero = npg_per_block - do_div(temp, npg_per_block);
+                if (npg_zero < npg_per_block) {
+                        last = 1;
+                        goto fill_invalid_ext;
+                }
+        }
+write_done:
+        wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
+        if (count < wdata->res.count) {
+                wdata->res.count = count;
+        }
+out:
+        bl_put_extent(be);
+        bl_submit_bio(WRITE, bio);
+        /* Drop the submitter's reference; completion fires on the last put */
+        put_parallel(par);
+        return PNFS_ATTEMPTED;
+}
+/* Unlink every extent from all of the layout's extent lists and drop the
+ * list's reference on each, holding bl_ext_lock throughout.
+ * FIXME - range ignored
+ */
+static void
+release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
+{
+        struct pnfs_block_extent *be;
+        struct list_head *head;
+        int i;
+        spin_lock(&bl->bl_ext_lock);
+        for (i = 0; i < EXTENT_LISTS; i++) {
+                head = &bl->bl_extents[i];
+                while (!list_empty(head)) {
+                        be = list_first_entry(head, struct pnfs_block_extent,
+                                              be_node);
+                        list_del(&be->be_node);
+                        bl_put_extent(be);
+                }
+        }
+        spin_unlock(&bl->bl_ext_lock);
+}
+/* Free every tracking entry linked on the marks' stub list. */
+static void
+release_inval_marks(struct pnfs_inval_markings *marks)
+{
+        struct pnfs_inval_tracking *entry, *next;
+        list_for_each_entry_safe(entry, next, &marks->im_tree.mtt_stub,
+                                 it_link) {
+                list_del(&entry->it_link);
+                kfree(entry);
+        }
+}
+/* Tear down a block layout header: release all extents and invalid-area
+ * markings, then free the enclosing pnfs_block_layout.
+ */
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+        struct pnfs_block_layout *bl;
+        dprintk("%s enter\n", __func__);
+        bl = BLK_LO2EXT(lo);
+        release_extents(bl, NULL);
+        release_inval_marks(&bl->bl_inval);
+        kfree(bl);
+}
+/* Allocate and initialize a block layout for inode.  Returns the embedded
+ * generic layout header, or NULL if the allocation fails.  The layout's
+ * block size is the server's pnfs_blksize expressed in sectors.
+ */
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+                                                   gfp_t gfp_flags)
+{
+        struct pnfs_block_layout *bl;
+        int i;
+        dprintk("%s enter\n", __func__);
+        bl = kzalloc(sizeof(*bl), gfp_flags);
+        if (!bl)
+                return NULL;
+        spin_lock_init(&bl->bl_ext_lock);
+        /* Initialize every list that release_extents() will walk, rather
+         * than hard-coding the number of extent lists.
+         */
+        for (i = 0; i < EXTENT_LISTS; i++)
+                INIT_LIST_HEAD(&bl->bl_extents[i]);
+        INIT_LIST_HEAD(&bl->bl_commit);
+        INIT_LIST_HEAD(&bl->bl_committing);
+        bl->bl_count = 0;
+        bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
+        BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+        return &bl->bl_layout;
+}
+/* Free a layout segment allocated by bl_alloc_lseg().  Only the segment
+ * itself is freed here.
+ */
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+        dprintk("%s enter\n", __func__);
+        kfree(lseg);
+}
+/* Allocate a layout segment for a LAYOUTGET reply.  The segment itself
+ * stays nearly empty: the reply is processed into the layout-wide data
+ * by nfs4_blk_process_layoutget() so that extents can be merged
+ * correctly.  Returns the new segment or an ERR_PTR.
+ */
+static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+                                                 struct nfs4_layoutget_res *lgr,
+                                                 gfp_t gfp_flags)
+{
+        struct pnfs_layout_segment *lseg;
+        int status;
+        dprintk("%s enter\n", __func__);
+        lseg = kzalloc(sizeof(*lseg), gfp_flags);
+        if (!lseg)
+                return ERR_PTR(-ENOMEM);
+        status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
+        if (!status)
+                return lseg;
+        /* We don't want to call the full-blown bl_free_lseg,
+         * since on error extents were not touched.
+         */
+        kfree(lseg);
+        return ERR_PTR(status);
+}
+/* Layout driver hook: encode the block layout's pending update into the
+ * LAYOUTCOMMIT request being built in xdr.
+ */
+static void
+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
+                       const struct nfs4_layoutcommit_args *arg)
+{
+        struct pnfs_block_layout *bl;
+        dprintk("%s enter\n", __func__);
+        bl = BLK_LO2EXT(lo);
+        encode_pnfs_block_layoutupdate(bl, xdr, arg);
+}
+/*
+ * LAYOUTCOMMIT cleanup hook: look up the layout of the inode named in the
+ * commit arguments and pass it, together with the arguments and the result
+ * status, to clean_pnfs_block_layoutupdate().
+ */
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+        struct pnfs_block_layout *bl;
+        dprintk("%s enter\n", __func__);
+        bl = BLK_LO2EXT(NFS_I(lcdata->args.inode)->layout);
+        clean_pnfs_block_layoutupdate(bl, &lcdata->args, lcdata->res.status);
+}
+/*
+ * Free a block_mount_id together with every pnfs_block_dev linked on it.
+ * A NULL @mid is accepted and ignored.
+ *
+ * The device list is detached while holding bm_lock and the devices are
+ * released only after the lock has been dropped, so bl_free_block_dev() is
+ * never called from atomic context.
+ * NOTE(review): bl_free_block_dev() is expected to release the underlying
+ * block device, which may sleep - confirm against blocklayoutdm.c.
+ */
+static void free_blk_mountid(struct block_mount_id *mid)
+{
+        struct pnfs_block_dev *dev, *next;
+        LIST_HEAD(doomed);
+        if (!mid)
+                return;
+        spin_lock(&mid->bm_lock);
+        list_splice_init(&mid->bm_devlist, &doomed);
+        spin_unlock(&mid->bm_lock);
+        list_for_each_entry_safe(dev, next, &doomed, bm_node) {
+                list_del(&dev->bm_node);
+                bl_free_block_dev(dev);
+        }
+        kfree(mid);
+}
+/*
+ * Issue GETDEVICEINFO for @d_id and turn the reply into a pnfs_block_dev.
+ *
+ * This is mostly copied from the filelayout's get_device_info function.
+ * It seems much of this should be at the generic pnfs level.
+ *
+ * The reply buffer is sized from the session's maximum response size.
+ * @fh is currently unused.
+ *
+ * Returns the decoded device, or NULL on any failure.  Error pointers
+ * coming back from nfs4_blk_decode_device() are folded into NULL because
+ * the caller only tests for NULL.
+ */
+static struct pnfs_block_dev *
+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
+                        struct nfs4_deviceid *d_id)
+{
+        struct pnfs_device *dev;
+        struct pnfs_block_dev *rv = NULL;
+        u32 max_resp_sz;
+        int max_pages;
+        struct page **pages = NULL;
+        int i, rc;
+        /*
+         * Use the session max response size as the basis for setting
+         * GETDEVICEINFO's maxcount
+         */
+        max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+        max_pages = max_resp_sz >> PAGE_SHIFT;
+        dprintk("%s max_resp_sz %u max_pages %d\n",
+                __func__, max_resp_sz, max_pages);
+        dev = kmalloc(sizeof(*dev), GFP_NOFS);
+        if (!dev) {
+                dprintk("%s kmalloc failed\n", __func__);
+                return NULL;
+        }
+        /* zeroed so that the cleanup loop can tell which slots were filled */
+        pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+        if (pages == NULL) {
+                kfree(dev);
+                return NULL;
+        }
+        for (i = 0; i < max_pages; i++) {
+                pages[i] = alloc_page(GFP_NOFS);
+                if (!pages[i])
+                        goto out_free;
+        }
+        memcpy(&dev->dev_id, d_id, sizeof(*d_id));
+        dev->layout_type = LAYOUT_BLOCK_VOLUME;
+        dev->pages = pages;
+        dev->pgbase = 0;
+        dev->pglen = PAGE_SIZE * max_pages;
+        dev->mincount = 0;
+        dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
+        rc = nfs4_proc_getdeviceinfo(server, dev);
+        dprintk("%s getdevice info returns %d\n", __func__, rc);
+        if (rc)
+                goto out_free;
+        rv = nfs4_blk_decode_device(server, dev);
+        if (IS_ERR(rv))
+                rv = NULL;
+ out_free:
+        /* Pages are allocated in order, so the first empty slot marks the
+         * end of what has to be freed; __free_page() must not see NULL.
+         */
+        for (i = 0; i < max_pages && pages[i]; i++)
+                __free_page(pages[i]);
+        kfree(pages);
+        kfree(dev);
+        return rv;
+}
+/*
+ * Layout driver setup for one server: fetch the device list with
+ * GETDEVICELIST (repeating until the server reports eof), resolve every
+ * device id with nfs4_blk_get_deviceinfo() and collect the results on a
+ * freshly allocated block_mount_id, which is stored in
+ * server->pnfs_ld_data on success.
+ *
+ * Returns 0 on success, -EINVAL if the server did not supply a block size,
+ * -ENOMEM on allocation failure, -ENODEV if a device cannot be resolved,
+ * or the status returned by nfs4_proc_getdevicelist().
+ */
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+        struct block_mount_id *b_mt_id = NULL;
+        struct pnfs_devicelist *dlist = NULL;
+        struct pnfs_block_dev *bdev;
+        LIST_HEAD(block_disklist);
+        int status = 0, i;
+        dprintk("%s enter\n", __func__);
+        if (!server->pnfs_blksize) {
+                dprintk("%s Server did not return blksize\n", __func__);
+                return -EINVAL;
+        }
+        b_mt_id = kzalloc(sizeof(*b_mt_id), GFP_NOFS);
+        if (b_mt_id == NULL) {
+                status = -ENOMEM;
+                goto out_error;
+        }
+        /* Initialize nfs4 block layout mount id */
+        spin_lock_init(&b_mt_id->bm_lock);
+        INIT_LIST_HEAD(&b_mt_id->bm_devlist);
+        dlist = kmalloc(sizeof(*dlist), GFP_NOFS);
+        if (dlist == NULL) {
+                status = -ENOMEM;
+                goto out_error;
+        }
+        dlist->eof = 0;
+        do {
+                status = nfs4_proc_getdevicelist(server, fh, dlist);
+                if (status)
+                        goto out_error;
+                dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
+                        __func__, dlist->num_devs, dlist->eof);
+                for (i = 0; i < dlist->num_devs; i++) {
+                        bdev = nfs4_blk_get_deviceinfo(server, fh,
+                                                       &dlist->dev_id[i]);
+                        if (bdev == NULL) {
+                                status = -ENODEV;
+                                goto out_error;
+                        }
+                        spin_lock(&b_mt_id->bm_lock);
+                        list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
+                        spin_unlock(&b_mt_id->bm_lock);
+                }
+        } while (!dlist->eof);
+        dprintk("%s SUCCESS\n", __func__);
+        server->pnfs_ld_data = b_mt_id;
+        goto out_return;
+ out_error:
+        /* drops every device collected so far along with the mount id */
+        free_blk_mountid(b_mt_id);
+ out_return:
+        kfree(dlist);
+        return status;
+}
+static int
+bl_clear_layoutdriver(struct nfs_server *server)
+{
+        struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+        dprintk("%s enter\n", __func__);
+        free_blk_mountid(b_mt_id);
+        dprintk("%s RETURNS\n", __func__);
+        return 0;
+}
+/* Page I/O operations for reads; every hook is a generic pNFS helper. */
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+        .pg_init = pnfs_generic_pg_init_read,
+        .pg_test = pnfs_generic_pg_test,
+        .pg_doio = pnfs_generic_pg_readpages,
+};
+/* Page I/O operations for writes; every hook is a generic pNFS helper. */
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+        .pg_init = pnfs_generic_pg_init_write,
+        .pg_test = pnfs_generic_pg_test,
+        .pg_doio = pnfs_generic_pg_writepages,
+};
+/* Layout driver descriptor for LAYOUT_BLOCK_VOLUME; this is what module
+ * init registers with (and module exit unregisters from) the pNFS core.
+ */
+static struct pnfs_layoutdriver_type blocklayout_type = {
+        .id                             = LAYOUT_BLOCK_VOLUME,
+        .name                           = "LAYOUT_BLOCK_VOLUME",
+        .read_pagelist                  = bl_read_pagelist,
+        .write_pagelist                 = bl_write_pagelist,
+        .alloc_layout_hdr               = bl_alloc_layout_hdr,
+        .free_layout_hdr                = bl_free_layout_hdr,
+        .alloc_lseg                     = bl_alloc_lseg,
+        .free_lseg                      = bl_free_lseg,
+        .encode_layoutcommit            = bl_encode_layoutcommit,
+        .cleanup_layoutcommit           = bl_cleanup_layoutcommit,
+        .set_layoutdriver               = bl_set_layoutdriver,
+        .clear_layoutdriver             = bl_clear_layoutdriver,
+        .pg_read_ops                    = &bl_pg_read_ops,
+        .pg_write_ops                   = &bl_pg_write_ops,
+};
+/* Operations of the "blocklayout" rpc_pipefs pipe created at module init. */
+static const struct rpc_pipe_ops bl_upcall_ops = {
+        .upcall         = bl_pipe_upcall,
+        .downcall       = bl_pipe_downcall,
+        .destroy_msg    = bl_pipe_destroy_msg,
+};
+/*
+ * Module init: register the block layout driver, then create the
+ * "blocklayout" pipe under NFS_PIPE_DIRNAME in rpc_pipefs.
+ *
+ * The path reference obtained by vfs_path_lookup() is dropped as soon as
+ * the pipe has been created.  The rpc_pipefs mount pin taken with
+ * rpc_get_mount() is kept on success and released on every failure after
+ * it was taken.
+ *
+ * Returns 0 on success or a negative errno; on failure the layout driver
+ * is unregistered again.
+ */
+static int __init nfs4blocklayout_init(void)
+{
+        struct vfsmount *mnt;
+        struct path path;
+        int ret;
+        dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+        ret = pnfs_register_layoutdriver(&blocklayout_type);
+        if (ret)
+                goto out;
+        init_waitqueue_head(&bl_wq);
+        mnt = rpc_get_mount();
+        if (IS_ERR(mnt)) {
+                ret = PTR_ERR(mnt);
+                goto out_remove;
+        }
+        ret = vfs_path_lookup(mnt->mnt_root,
+                              mnt,
+                              NFS_PIPE_DIRNAME, 0, &path);
+        if (ret)
+                goto out_putrpc;
+        bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
+                                    &bl_upcall_ops, 0);
+        /* the looked-up directory is only needed to create the pipe */
+        path_put(&path);
+        if (IS_ERR(bl_device_pipe)) {
+                ret = PTR_ERR(bl_device_pipe);
+                goto out_putrpc;
+        }
+out:
+        return ret;
+out_putrpc:
+        rpc_put_mount();
+out_remove:
+        pnfs_unregister_layoutdriver(&blocklayout_type);
+        return ret;
+}
+/*
+ * Module exit: unregister the layout driver, remove the "blocklayout"
+ * pipe and release the rpc_pipefs mount pin that a successful module init
+ * left in place via rpc_get_mount().
+ */
+static void __exit nfs4blocklayout_exit(void)
+{
+        dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+               __func__);
+        pnfs_unregister_layoutdriver(&blocklayout_type);
+        rpc_unlink(bl_device_pipe);
+        rpc_put_mount();
+}
+MODULE_ALIAS("nfs-layouttype4-3");
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 00000000000..f27d827960a
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,207 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+#include <linux/device-mapper.h>
+#include <linux/nfs_fs.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include "../pnfs.h"
+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+struct block_mount_id {
+        spinlock_t                      bm_lock;    /* protects list */
+        struct list_head                bm_devlist; /* holds pnfs_block_dev */
+};
+struct pnfs_block_dev {
+        struct list_head                bm_node;
+        struct nfs4_deviceid            bm_mdevid;    /* associated devid */
+        struct block_device             *bm_mdev;     /* meta device itself */
+};
+enum exstate4 {
+        PNFS_BLOCK_READWRITE_DATA       = 0,
+        PNFS_BLOCK_READ_DATA            = 1,
+        PNFS_BLOCK_INVALID_DATA         = 2, /* mapped, but data is invalid */
+        PNFS_BLOCK_NONE_DATA            = 3  /* unmapped, it's a hole */
+};
+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+struct my_tree {
+        sector_t                mtt_step_size;  /* Internal sector alignment */
+        struct list_head        mtt_stub; /* Should be a radix tree */
+};
+struct pnfs_inval_markings {
+        spinlock_t      im_lock;
+        struct my_tree  im_tree;        /* Sectors that need LAYOUTCOMMIT */
+        sector_t        im_block_size;  /* Server blocksize in sectors */
+};
+struct pnfs_inval_tracking {
+        struct list_head it_link;
+        int              it_sector;
+        int              it_tags;
+};
+/* sector_t fields are all in 512-byte sectors */
+struct pnfs_block_extent {
+        struct kref     be_refcnt;
+        struct list_head be_node;       /* link into lseg list */
+        struct nfs4_deviceid be_devid;  /* FIXME: could use device cache instead */
+        struct block_device *be_mdev;
+        sector_t        be_f_offset;    /* the starting offset in the file */
+        sector_t        be_length;      /* the size of the extent */
+        sector_t        be_v_offset;    /* the starting offset in the volume */
+        enum exstate4   be_state;       /* the state of this extent */
+        struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+};
+/* Shortened extent used by LAYOUTCOMMIT */
+struct pnfs_block_short_extent {
+        struct list_head bse_node;
+        struct nfs4_deviceid bse_devid;
+        struct block_device *bse_mdev;
+        sector_t        bse_f_offset;   /* the starting offset in the file */
+        sector_t        bse_length;     /* the size of the extent */
+};
+/*
+ * Initialise a pnfs_inval_markings: set up its lock and (stub) tree list,
+ * record @blocksize, and use the smaller of PAGE_CACHE_SECTORS and
+ * @blocksize as the tree's step size.
+ */
+static inline void
+BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
+{
+        sector_t step = (sector_t)PAGE_CACHE_SECTORS;
+        if (blocksize < step)
+                step = blocksize;
+        spin_lock_init(&marks->im_lock);
+        INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+        marks->im_block_size = blocksize;
+        marks->im_tree.mtt_step_size = step;
+}
+/* Index into pnfs_block_layout.bl_extents[]; see bl_choose_list() */
+enum extentclass4 {
+        RW_EXTENT       = 0, /* READWRITE and INVAL */
+        RO_EXTENT       = 1, /* READ and NONE */
+        EXTENT_LISTS    = 2,
+};
+/* Map an extent state to the extent list class that holds it: READ and
+ * NONE extents go on the RO list, everything else on the RW list.
+ */
+static inline int bl_choose_list(enum exstate4 state)
+{
+        switch (state) {
+        case PNFS_BLOCK_READ_DATA:
+        case PNFS_BLOCK_NONE_DATA:
+                return RO_EXTENT;
+        default:
+                return RW_EXTENT;
+        }
+}
+struct pnfs_block_layout {
+        struct pnfs_layout_hdr bl_layout;
+        struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+        spinlock_t              bl_ext_lock;   /* Protects list manipulation */
+        struct list_head        bl_extents[EXTENT_LISTS]; /* R and RW extents */
+        struct list_head        bl_commit;      /* Needs layout commit */
+        struct list_head        bl_committing;  /* Layout committing */
+        unsigned int            bl_count;       /* entries in bl_commit */
+        sector_t                bl_blocksize;  /* Server blocksize in sectors */
+};
+/* Per-server block layout data (struct block_mount_id) for layout header @lo;
+ * the argument is parenthesised so that any pointer expression may be passed.
+ */
+#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER((lo)->plh_inode)->pnfs_ld_data))
+/* Return the pnfs_block_layout that embeds generic layout header @lo. */
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+        return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+/* Return the pnfs_block_layout that layout segment @lseg belongs to. */
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+        struct pnfs_layout_hdr *hdr = lseg->pls_layout;
+        return BLK_LO2EXT(hdr);
+}
+struct bl_dev_msg {
+        int status;
+        uint32_t major, minor;
+};
+struct bl_msg_hdr {
+        u8  type;
+        u16 totallen; /* length of entire message, including hdr itself */
+};
+extern struct dentry *bl_device_pipe;
+extern wait_queue_head_t bl_wq;
+#define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT                0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
+/* blocklayoutdev.c */
+ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+                       char __user *, size_t);
+ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
+struct block_device *nfs4_blkdev_get(dev_t dev);
+int nfs4_blkdev_put(struct block_device *bdev);
+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
+                                                struct pnfs_device *dev);
+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+                                struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+/* blocklayoutdm.c */
+void bl_free_block_dev(struct pnfs_block_dev *bdev);
+/* extents.c */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+                struct pnfs_block_extent **cow_read);
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+                             sector_t offset, sector_t length,
+                             sector_t **pages);
+void bl_put_extent(struct pnfs_block_extent *be);
+struct pnfs_block_extent *bl_alloc_extent(void);
+int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                                   struct xdr_stream *xdr,
+                                   const struct nfs4_layoutcommit_args *arg);
+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                                   const struct nfs4_layoutcommit_args *arg,
+                                   int status);
+int bl_add_merge_extent(struct pnfs_block_layout *bl,
+                         struct pnfs_block_extent *new);
+int bl_mark_for_commit(struct pnfs_block_extent *be,
+                        sector_t offset, sector_t length);
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644
index 00000000000..a83b393fb01
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -0,0 +1,410 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayoutdev.c
+ *
+ *  Device operations for the pnfs nfs4 block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h> /* __bread */
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+#include "blocklayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Decode one XDR hyper from *rp (advancing *rp past it) and store it in
+ * *sp shifted down by SECTOR_SHIFT.
+ *
+ * Returns 0 on success.  If any of the low nine bits of the decoded value
+ * are set, a warning is logged, *sp is left untouched and -1 is returned.
+ */
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+        uint64_t bytes;
+        *rp = xdr_decode_hyper(*rp, &bytes);
+        if ((bytes & 0x1ff) == 0) {
+                *sp = bytes >> SECTOR_SHIFT;
+                return 0;
+        }
+        printk(KERN_WARNING "%s: sector not aligned\n", __func__);
+        return -1;
+}
+/*
+ * Open a block_device by device number with FMODE_READ.
+ *
+ * Returns the opened device, or NULL - never an ERR_PTR - when
+ * blkdev_get_by_dev() fails.
+ */
+struct block_device *nfs4_blkdev_get(dev_t dev)
+{
+        struct block_device *bd;
+        dprintk("%s enter\n", __func__);
+        bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+        if (!IS_ERR(bd))
+                return bd;
+        dprintk("%s failed to open device : %ld\n",
+                        __func__, PTR_ERR(bd));
+        return NULL;
+}
+/*
+ * Release the block device.
+ *
+ * Drops @bdev with the same FMODE_READ mode nfs4_blkdev_get() opens it
+ * with and returns whatever blkdev_put() returns.  @bdev must not be NULL:
+ * it is dereferenced for the debug message.
+ */
+int nfs4_blkdev_put(struct block_device *bdev)
+{
+        dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
+                        MINOR(bdev->bd_dev));
+        return blkdev_put(bdev, FMODE_READ);
+}
+/*
+ * Pipe upcall: copy the not-yet-delivered part of @msg to the user buffer,
+ * limited to @buflen bytes.
+ *
+ * Returns the number of bytes copied and clears msg->errno.  If nothing
+ * could be copied, msg->errno is set to -EFAULT and -EFAULT is returned.
+ *
+ * Shouldn't there be a rpc_generic_upcall() to do this for us?
+ */
+ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+                       char __user *dst, size_t buflen)
+{
+        size_t want = min(msg->len - msg->copied, buflen);
+        unsigned long uncopied;
+        size_t done;
+        uncopied = copy_to_user(dst, (char *)msg->data + msg->copied, want);
+        if (uncopied == want) {
+                msg->errno = -EFAULT;
+                return -EFAULT;
+        }
+        /* a partial copy still counts as progress */
+        done = want - uncopied;
+        msg->copied += done;
+        msg->errno = 0;
+        return done;
+}
+static struct bl_dev_msg bl_mount_reply;
+/*
+ * Pipe downcall: accept exactly one struct bl_dev_msg from userspace, store
+ * it in bl_mount_reply and wake the waiters on bl_wq.
+ *
+ * Returns @mlen on success, -EINVAL if @mlen is not the size of a
+ * struct bl_dev_msg, or -EFAULT if the copy from userspace fails.
+ */
+ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+                         size_t mlen)
+{
+        if (mlen != sizeof(bl_mount_reply))
+                return -EINVAL;
+        if (copy_from_user(&bl_mount_reply, src, mlen))
+                return -EFAULT;
+        wake_up(&bl_wq);
+        return mlen;
+}
+/* Pipe destroy_msg hook: if the message ended with a negative errno, wake
+ * the waiters on bl_wq; otherwise do nothing.
+ */
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+        if (msg->errno < 0)
+                wake_up(&bl_wq);
+}
+/*
+ * Resolve a GETDEVICEINFO reply into an opened block device.
+ *
+ * The first dev->mincount bytes held in dev->pages are copied, behind a
+ * bl_msg_hdr, into a pipe message and queued on the blocklayout pipe.
+ * The caller then sleeps until it is woken through bl_wq and reads the
+ * answer from bl_mount_reply; the major/minor pair found there is opened
+ * with nfs4_blkdev_get().
+ *
+ * Returns a newly allocated pnfs_block_dev on success.  On failure the
+ * result is NULL (upcall could not be queued, or the device could not be
+ * opened) or an ERR_PTR: -ENOMEM on allocation failure, -EINVAL if the
+ * reply status is not BL_DEVICE_REQUEST_PROC.
+ *
+ * NOTE(review): the task state is only set to TASK_UNINTERRUPTIBLE after
+ * the upcall has been queued, and the reply lives in one shared static
+ * buffer; both look racy but are left unchanged here.
+ */
+struct pnfs_block_dev *
+nfs4_blk_decode_device(struct nfs_server *server,
+                       struct pnfs_device *dev)
+{
+        struct pnfs_block_dev *rv = NULL;
+        struct block_device *bd = NULL;
+        struct rpc_pipe_msg msg;
+        struct bl_msg_hdr bl_msg = {
+                .type = BL_DEVICE_MOUNT,
+                .totallen = dev->mincount,
+        };
+        uint8_t *dataptr;
+        DECLARE_WAITQUEUE(wq, current);
+        struct bl_dev_msg *reply = &bl_mount_reply;
+        int offset, len, i;
+        dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+        dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+                dev->mincount);
+        memset(&msg, 0, sizeof(msg));
+        msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
+        if (!msg.data) {
+                rv = ERR_PTR(-ENOMEM);
+                goto out;
+        }
+        memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+        dataptr = (uint8_t *) msg.data;
+        len = dev->mincount;
+        offset = sizeof(bl_msg);
+        /* append the reply pages, one page-sized piece at a time */
+        for (i = 0; len > 0; i++) {
+                memcpy(&dataptr[offset], page_address(dev->pages[i]),
+                                len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
+                len -= PAGE_CACHE_SIZE;
+                offset += PAGE_CACHE_SIZE;
+        }
+        msg.len = sizeof(bl_msg) + dev->mincount;
+        dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+        add_wait_queue(&bl_wq, &wq);
+        if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+                remove_wait_queue(&bl_wq, &wq);
+                goto out;
+        }
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        schedule();
+        __set_current_state(TASK_RUNNING);
+        remove_wait_queue(&bl_wq, &wq);
+        if (reply->status != BL_DEVICE_REQUEST_PROC) {
+                dprintk("%s failed to open device: %d\n",
+                        __func__, reply->status);
+                rv = ERR_PTR(-EINVAL);
+                goto out;
+        }
+        bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
+        /* nfs4_blkdev_get() reports failure as NULL, so testing IS_ERR()
+         * alone would let a NULL device through.
+         */
+        if (IS_ERR_OR_NULL(bd)) {
+                dprintk("%s failed to open device %u:%u\n",
+                        __func__, reply->major, reply->minor);
+                goto out;
+        }
+        rv = kzalloc(sizeof(*rv), GFP_NOFS);
+        if (!rv) {
+                /* do not leak the device that was just opened */
+                nfs4_blkdev_put(bd);
+                rv = ERR_PTR(-ENOMEM);
+                goto out;
+        }
+        rv->bm_mdev = bd;
+        memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
+        dprintk("%s Created device %s with bd_block_size %u\n",
+                __func__,
+                bd->bd_disk->disk_name,
+                bd->bd_block_size);
+out:
+        kfree(msg.data);
+        return rv;
+}
+/*
+ * Map a deviceid returned by the server to the block_device constructed
+ * for it.
+ *
+ * Walks the device list of the mount id belonging to @lo under bm_lock and
+ * returns the block device of the first entry whose id matches @id, or
+ * NULL if there is none.
+ */
+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
+                                            struct nfs4_deviceid *id)
+{
+        struct block_device *found = NULL;
+        struct block_mount_id *mid;
+        struct pnfs_block_dev *cur;
+        dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
+        mid = BLK_ID(lo);
+        spin_lock(&mid->bm_lock);
+        list_for_each_entry(cur, &mid->bm_devlist, bm_node) {
+                if (memcmp(id->data, cur->bm_mdevid.data,
+                           NFS4_DEVICEID4_SIZE) != 0)
+                        continue;
+                found = cur->bm_mdev;
+                break;
+        }
+        spin_unlock(&mid->bm_lock);
+        dprintk("%s returning %p\n", __func__, found);
+        return found;
+}
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+        u32 mode;       /* R or RW */
+        u64 start;      /* Expected start of next non-COW extent */
+        u64 inval;      /* Start of INVAL coverage */
+        u64 cowread;    /* End of COW read coverage */
+};
+/*
+ * Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ *
+ * @lv carries the running state across the extents of one layout and is
+ * updated for every accepted extent.  Returns 0 if @be is acceptable at
+ * this position, -EIO otherwise.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+                         struct layout_verification *lv)
+{
+        if (lv->mode == IOMODE_READ) {
+                /* A read layout may only hold READ/NONE extents, and they
+                 * must be contiguous.
+                 */
+                if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+                    be->be_state == PNFS_BLOCK_INVALID_DATA ||
+                    be->be_f_offset != lv->start)
+                        return -EIO;
+                lv->start += be->be_length;
+                return 0;
+        }
+        /* lv->mode == IOMODE_RW */
+        switch (be->be_state) {
+        case PNFS_BLOCK_READWRITE_DATA:
+                if (be->be_f_offset != lv->start ||
+                    lv->cowread > lv->start)
+                        return -EIO;
+                lv->start += be->be_length;
+                lv->inval = lv->start;
+                return 0;
+        case PNFS_BLOCK_INVALID_DATA:
+                if (be->be_f_offset != lv->start)
+                        return -EIO;
+                lv->start += be->be_length;
+                return 0;
+        case PNFS_BLOCK_READ_DATA:
+                if (be->be_f_offset > lv->start ||
+                    be->be_f_offset < lv->inval ||
+                    be->be_f_offset < lv->cowread)
+                        return -EIO;
+                /* It looks like you might want to min this with lv->start,
+                 * but you really don't.
+                 */
+                lv->inval += be->be_length;
+                lv->cowread = be->be_f_offset + be->be_length;
+                return 0;
+        default:
+                return -EIO;
+        }
+}
+/*
+ * XDR decode a pnfs_block_layout4 structure and merge its extents into the
+ * block layout that embeds @lo.
+ *
+ * Every extent is decoded into a temporary list and checked with
+ * verify_extent(); only when the whole reply has been accepted are the
+ * extents merged into the layout under bl_ext_lock.
+ *
+ * The extent count comes from the server, so it is bounded before it is
+ * used to size the decode request; otherwise the length computation could
+ * wrap and the decode loop would run past the data actually received.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EIO for a
+ * malformed or inconsistent reply, or the status reported by
+ * bl_add_merge_extent().
+ */
+int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+                           struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+{
+        /* Encoded size of one extent: the deviceid followed by three
+         * hypers and a 32-bit state (28 bytes).
+         */
+        const size_t extent_xdr_len = 28 + NFS4_DEVICEID4_SIZE;
+        struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+        int i, status = -EIO;
+        uint32_t count;
+        struct pnfs_block_extent *be = NULL, *save;
+        struct xdr_stream stream;
+        struct xdr_buf buf;
+        struct page *scratch;
+        __be32 *p;
+        struct layout_verification lv = {
+                .mode = lgr->range.iomode,
+                .start = lgr->range.offset >> SECTOR_SHIFT,
+                .inval = lgr->range.offset >> SECTOR_SHIFT,
+                .cowread = lgr->range.offset >> SECTOR_SHIFT,
+        };
+        LIST_HEAD(extents);
+        dprintk("---> %s\n", __func__);
+        scratch = alloc_page(gfp_flags);
+        if (!scratch)
+                return -ENOMEM;
+        xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+        p = xdr_inline_decode(&stream, 4);
+        if (unlikely(!p))
+                goto out_err;
+        count = be32_to_cpup(p++);
+        dprintk("%s enter, number of extents %i\n", __func__, count);
+        /* Refuse counts whose encoded size cannot be represented */
+        if (count > UINT_MAX / extent_xdr_len)
+                goto out_err;
+        p = xdr_inline_decode(&stream, extent_xdr_len * count);
+        if (unlikely(!p))
+                goto out_err;
+        /* Decode individual extents, putting them in temporary
+         * staging area until whole layout is decoded to make error
+         * recovery easier.
+         */
+        for (i = 0; i < count; i++) {
+                be = bl_alloc_extent();
+                if (!be) {
+                        status = -ENOMEM;
+                        goto out_err;
+                }
+                memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
+                p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+                be->be_mdev = translate_devid(lo, &be->be_devid);
+                if (!be->be_mdev)
+                        goto out_err;
+                /* The next three values are read in as bytes,
+                 * but stored as 512-byte sector lengths
+                 */
+                if (decode_sector_number(&p, &be->be_f_offset) < 0)
+                        goto out_err;
+                if (decode_sector_number(&p, &be->be_length) < 0)
+                        goto out_err;
+                if (decode_sector_number(&p, &be->be_v_offset) < 0)
+                        goto out_err;
+                be->be_state = be32_to_cpup(p++);
+                if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+                        be->be_inval = &bl->bl_inval;
+                if (verify_extent(be, &lv)) {
+                        dprintk("%s verify failed\n", __func__);
+                        goto out_err;
+                }
+                list_add_tail(&be->be_node, &extents);
+        }
+        /* From here on "be" is already on the staging list (or NULL), so
+         * it is cleared before jumping to out_err to avoid a double put.
+         */
+        if (lgr->range.offset + lgr->range.length !=
+                        lv.start << SECTOR_SHIFT) {
+                dprintk("%s Final length mismatch\n", __func__);
+                be = NULL;
+                goto out_err;
+        }
+        if (lv.start < lv.cowread) {
+                dprintk("%s Final uncovered COW extent\n", __func__);
+                be = NULL;
+                goto out_err;
+        }
+        /* Extents decoded properly, now try to merge them in to
+         * existing layout extents.
+         */
+        spin_lock(&bl->bl_ext_lock);
+        list_for_each_entry_safe(be, save, &extents, be_node) {
+                list_del(&be->be_node);
+                status = bl_add_merge_extent(bl, be);
+                if (status) {
+                        spin_unlock(&bl->bl_ext_lock);
+                        /* This is a fairly catastrophic error, as the
+                         * entire layout extent lists are now corrupted.
+                         * We should have some way to distinguish this.
+                         */
+                        be = NULL;
+                        goto out_err;
+                }
+        }
+        spin_unlock(&bl->bl_ext_lock);
+        status = 0;
+ out:
+        __free_page(scratch);
+        dprintk("%s returns %i\n", __func__, status);
+        return status;
+ out_err:
+        /* drop the extent being decoded, then everything still staged */
+        bl_put_extent(be);
+        while (!list_empty(&extents)) {
+                be = list_first_entry(&extents, struct pnfs_block_extent,
+                                      be_node);
+                list_del(&be->be_node);
+                bl_put_extent(be);
+        }
+        goto out;
+}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644
index 00000000000..d055c755807
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -0,0 +1,111 @@
+/*
+ *  linux/fs/nfs/blocklayout/blocklayoutdm.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2007 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Fred Isaman <iisaman@umich.edu>
+ *  Andy Adamson <andros@citi.umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
+#include <linux/sched.h>
+#include <linux/hash.h>
+#include "blocklayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Ask userspace, via an upcall on bl_device_pipe, to remove the device
+ * identified by @dev, then sleep until woken through bl_wq.
+ *
+ * The upcall buffer holds a struct bl_msg_hdr immediately followed by a
+ * struct bl_dev_msg carrying the major/minor pair.  Allocation or queueing
+ * failures are not reported: the function just returns.
+ */
+static void dev_remove(dev_t dev)
+{
+        struct rpc_pipe_msg msg;
+        struct bl_dev_msg bl_umount_request;
+        struct bl_msg_hdr bl_msg = {
+                .type = BL_DEVICE_UMOUNT,
+                .totallen = sizeof(bl_umount_request),
+        };
+        uint8_t *dataptr;
+        DECLARE_WAITQUEUE(wq, current);
+        dprintk("Entering %s\n", __func__);
+        memset(&msg, 0, sizeof(msg));
+        /* The buffer receives the header *and* the payload, so it must be
+         * sized for both; sizing it for the payload alone lets the second
+         * memcpy() below run past the end of the allocation.
+         */
+        msg.data = kzalloc(sizeof(bl_msg) + bl_msg.totallen, GFP_NOFS);
+        if (!msg.data)
+                goto out;
+        memset(&bl_umount_request, 0, sizeof(bl_umount_request));
+        bl_umount_request.major = MAJOR(dev);
+        bl_umount_request.minor = MINOR(dev);
+        /* Lay out the message: header first, payload right behind it */
+        memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+        dataptr = (uint8_t *) msg.data;
+        memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
+        msg.len = sizeof(bl_msg) + bl_msg.totallen;
+        /* Join the wait queue before queueing the upcall */
+        add_wait_queue(&bl_wq, &wq);
+        if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+                remove_wait_queue(&bl_wq, &wq);
+                goto out;
+        }
+        /* NOTE(review): the task state is only changed after the upcall has
+         * been queued; if the wakeup on bl_wq can arrive before this point
+         * it would be missed - confirm against the code that wakes bl_wq.
+         */
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        schedule();
+        __set_current_state(TASK_RUNNING);
+        remove_wait_queue(&bl_wq, &wq);
+out:
+        kfree(msg.data);
+}
+/*
+ * Release meta device
+ *
+ * Puts the reference on bdev->bm_mdev through nfs4_blkdev_put(), logging a
+ * non-zero return value, and then asks userspace to remove the device.
+ */
+static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
+{
+        /* Read the device number up front: once the reference has been
+         * put, bm_mdev must not be dereferenced any more.
+         */
+        dev_t dev = bdev->bm_mdev->bd_dev;
+        int rv;
+        dprintk("%s Releasing\n", __func__);
+        rv = nfs4_blkdev_put(bdev->bm_mdev);
+        if (rv)
+                printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
+                                __func__, rv);
+        dev_remove(dev);
+}
+/*
+ * Free a pnfs_block_dev.  If it still references a meta device, that
+ * device is released first.  A NULL bdev is ignored.
+ */
+void bl_free_block_dev(struct pnfs_block_dev *bdev)
+{
+        if (!bdev)
+                return;
+        if (bdev->bm_mdev) {
+                dprintk("%s Removing DM device: %d:%d\n",
+                        __func__,
+                        MAJOR(bdev->bm_mdev->bd_dev),
+                        MINOR(bdev->bm_mdev->bd_dev));
+                nfs4_blk_metadev_release(bdev);
+        }
+        kfree(bdev);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 00000000000..19fa7b0b8c0
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,935 @@
+/*
+ *  linux/fs/nfs/blocklayout/extents.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include "blocklayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/* Bit numbers of the tags kept in pnfs_inval_tracking.it_tags; a tag is
+ * stored as (1 << bit).
+ */
+#define EXTENT_INITIALIZED 0
+#define EXTENT_WRITTEN     1
+#define EXTENT_IN_COMMIT   2
+#define INTERNAL_EXISTS    MY_MAX_TAGS /* tag given to entries created by _preload_range() */
+#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1) /* every bit below INTERNAL_EXISTS */
+/* Round s down to a multiple of base, i.e. return the largest t <= s
+ * with t % base == 0.
+ */
+static inline sector_t normalize(sector_t s, int base)
+{
+        sector_t quotient = s; /* do_div() overwrites its first argument */
+        sector_t remainder = do_div(quotient, base);
+        return s - remainder;
+}
+/* Round s up to a multiple of base. */
+static inline sector_t normalize_up(sector_t s, int base)
+{
+        sector_t bumped = s + base - 1;
+        return normalize(bumped, base);
+}
+/* Complete stub using list while determine API wanted */
+/* Looks up the entry whose sector is exactly s.
+ * Returns its tags (masked with INTERNAL_MASK), or -ENOENT if none exists.
+ */
+static int32_t _find_entry(struct my_tree *tree, u64 s)
+{
+        struct pnfs_inval_tracking *entry;
+        dprintk("%s(%llu) enter\n", __func__, s);
+        /* Walk from the tail and give up at the first entry below s */
+        list_for_each_entry_reverse(entry, &tree->mtt_stub, it_link) {
+                if (entry->it_sector > s)
+                        continue;
+                if (entry->it_sector < s)
+                        break;
+                return entry->it_tags & INTERNAL_MASK;
+        }
+        return -ENOENT;
+}
+/* Returns 1 if the entry covering s (after rounding s down to the tree's
+ * step size) exists and carries tag, otherwise 0.
+ */
+static inline
+int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
+{
+        int32_t found;
+        dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
+        found = _find_entry(tree, normalize(s, tree->mtt_step_size));
+        return found >= 0 && (found & (1 << tag)) != 0;
+}
+/* Tags the entry for sector s, creating the entry if there is none yet.
+ * A newly created entry lives in storage when storage is non-NULL and is
+ * kmalloc'ed otherwise.
+ * Returns the number of entries added (0 or 1), or -ENOMEM.
+ */
+static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
+                      struct pnfs_inval_tracking *storage)
+{
+        struct pnfs_inval_tracking *cur;
+        struct pnfs_inval_tracking *fresh;
+        dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
+        list_for_each_entry_reverse(cur, &tree->mtt_stub, it_link) {
+                if (cur->it_sector > s)
+                        continue;
+                if (cur->it_sector == s) {
+                        /* Already present: just union the tag in */
+                        cur->it_tags |= (1 << tag);
+                        return 0;
+                }
+                break;
+        }
+        fresh = storage;
+        if (!fresh) {
+                fresh = kmalloc(sizeof(*fresh), GFP_NOFS);
+                if (!fresh)
+                        return -ENOMEM;
+        }
+        fresh->it_sector = s;
+        fresh->it_tags = (1 << tag);
+        /* cur is the last entry below s, or the list head itself when the
+         * walk ran off the front, so this keeps the list ordered.
+         */
+        list_add(&fresh->it_link, &cur->it_link);
+        return 1;
+}
+/* XXXX Really want option to not create */
+/* Over range, unions tag with existing entries, else creates entry with tag.
+ *
+ * Covers every step from s rounded down to the tree's step size up to,
+ * but not including, s + length.
+ * Returns 0 on success, -ENOMEM if an entry could not be allocated.
+ */
+static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
+{
+        u64 i;
+        dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
+        for (i = normalize(s, tree->mtt_step_size); i < s + length;
+             i += tree->mtt_step_size) {
+                /* _add_entry() returns 1 when it created an entry and 0
+                 * when one already existed; only a negative value is a
+                 * failure.
+                 */
+                if (_add_entry(tree, i, tag, NULL) < 0)
+                        return -ENOMEM;
+        }
+        return 0;
+}
+/* Ensure that future operations on given range of tree will not malloc.
+ *
+ * Allocates one tracking entry per step of the step-aligned range that
+ * covers [offset, offset + length) up front, then passes them to
+ * _add_entry() so that every step has an entry (tagged INTERNAL_EXISTS).
+ * Preallocated entries that were not consumed are freed again.
+ * Returns 0 on success, -ENOMEM if any allocation fails.
+ */
+static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
+{
+        u64 start, end, s;
+        int count, i, used = 0, status = -ENOMEM;
+        struct pnfs_inval_tracking **storage;
+        dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
+        start = normalize(offset, tree->mtt_step_size);
+        end = normalize_up(offset + length, tree->mtt_step_size);
+        count = (int)(end - start) / (int)tree->mtt_step_size; /* steps in [start, end) */
+        /* Pre-malloc what memory we might need */
+        storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
+        if (!storage)
+                return -ENOMEM;
+        for (i = 0; i < count; i++) {
+                storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
+                                     GFP_NOFS);
+                if (!storage[i])
+                        goto out_cleanup; /* used is still 0: frees storage[0..i-1] */
+        }
+        /* Now need lock - HOW???
+         * NOTE(review): the list is modified below without this function
+         * taking any lock - confirm how callers serialize.
+         */
+        for (s = start; s < end; s += tree->mtt_step_size)
+                used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); /* returns 1 only if storage[used] was consumed */
+        /* Unlock - HOW??? */
+        status = 0;
+ out_cleanup:
+        for (i = used; i < count; i++) { /* free what was not linked into the list */
+                if (!storage[i]) /* the failed allocation; later slots were never filled */
+                        break;
+                kfree(storage[i]);
+        }
+        kfree(storage);
+        return status;
+}
+/* Records offset in array, an ascending list of sectors terminated by ~0.
+ * Nothing happens if array is NULL or offset is already present.  The
+ * array is not bounds-checked here; the caller must have sized it.
+ */
+static void set_needs_init(sector_t *array, sector_t offset)
+{
+        sector_t *slot, *tail;
+        dprintk("%s enter\n", __func__);
+        if (!array)
+                return;
+        /* Find the first element that is not below offset */
+        for (slot = array; *slot < offset; slot++)
+                ;
+        if (*slot == offset)
+                return;
+        if (*slot == ~0) {
+                /* Hit the terminator: append and re-terminate */
+                slot[0] = offset;
+                slot[1] = ~0;
+                return;
+        }
+        dprintk("%s Adding %llu\n", __func__, (u64)offset);
+        /* Shift everything from slot through the terminator up by one */
+        for (tail = slot; *tail != ~0; tail++)
+                ;
+        tail++;
+        memmove(slot + 1, slot, (char *)tail - (char *)slot);
+        *slot = offset;
+}
+/* We are relying on page lock to serialize this */
+/* Returns 1 if the step containing isect is tagged EXTENT_INITIALIZED,
+ * else 0.  The lookup itself runs under marks->im_lock.
+ */
+int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
+{
+        int initialized;
+        spin_lock(&marks->im_lock);
+        initialized = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
+        spin_unlock(&marks->im_lock);
+        return initialized;
+}
+/* Assume start, end already sector aligned.
+ *
+ * Returns 1 if every step of [start, end) has an entry in the tree that
+ * carries tag, 0 otherwise.  The list is walked from its tail (highest
+ * sector) downwards: entries at or beyond end are skipped, and from then
+ * on each entry must sit exactly one step below the previous one and
+ * carry tag.
+ */
+static int
+_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
+{
+        struct pnfs_inval_tracking *pos;
+        /* Sector the next entry below end has to have.  It is tracked
+         * directly instead of using 0 as a "not started yet" marker,
+         * because 0 is a legitimate expectation for a range that begins
+         * at sector 0.
+         */
+        u64 expect = end - tree->mtt_step_size;
+        dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
+        list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+                if (pos->it_sector >= end)
+                        continue;
+                if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
+                        return 0;
+                /* Finished once the next step down would lie below start,
+                 * or below sector 0 (which the unsigned subtraction would
+                 * otherwise wrap around).
+                 */
+                if (expect < tree->mtt_step_size ||
+                    expect - tree->mtt_step_size < start)
+                        return 1;
+                expect -= tree->mtt_step_size;
+        }
+        return 0;
+}
+/* Returns 1 if all of [start, end) is tagged EXTENT_WRITTEN, else 0.
+ * The check runs under marks->im_lock.
+ */
+static int is_range_written(struct pnfs_inval_markings *marks,
+                            sector_t start, sector_t end)
+{
+        int written;
+        spin_lock(&marks->im_lock);
+        written = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
+        spin_unlock(&marks->im_lock);
+        return written;
+}
+/* Marks sectors in [offset, offset+length) as having been initialized.
+ * All lengths are step-aligned, where step is min(pagesize, blocksize).
+ * Notes where partial block is initialized, and helps prepare it for
+ * complete initialization later.
+ *
+ * If pages is non-NULL, *pages receives a kmalloc'ed, ~0-terminated array
+ * of the page-aligned sectors that lie inside the touched blocks but
+ * outside [offset, offset+length) and are not yet tagged initialized.
+ * *pages is set to NULL when there are none or on error; a returned array
+ * is not freed here.
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+/* Currently assumes offset is page-aligned */
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+                             sector_t offset, sector_t length,
+                             sector_t **pages)
+{
+        sector_t s, start, end;
+        sector_t *array = NULL; /* Pages to mark */
+        dprintk("%s(offset=%llu,len=%llu) enter\n",
+                __func__, (u64)offset, (u64)length);
+        s = max((sector_t) 3,
+                2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); /* element count of array */
+        dprintk("%s set max=%llu\n", __func__, (u64)s);
+        if (pages) {
+                array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
+                if (!array)
+                        goto outerr;
+                array[0] = ~0; /* start out as an empty, terminated list */
+        }
+        start = normalize(offset, marks->im_block_size);
+        end = normalize_up(offset + length, marks->im_block_size);
+        if (_preload_range(&marks->im_tree, start, end - start)) /* allocates, so done before im_lock is taken */
+                goto outerr;
+        spin_lock(&marks->im_lock);
+        for (s = normalize_up(start, PAGE_CACHE_SECTORS);
+             s < offset; s += PAGE_CACHE_SECTORS) {
+                dprintk("%s pre-area pages\n", __func__);
+                /* Portion of used block is not initialized */
+                if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+                        set_needs_init(array, s);
+        }
+        if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
+                goto out_unlock;
+        for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
+             s < end; s += PAGE_CACHE_SECTORS) {
+                dprintk("%s post-area pages\n", __func__);
+                if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+                        set_needs_init(array, s);
+        }
+        spin_unlock(&marks->im_lock);
+        if (pages) {
+                if (array[0] == ~0) { /* nothing was recorded */
+                        kfree(array);
+                        *pages = NULL;
+                } else
+                        *pages = array;
+        }
+        return 0;
+ out_unlock:
+        spin_unlock(&marks->im_lock);
+ outerr:
+        if (pages) {
+                kfree(array);
+                *pages = NULL;
+        }
+        return -ENOMEM;
+}
+/* Marks sectors in [offset, offset+length) as having been written to disk.
+ * All lengths should be block aligned.
+ * Runs under marks->im_lock and returns whatever _set_range() returns.
+ */
+static int mark_written_sectors(struct pnfs_inval_markings *marks,
+                                sector_t offset, sector_t length)
+{
+        int err;
+        dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
+                (u64)offset, (u64)length);
+        spin_lock(&marks->im_lock);
+        err = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
+        spin_unlock(&marks->im_lock);
+        return err;
+}
+/* Debug helper: prints the pointer and, if it is non-NULL, the file
+ * offset and length of a short extent.
+ */
+static void print_short_extent(struct pnfs_block_short_extent *be)
+{
+        dprintk("PRINT SHORT EXTENT extent %p\n", be);
+        if (!be)
+                return;
+        dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
+        dprintk("        be_length   %llu\n", (u64)be->bse_length);
+}
+/* Debug helper: prints every short extent on list and complains if the
+ * number of entries differs from count.
+ */
+static void print_clist(struct list_head *list, unsigned int count)
+{
+        struct pnfs_block_short_extent *bse;
+        unsigned int seen = 0;
+        ifdebug(FACILITY) {
+                printk(KERN_DEBUG "****************\n");
+                printk(KERN_DEBUG "Extent list looks like:\n");
+                list_for_each_entry(bse, list, bse_node) {
+                        print_short_extent(bse);
+                        seen++;
+                }
+                if (seen != count)
+                        printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
+                printk(KERN_DEBUG "****************\n");
+        }
+}
+/* Note: In theory, we should do more checking that devid's match between
+ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
+ */
+/* Note this is very similar to bl_add_merge_extent.
+ *
+ * Inserts new into bl->bl_commit, which is scanned in ascending
+ * bse_f_offset order, merging it with entries on the same bse_mdev that it
+ * overlaps or abuts.  bl->bl_count is adjusted for every entry added or
+ * removed.  new is either linked into the list or freed here.
+ * The caller visible in this file, bl_mark_for_commit(), holds
+ * bl->bl_ext_lock around the call.
+ */
+static void add_to_commitlist(struct pnfs_block_layout *bl,
+                              struct pnfs_block_short_extent *new)
+{
+        struct list_head *clist = &bl->bl_commit;
+        struct pnfs_block_short_extent *old, *save;
+        sector_t end = new->bse_f_offset + new->bse_length;
+        dprintk("%s enter\n", __func__);
+        print_short_extent(new);
+        print_clist(clist, bl->bl_count);
+        bl->bl_count++; /* counted up front; undone below if new is dropped */
+        /* Scan for proper place to insert, extending new to the left
+         * as much as possible.
+         */
+        list_for_each_entry_safe(old, save, clist, bse_node) {
+                if (new->bse_f_offset < old->bse_f_offset)
+                        break;
+                if (end <= old->bse_f_offset + old->bse_length) {
+                        /* Range is already in list */
+                        bl->bl_count--;
+                        kfree(new);
+                        return;
+                } else if (new->bse_f_offset <=
+                                old->bse_f_offset + old->bse_length) {
+                        /* new overlaps or abuts existing be */
+                        if (new->bse_mdev == old->bse_mdev) {
+                                /* extend new to fully replace old */
+                                new->bse_length += new->bse_f_offset -
+                                                old->bse_f_offset;
+                                new->bse_f_offset = old->bse_f_offset;
+                                list_del(&old->bse_node);
+                                bl->bl_count--;
+                                kfree(old);
+                        }
+                }
+        }
+        /* Note that if we never hit the above break, old will not point to a
+         * valid extent.  However, in that case &old->bse_node==list.
+         */
+        list_add_tail(&new->bse_node, &old->bse_node); /* i.e. directly in front of old */
+        /* Scan forward for overlaps.  If we find any, extend new and
+         * remove the overlapped extent.
+         */
+        old = list_prepare_entry(new, clist, bse_node); /* resume the walk right after new */
+        list_for_each_entry_safe_continue(old, save, clist, bse_node) {
+                if (end < old->bse_f_offset)
+                        break;
+                /* new overlaps or abuts old */
+                if (new->bse_mdev == old->bse_mdev) {
+                        if (end < old->bse_f_offset + old->bse_length) {
+                                /* extend new to fully cover old */
+                                end = old->bse_f_offset + old->bse_length;
+                                new->bse_length = end - new->bse_f_offset;
+                        }
+                        list_del(&old->bse_node);
+                        bl->bl_count--;
+                        kfree(old);
+                }
+        }
+        dprintk("%s: after merging\n", __func__);
+        print_clist(clist, bl->bl_count);
+}
+/* Note the range described by offset, length is guaranteed to be contained
+ * within be.
+ *
+ * Tags [offset, offset+length) as written, then queues the part of it that
+ * can be expressed in whole blocks on the layout's commit list: a partial
+ * block at either edge is included only if the rest of that block is
+ * already tagged written, otherwise the edge is pulled in to the next
+ * block boundary.
+ * Returns 0, or -ENOMEM if the commit-list entry cannot be allocated (in
+ * which case nothing is tagged either).
+ */
+int bl_mark_for_commit(struct pnfs_block_extent *be,
+                    sector_t offset, sector_t length)
+{
+        sector_t new_end, end = offset + length;
+        struct pnfs_block_short_extent *new;
+        struct pnfs_block_layout *bl = container_of(be->be_inval,
+                                                    struct pnfs_block_layout,
+                                                    bl_inval);
+        new = kmalloc(sizeof(*new), GFP_NOFS);
+        if (!new)
+                return -ENOMEM;
+        mark_written_sectors(be->be_inval, offset, length); /* NOTE(review): return value is ignored */
+        /* We want to add the range to commit list, but it must be
+         * block-normalized, and verified that the normalized range has
+         * been entirely written to disk.
+         */
+        new->bse_f_offset = offset;
+        offset = normalize(offset, bl->bl_blocksize); /* from here on offset is the block start */
+        if (offset < new->bse_f_offset) {
+                if (is_range_written(be->be_inval, offset, new->bse_f_offset))
+                        new->bse_f_offset = offset;
+                else
+                        new->bse_f_offset = offset + bl->bl_blocksize;
+        }
+        new_end = normalize_up(end, bl->bl_blocksize);
+        if (end < new_end) {
+                if (is_range_written(be->be_inval, end, new_end))
+                        end = new_end;
+                else
+                        end = new_end - bl->bl_blocksize;
+        }
+        if (end <= new->bse_f_offset) { /* no whole block is left to commit */
+                kfree(new);
+                return 0;
+        }
+        new->bse_length = end - new->bse_f_offset;
+        new->bse_devid = be->be_devid;
+        new->bse_mdev = be->be_mdev;
+        spin_lock(&bl->bl_ext_lock);
+        /* new will be freed, either by add_to_commitlist if it decides not
+         * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
+         */
+        add_to_commitlist(bl, new);
+        spin_unlock(&bl->bl_ext_lock);
+        return 0;
+}
+/* Debug helper: prints the pointer and, if it is non-NULL, the offsets,
+ * length and state of an extent.
+ */
+static void print_bl_extent(struct pnfs_block_extent *be)
+{
+        dprintk("PRINT EXTENT extent %p\n", be);
+        if (!be)
+                return;
+        dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset);
+        dprintk("        be_length   %llu\n", (u64)be->be_length);
+        dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset);
+        dprintk("        be_state    %d\n", be->be_state);
+}
+/* kref release callback: frees the extent that embeds kref. */
+static void
+destroy_extent(struct kref *kref)
+{
+        struct pnfs_block_extent *be =
+                container_of(kref, struct pnfs_block_extent, be_refcnt);
+        dprintk("%s be=%p\n", __func__, be);
+        kfree(be);
+}
+/* Drops one reference on be; the extent is freed through destroy_extent()
+ * when the last reference goes away.  A NULL be is ignored.
+ */
+void
+bl_put_extent(struct pnfs_block_extent *be)
+{
+        if (!be)
+                return;
+        dprintk("%s enter %p (%i)\n", __func__, be,
+                atomic_read(&be->be_refcnt.refcount));
+        kref_put(&be->be_refcnt, destroy_extent);
+}
+/* Allocates an extent with its list node and refcount initialized and
+ * be_inval cleared; all other fields are left uninitialized.
+ * Returns NULL if the allocation fails.
+ */
+struct pnfs_block_extent *bl_alloc_extent(void)
+{
+        struct pnfs_block_extent *be = kmalloc(sizeof(*be), GFP_NOFS);
+        if (be) {
+                INIT_LIST_HEAD(&be->be_node);
+                kref_init(&be->be_refcnt);
+                be->be_inval = NULL;
+        }
+        return be;
+}
+/* Debug helper: prints every extent on list, framed by separator lines. */
+static void print_elist(struct list_head *list)
+{
+        struct pnfs_block_extent *cur;
+        dprintk("****************\n");
+        dprintk("Extent list looks like:\n");
+        list_for_each_entry(cur, list, be_node)
+                print_bl_extent(cur);
+        dprintk("****************\n");
+}
+/* Returns non-zero if old and new agree with each other: they have the
+ * same state and, unless that state is PNFS_BLOCK_NONE_DATA, the same
+ * be_mdev and volume offsets that differ by exactly as much as their file
+ * offsets do.
+ */
+static inline int
+extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
+{
+        /* Note this assumes new->be_f_offset >= old->be_f_offset */
+        if (new->be_state != old->be_state)
+                return 0;
+        if (new->be_state == PNFS_BLOCK_NONE_DATA)
+                return 1;
+        if (new->be_mdev != old->be_mdev)
+                return 0;
+        return new->be_v_offset - old->be_v_offset ==
+                new->be_f_offset - old->be_f_offset;
+}
+/* Adds new to appropriate list in bl, modifying new and removing existing
+ * extents as appropriate to deal with overlaps.
+ *
+ * See bl_find_get_extent for list constraints.
+ *
+ * Refcount on new is already set.  If end up not using it, or error out,
+ * need to put the reference.
+ *
+ * bl->bl_ext_lock is held by caller.
+ *
+ * Returns 0 if new was inserted, or dropped because an existing consistent
+ * extent already covers it.  Returns -EIO if new overlaps an extent it is
+ * not consistent with; the reference on new has been put in that case, and
+ * extents removed earlier in the scan are not restored.
+ */
+int
+bl_add_merge_extent(struct pnfs_block_layout *bl,
+                     struct pnfs_block_extent *new)
+{
+        struct pnfs_block_extent *be, *tmp;
+        sector_t end = new->be_f_offset + new->be_length; /* original end of new; not updated when new grows */
+        struct list_head *list;
+        dprintk("%s enter with be=%p\n", __func__, new);
+        print_bl_extent(new);
+        list = &bl->bl_extents[bl_choose_list(new->be_state)];
+        print_elist(list);
+        /* Scan for proper place to insert, extending new to the left
+         * as much as possible.
+         */
+        list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
+                if (new->be_f_offset >= be->be_f_offset + be->be_length)
+                        break;
+                if (new->be_f_offset >= be->be_f_offset) {
+                        if (end <= be->be_f_offset + be->be_length) {
+                                /* new is a subset of existing be*/
+                                if (extents_consistent(be, new)) {
+                                        dprintk("%s: new is subset, ignoring\n",
+                                                __func__);
+                                        bl_put_extent(new);
+                                        return 0;
+                                } else {
+                                        goto out_err;
+                                }
+                        } else {
+                                /* |<--   be   -->|
+                                 *          |<--   new   -->| */
+                                if (extents_consistent(be, new)) {
+                                        /* extend new to fully replace be */
+                                        new->be_length += new->be_f_offset -
+                                                be->be_f_offset;
+                                        new->be_f_offset = be->be_f_offset;
+                                        new->be_v_offset = be->be_v_offset;
+                                        dprintk("%s: removing %p\n", __func__, be);
+                                        list_del(&be->be_node);
+                                        bl_put_extent(be);
+                                } else {
+                                        goto out_err;
+                                }
+                        }
+                } else if (end >= be->be_f_offset + be->be_length) {
+                        /* new extent overlap existing be */
+                        if (extents_consistent(be, new)) {
+                                /* extend new to fully replace be */
+                                dprintk("%s: removing %p\n", __func__, be);
+                                list_del(&be->be_node);
+                                bl_put_extent(be);
+                        } else {
+                                goto out_err;
+                        }
+                } else if (end > be->be_f_offset) {
+                        /*           |<--   be   -->|
+                         *|<--   new   -->| */
+                        if (extents_consistent(new, be)) {
+                                /* extend new to fully replace be */
+                                new->be_length += be->be_f_offset + be->be_length -
+                                        new->be_f_offset - new->be_length;
+                                dprintk("%s: removing %p\n", __func__, be);
+                                list_del(&be->be_node);
+                                bl_put_extent(be);
+                        } else {
+                                goto out_err;
+                        }
+                }
+        }
+        /* Note that if we never hit the above break, be will not point to a
+         * valid extent.  However, in that case &be->be_node==list.
+         */
+        list_add(&new->be_node, &be->be_node); /* i.e. directly after be */
+        dprintk("%s: inserting new\n", __func__);
+        print_elist(list);
+        /* FIXME - The per-list consistency checks have all been done,
+         * should now check cross-list consistency.
+         */
+        return 0;
+ out_err:
+        bl_put_extent(new);
+        return -EIO;
+}
+/* Returns extent, or NULL.  If a second READ extent exists, it is returned
+ * in cow_read, if given.
+ *
+ * The extents are kept in two separate ordered lists, one for READ and NONE,
+ * one for READWRITE and INVALID.  Within each list, we assume:
+ * 1. Extents are ordered by file offset.
+ * 2. For any given isect, there is at most one extent that matches.
+ *
+ * A reference is taken on every extent handed back (the return value and
+ * *cow_read).  The lists are searched under bl->bl_ext_lock.
+ */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+            struct pnfs_block_extent **cow_read)
+{
+        struct pnfs_block_extent *be, *cow, *ret;
+        int i;
+        dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+        cow = ret = NULL;
+        spin_lock(&bl->bl_ext_lock);
+        for (i = 0; i < EXTENT_LISTS; i++) {
+                list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+                        if (isect >= be->be_f_offset + be->be_length)
+                                break;
+                        if (isect >= be->be_f_offset) {
+                                /* We have found an extent */
+                                dprintk("%s Get %p (%i)\n", __func__, be,
+                                        atomic_read(&be->be_refcnt.refcount));
+                                kref_get(&be->be_refcnt);
+                                if (!ret)
+                                        ret = be;
+                                else if (be->be_state != PNFS_BLOCK_READ_DATA)
+                                        bl_put_extent(be); /* second hit is not a READ extent: drop the reference just taken */
+                                else
+                                        cow = be;
+                                break;
+                        }
+                }
+                if (ret &&
+                    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) /* only an INVALID hit with cow_read set searches on */
+                        break;
+        }
+        spin_unlock(&bl->bl_ext_lock);
+        if (cow_read)
+                *cow_read = cow;
+        print_bl_extent(ret);
+        return ret;
+}
+/* Similar to bl_find_get_extent, but called with lock held, and ignores cow.
+ * Returns the first extent containing isect with a reference taken on it,
+ * or NULL if no list has one.
+ */
+static struct pnfs_block_extent *
+bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
+{
+        struct pnfs_block_extent *be, *ret = NULL;
+        int i;
+        dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+        /* Stop looking at further lists as soon as one yields a match */
+        for (i = 0; i < EXTENT_LISTS && !ret; i++) {
+                list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+                        if (isect >= be->be_f_offset + be->be_length)
+                                break;
+                        if (isect < be->be_f_offset)
+                                continue;
+                        /* We have found an extent */
+                        dprintk("%s Get %p (%i)\n", __func__, be,
+                                atomic_read(&be->be_refcnt.refcount));
+                        kref_get(&be->be_refcnt);
+                        ret = be;
+                        break;
+                }
+        }
+        print_bl_extent(ret);
+        return ret;
+}
+/*
+ * XDR-encode the short extents queued on bl->bl_commit as the
+ * layoutupdate body, moving each encoded entry to bl->bl_committing.
+ *
+ * Two words are reserved up front and filled in last: the number of bytes
+ * that follow the first word, and the number of extents encoded.  Encoding
+ * stops early if the stream runs out of space; entries not yet encoded
+ * stay on bl_commit.  The whole operation runs under bl_ext_lock.
+ *
+ * Always returns 0, including when the initial reservation fails.
+ */
+int
+encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                               struct xdr_stream *xdr,
+                               const struct nfs4_layoutcommit_args *arg)
+{
+        struct pnfs_block_short_extent *ext, *next;
+        unsigned int nranges = 0;
+        __be32 *hdr, *p;
+        dprintk("%s enter\n", __func__);
+        /* BUG - creation of bl_commit is buggy - need to wait for
+         * entire block to be marked WRITTEN before it can be added.
+         */
+        spin_lock(&bl->bl_ext_lock);
+        /* Want to adjust for possible truncate */
+        /* We now want to adjust argument range */
+        /* XDR encode the ranges found */
+        hdr = xdr_reserve_space(xdr, 8);
+        if (hdr) {
+                list_for_each_entry_safe(ext, next, &bl->bl_commit, bse_node) {
+                        p = xdr_reserve_space(xdr, 7 * 4 + sizeof(ext->bse_devid.data));
+                        if (!p)
+                                break;
+                        p = xdr_encode_opaque_fixed(p, ext->bse_devid.data, NFS4_DEVICEID4_SIZE);
+                        p = xdr_encode_hyper(p, ext->bse_f_offset << SECTOR_SHIFT);
+                        p = xdr_encode_hyper(p, ext->bse_length << SECTOR_SHIFT);
+                        p = xdr_encode_hyper(p, 0LL);
+                        *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+                        /* Hand the entry over to the in-flight list */
+                        list_del(&ext->bse_node);
+                        list_add_tail(&ext->bse_node, &bl->bl_committing);
+                        bl->bl_count--;
+                        nranges++;
+                }
+                /* Back-fill the header now that the payload size is known */
+                hdr[0] = cpu_to_be32((xdr->p - hdr - 1) * 4);
+                hdr[1] = cpu_to_be32(nranges);
+        }
+        spin_unlock(&bl->bl_ext_lock);
+        dprintk("%s found %i ranges\n", __func__, nranges);
+        return 0;
+}
+/*
+ * Helper for set_to_rw: initialise @new as a piece of @orig.
+ *
+ * @new covers @length sectors starting at file offset @offset and is put
+ * in @state.  The device id, mdev and inval pointers are copied from
+ * @orig, and the volume offset is shifted by the same amount as the file
+ * offset.  The reference count starts out freshly initialised; the list
+ * node is left untouched because the caller links it in.
+ */
+static void
+_prep_new_extent(struct pnfs_block_extent *new,
+                 struct pnfs_block_extent *orig,
+                 sector_t offset, sector_t length, int state)
+{
+        /* Range and state requested by the caller */
+        new->be_f_offset = offset;
+        new->be_length = length;
+        new->be_state = state;
+        /* Keep the file-to-volume mapping of the original extent */
+        new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
+        /* Everything else is inherited */
+        memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
+        new->be_mdev = orig->be_mdev;
+        new->be_inval = orig->be_inval;
+        /* don't need to INIT_LIST_HEAD(&new->be_node) */
+        kref_init(&new->be_refcnt);
+}
+/*
+ * Try to merge @be with the extent directly in front of it on @head.
+ *
+ * A merge happens only when @storage is non-NULL, @be has a predecessor
+ * on the list, the predecessor ends exactly where @be starts, and
+ * extents_consistent() accepts the pair.  In that case @storage is set up
+ * to span both extents (taking the predecessor's state), replaces the
+ * predecessor on the list, @be is unlinked, and a reference is dropped on
+ * each of the two old extents.
+ *
+ * Returns @storage after a merge.  Otherwise @storage is freed and @be is
+ * returned unchanged.
+ */
+static struct pnfs_block_extent *
+_front_merge(struct pnfs_block_extent *be, struct list_head *head,
+             struct pnfs_block_extent *storage)
+{
+        struct pnfs_block_extent *before;
+        if (!storage)
+                goto no_merge;
+        /* be is the list head itself: nothing to merge */
+        if (&be->be_node == head)
+                goto no_merge;
+        /* be is the first entry: no predecessor */
+        if (be->be_node.prev == head)
+                goto no_merge;
+        before = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
+        /* The two ranges must be exactly adjacent ... */
+        if (before->be_f_offset + before->be_length != be->be_f_offset)
+                goto no_merge;
+        /* ... and compatible */
+        if (!extents_consistent(before, be))
+                goto no_merge;
+        _prep_new_extent(storage, before, before->be_f_offset,
+                         before->be_length + be->be_length, before->be_state);
+        list_replace(&before->be_node, &storage->be_node);
+        bl_put_extent(before);
+        list_del(&be->be_node);
+        bl_put_extent(be);
+        return storage;
+ no_merge:
+        kfree(storage);
+        return be;
+}
+/*
+ * Convert the part of an INVALID_DATA extent that overlaps
+ * [offset, offset + length) into READWRITE_DATA.
+ *
+ * The extent containing @offset is looked up and, if it is in state
+ * INVALID_DATA, replaced on its list by up to three new extents: an
+ * INVALID_DATA piece in front of @offset, the READWRITE_DATA piece, and
+ * an INVALID_DATA piece behind the converted range.  Where a front or
+ * back piece is not needed, the spare allocation is offered to
+ * _front_merge() to coalesce with the neighbouring RW extent.
+ *
+ * Returns the sector at which the extent that was found ends, so a caller
+ * can loop until the whole range is covered.  If allocation fails or no
+ * extent covers @offset, nothing is changed and offset + length is
+ * returned, which ends the caller's loop.
+ */
+static u64
+set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
+{
+        u64 rv = offset + length;
+        struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
+        struct pnfs_block_extent *children[3];
+        struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
+        int i = 0, j;
+        dprintk("%s(%llu, %llu)\n", __func__, offset, length);
+        /* Create storage for up to three new extents e1, e2, e3 */
+        e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
+        e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
+        e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
+        /* BUG - we are ignoring any failure */
+        if (!e1 || !e2 || !e3)
+                goto out_nosplit;
+        spin_lock(&bl->bl_ext_lock);
+        be = bl_find_get_extent_locked(bl, offset);
+        if (!be) {
+                /* The lookup returns NULL when nothing covers offset;
+                 * bail out instead of dereferencing it.
+                 */
+                spin_unlock(&bl->bl_ext_lock);
+                dprintk("%s no extent found at %llu\n", __func__, offset);
+                goto out_nosplit;
+        }
+        rv = be->be_f_offset + be->be_length;
+        if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
+                spin_unlock(&bl->bl_ext_lock);
+                /* Drop the reference taken by the lookup, otherwise it
+                 * is leaked on this early exit.
+                 */
+                bl_put_extent(be);
+                goto out_nosplit;
+        }
+        /* Add e* to children, bumping e*'s krefs */
+        if (be->be_f_offset != offset) {
+                _prep_new_extent(e1, be, be->be_f_offset,
+                                 offset - be->be_f_offset,
+                                 PNFS_BLOCK_INVALID_DATA);
+                children[i++] = e1;
+                print_bl_extent(e1);
+        } else
+                merge1 = e1;
+        _prep_new_extent(e2, be, offset,
+                         min(length, be->be_f_offset + be->be_length - offset),
+                         PNFS_BLOCK_READWRITE_DATA);
+        children[i++] = e2;
+        print_bl_extent(e2);
+        if (offset + length < be->be_f_offset + be->be_length) {
+                _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
+                                 be->be_f_offset + be->be_length -
+                                 offset - length,
+                                 PNFS_BLOCK_INVALID_DATA);
+                children[i++] = e3;
+                print_bl_extent(e3);
+        } else
+                merge2 = e3;
+        /* Remove be from list, and insert the e* */
+        /* We don't get refs on e*, since this list is the base reference
+         * set when init'ed.
+         */
+        if (i < 3)
+                children[i] = NULL;
+        new = children[0];
+        list_replace(&be->be_node, &new->be_node);
+        bl_put_extent(be);
+        new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
+        for (j = 1; j < i; j++) {
+                old = new;
+                new = children[j];
+                list_add(&new->be_node, &old->be_node);
+        }
+        if (merge2) {
+                /* This is a HACK, should just create a _back_merge function */
+                new = list_entry(new->be_node.next,
+                                 struct pnfs_block_extent, be_node);
+                new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
+        }
+        spin_unlock(&bl->bl_ext_lock);
+        /* Since we removed the base reference above, be is now scheduled for
+         * destruction.
+         */
+        bl_put_extent(be);
+        dprintk("%s returns %llu after split\n", __func__, rv);
+        return rv;
+ out_nosplit:
+        kfree(e1);
+        kfree(e2);
+        kfree(e3);
+        dprintk("%s returns %llu without splitting\n", __func__, rv);
+        return rv;
+}
+/*
+ * Post-process the short extents on bl->bl_committing once the
+ * layoutcommit has finished with @status.
+ *
+ * On success (status == 0) each entry's range is converted to read/write
+ * through repeated set_to_rw() calls and the entry is freed.  On failure
+ * each entry is taken off bl_committing and handed back to
+ * add_to_commitlist() under bl_ext_lock.
+ */
+void
+clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+                              const struct nfs4_layoutcommit_args *arg,
+                              int status)
+{
+        struct pnfs_block_short_extent *ext, *next;
+        dprintk("%s status %d\n", __func__, status);
+        list_for_each_entry_safe(ext, next, &bl->bl_committing, bse_node) {
+                u64 cur, stop;
+                if (unlikely(status)) {
+                        /* Commit failed: requeue the range */
+                        list_del(&ext->bse_node);
+                        spin_lock(&bl->bl_ext_lock);
+                        add_to_commitlist(bl, ext);
+                        spin_unlock(&bl->bl_ext_lock);
+                        continue;
+                }
+                cur = ext->bse_f_offset;
+                stop = cur + ext->bse_length;
+                /* set_to_rw() reports where the extent it handled ends */
+                do {
+                        cur = set_to_rw(bl, cur, stop - cur);
+                } while (cur < stop);
+                list_del(&ext->bse_node);
+                kfree(ext);
+        }
+}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 19ea7d9c75e..5833fbbf59b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = {
        .nrvers                 = ARRAY_SIZE(nfs_version),
        .version                = nfs_version,
        .stats                  = &nfs_rpcstat,
-        .pipe_dir_name          = "/nfs",
+        .pipe_dir_name          = NFS_PIPE_DIRNAME,
 };
 struct rpc_stat nfs_rpcstat = {
@@ -904,7 +904,9 @@ error:
 /*
 * Load up the server record from information gained in an fsinfo record
 */
-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+                                  struct nfs_fh *mntfh,
+                                  struct nfs_fsinfo *fsinfo)
 {
        unsigned long max_rpc_payload;
@@ -934,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
                server->wsize = NFS_MAX_FILE_IO_SIZE;
        server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-        set_pnfs_layoutdriver(server, fsinfo->layouttype);
+        server->pnfs_blksize = fsinfo->blksize;
+        set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
        server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
@@ -980,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
        if (error < 0)
                goto out_error;
-        nfs_server_set_fsinfo(server, &fsinfo);
+        nfs_server_set_fsinfo(server, mntfh, &fsinfo);
        /* Get some general file system info */
        if (server->namelen == 0) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 57f578e2560..b238d95ac48 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = {
 #endif /* CONFIG_NFS_V4 */
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred)
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
 {
        struct nfs_open_dir_context *ctx;
        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (ctx != NULL) {
                ctx->duped = 0;
+                ctx->attr_gencount = NFS_I(dir)->attr_gencount;
                ctx->dir_cookie = 0;
                ctx->dup_cookie = 0;
                ctx->cred = get_rpccred(cred);
-        } else
+                return ctx;
-                ctx = ERR_PTR(-ENOMEM);
+        }
-        return ctx;
+        return  ERR_PTR(-ENOMEM);
 }
 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
@@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
        cred = rpc_lookup_cred();
        if (IS_ERR(cred))
                return PTR_ERR(cred);
-        ctx = alloc_nfs_open_dir_context(cred);
+        ctx = alloc_nfs_open_dir_context(inode, cred);
        if (IS_ERR(ctx)) {
                res = PTR_ERR(ctx);
                goto out;
@@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 {
        loff_t diff = desc->file->f_pos - desc->current_index;
        unsigned int index;
-        struct nfs_open_dir_context *ctx = desc->file->private_data;
        if (diff < 0)
                goto out_eof;
@@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
        index = (unsigned int)diff;
        *desc->dir_cookie = array->array[index].cookie;
        desc->cache_entry_index = index;
-        ctx->duped = 0;
        return 0;
 out_eof:
        desc->eof = 1;
@@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
        int i;
        loff_t new_pos;
        int status = -EAGAIN;
-        struct nfs_open_dir_context *ctx = desc->file->private_data;
        for (i = 0; i < array->size; i++) {
                if (array->array[i].cookie == *desc->dir_cookie) {
+                        struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
+                        struct nfs_open_dir_context *ctx = desc->file->private_data;
                        new_pos = desc->current_index + i;
-                        if (new_pos < desc->file->f_pos) {
+                        if (ctx->attr_gencount != nfsi->attr_gencount
+                            || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+                                ctx->duped = 0;
+                                ctx->attr_gencount = nfsi->attr_gencount;
+                        } else if (new_pos < desc->file->f_pos) {
+                                if (ctx->duped > 0
+                                    && ctx->dup_cookie == *desc->dir_cookie) {
+                                        if (printk_ratelimit()) {
+                                                pr_notice("NFS: directory %s/%s contains a readdir loop."
+                                                                "Please contact your server vendor.  "
+                                                                "The file: %s has duplicate cookie %llu\n",
+                                                                desc->file->f_dentry->d_parent->d_name.name,
+                                                                desc->file->f_dentry->d_name.name,
+                                                                array->array[i].string.name,
+                                                                *desc->dir_cookie);
+                                        }
+                                        status = -ELOOP;
+                                        goto out;
+                                }
                                ctx->dup_cookie = *desc->dir_cookie;
-                                ctx->duped = 1;
+                                ctx->duped = -1;
                        }
                        desc->file->f_pos = new_pos;
                        desc->cache_entry_index = i;
@@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
                if (*desc->dir_cookie == array->last_cookie)
                        desc->eof = 1;
        }
+out:
        return status;
 }
@@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
        struct nfs_cache_array *array = NULL;
        struct nfs_open_dir_context *ctx = file->private_data;
-        if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
-                if (printk_ratelimit()) {
-                        pr_notice("NFS: directory %s/%s contains a readdir loop.  "
-                                "Please contact your server vendor.  "
-                                "Offending cookie: %llu\n",
-                                file->f_dentry->d_parent->d_name.name,
-                                file->f_dentry->d_name.name,
-                                *desc->dir_cookie);
-                }
-                res = -ELOOP;
-                goto out;
-        }
        array = nfs_readdir_get_array(desc->page);
        if (IS_ERR(array)) {
                res = PTR_ERR(array);
@@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
                        *desc->dir_cookie = array->array[i+1].cookie;
                else
                        *desc->dir_cookie = array->last_cookie;
+                if (ctx->duped != 0)
+                        ctx->duped = 1;
        }
        if (array->eof_index >= 0)
                desc->eof = 1;
@@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
        struct page     *page = NULL;
        int             status;
        struct inode *inode = desc->file->f_path.dentry->d_inode;
+        struct nfs_open_dir_context *ctx = desc->file->private_data;
        dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
                        (unsigned long long)*desc->dir_cookie);
@@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
        desc->page_index = 0;
        desc->last_cookie = *desc->dir_cookie;
        desc->page = page;
+        ctx->duped = 0;
        status = nfs_readdir_xdr_to_array(desc, page, inode);
        if (status < 0)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1909ee8be35..1ec1a85fa71 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -318,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
 extern const u32 nfs4_pathconf_bitmap[2];
-extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fsinfo_bitmap[3];
 extern const u32 nfs4_fs_locations_bitmap[2];
 /* nfs4renewd.c */
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be93a622872..e8915d4840a 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
        pnfs_set_layoutcommit(wdata);
        dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
-                (unsigned long) wdata->lseg->pls_end_pos);
+                (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
 }
 /*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 079614deca3..8c77039e7a8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -140,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
        0
 };
-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
                        | FATTR4_WORD0_MAXREAD
                        | FATTR4_WORD0_MAXWRITE
                        | FATTR4_WORD0_LEASE_TIME,
                        FATTR4_WORD1_TIME_DELTA
-                        | FATTR4_WORD1_FS_LAYOUT_TYPES
+                        | FATTR4_WORD1_FS_LAYOUT_TYPES,
+                        FATTR4_WORD2_LAYOUT_BLKSIZE
 };
 const u32 nfs4_fs_locations_bitmap[2] = {
@@ -5834,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
        return status;
 }
+/*
+ * Issue one GETDEVICELIST call for the file handle @fh, asking for
+ * devices of the server's current layout driver class.  The decoded
+ * reply is stored through @devlist.
+ *
+ * Returns the status of the synchronous call.
+ */
+static int _nfs4_getdevicelist(struct nfs_server *server,
+                                    const struct nfs_fh *fh,
+                                    struct pnfs_devicelist *devlist)
+{
+        struct nfs4_getdevicelist_args args = {
+                .fh = fh,
+                .layoutclass = server->pnfs_curr_ld->id,
+        };
+        struct nfs4_getdevicelist_res res = {
+                .devlist = devlist,
+        };
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
+                .rpc_argp = &args,
+                .rpc_resp = &res,
+        };
+        int rc;
+        dprintk("--> %s\n", __func__);
+        rc = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+                            &res.seq_res, 0);
+        dprintk("<-- %s status=%d\n", __func__, rc);
+        return rc;
+}
+/*
+ * Fetch the device list for @fh into @devlist, repeating the call for
+ * as long as nfs4_handle_exception() asks for a retry.
+ *
+ * Returns the value produced by nfs4_handle_exception() for the last
+ * attempt.
+ */
+int nfs4_proc_getdevicelist(struct nfs_server *server,
+                            const struct nfs_fh *fh,
+                            struct pnfs_devicelist *devlist)
+{
+        struct nfs4_exception exception = { };
+        int err;
+        do {
+                int rc = _nfs4_getdevicelist(server, fh, devlist);
+                err = nfs4_handle_exception(server, rc, &exception);
+        } while (exception.retry);
+        dprintk("%s: err=%d, num_devs=%u\n", __func__,
+                err, devlist->num_devs);
+        return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 {
@@ -5912,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 static void nfs4_layoutcommit_release(void *calldata)
 {
        struct nfs4_layoutcommit_data *data = calldata;
+        struct pnfs_layout_segment *lseg, *tmp;
+        pnfs_cleanup_layoutcommit(data);
        /* Matched by references in pnfs_set_layoutcommit */
-        put_lseg(data->lseg);
+        list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
+                list_del_init(&lseg->pls_lc_list);
+                if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
+                                       &lseg->pls_flags))
+                        put_lseg(lseg);
+        }
        put_rpccred(data->cred);
        kfree(data);
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c191a9baa42..1dce12f41a4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
 #define encode_restorefh_maxsz  (op_encode_hdr_maxsz)
 #define decode_restorefh_maxsz  (op_decode_hdr_maxsz)
 #define encode_fsinfo_maxsz     (encode_getattr_maxsz)
-#define decode_fsinfo_maxsz     (op_decode_hdr_maxsz + 15)
+/* The 5 accounts for the PNFS attributes, and assumes that at most three
+ * layout types will be returned.
+ */
+#define decode_fsinfo_maxsz     (op_decode_hdr_maxsz + \
+                                 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
 #define encode_renew_maxsz      (op_encode_hdr_maxsz + 3)
 #define decode_renew_maxsz      (op_decode_hdr_maxsz)
 #define encode_setclientid_maxsz \
@@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int);
                                XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz   (op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz   (op_decode_hdr_maxsz + 4)
+#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
+                                encode_verifier_maxsz)
+#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
+                                2 /* nfs_cookie4 gdlr_cookie */ + \
+                                decode_verifier_maxsz \
+                                  /* verifier4 gdlr_verifier */ + \
+                                1 /* gdlr_deviceid_list count */ + \
+                                XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
+                                            NFS4_DEVICEID4_SIZE) \
+                                  /* gdlr_deviceid_list */ + \
+                                1 /* bool gdlr_eof */)
 #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
                                XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
 #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -748,6 +763,14 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz    (compound_decode_hdr_maxsz + \
                                         decode_sequence_maxsz + \
                                         decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
+                                encode_putfh_maxsz + \
+                                encode_getdevicelist_maxsz)
+#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
+                                decode_putfh_maxsz + \
+                                decode_getdevicelist_maxsz)
 #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
                                encode_sequence_maxsz +\
                                encode_getdeviceinfo_maxsz)
@@ -1104,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
        hdr->replen += decode_getattr_maxsz;
 }
+/*
+ * Encode a GETATTR op with an attribute bitmap of up to three words.
+ *
+ * Trailing zero words are left off: three words are sent when bm2 is
+ * non-zero, two when only bm1 is, and a single word otherwise.  The op
+ * count and expected reply length in @hdr are updated.
+ */
+static void
+encode_getattr_three(struct xdr_stream *xdr,
+                     uint32_t bm0, uint32_t bm1, uint32_t bm2,
+                     struct compound_hdr *hdr)
+{
+        const uint32_t words[3] = { bm0, bm1, bm2 };
+        uint32_t nwords, i;
+        __be32 *p;
+        if (bm2)
+                nwords = 3;
+        else if (bm1)
+                nwords = 2;
+        else
+                nwords = 1;
+        p = reserve_space(xdr, 4);
+        *p = cpu_to_be32(OP_GETATTR);
+        /* One word for the bitmap length plus one per bitmap word */
+        p = reserve_space(xdr, 4 + 4 * nwords);
+        *p++ = cpu_to_be32(nwords);
+        for (i = 0; i < nwords; i++)
+                *p++ = cpu_to_be32(words[i]);
+        hdr->nops++;
+        hdr->replen += decode_getattr_maxsz;
+}
 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
        encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1112,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
 static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
-        encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
+        encode_getattr_three(xdr,
-                           bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
+                             bitmask[0] & nfs4_fsinfo_bitmap[0],
+                             bitmask[1] & nfs4_fsinfo_bitmap[1],
+                             bitmask[2] & nfs4_fsinfo_bitmap[2],
+                             hdr);
 }
 static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1855,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr,
 #ifdef CONFIG_NFS_V4_1
 static void
+encode_getdevicelist(struct xdr_stream *xdr,
+                     const struct nfs4_getdevicelist_args *args,
+                     struct compound_hdr *hdr)
+{
+        __be32 *p;
+        nfs4_verifier dummy = {
+                .data = "dummmmmy",
+        };
+        p = reserve_space(xdr, 20);
+        *p++ = cpu_to_be32(OP_GETDEVICELIST);
+        *p++ = cpu_to_be32(args->layoutclass);
+        *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
+        xdr_encode_hyper(p, 0ULL);                          /* cookie */
+        encode_nfs4_verifier(xdr, &dummy);
+        hdr->nops++;
+        hdr->replen += decode_getdevicelist_maxsz;
+}
+static void
 encode_getdeviceinfo(struct xdr_stream *xdr,
                     const struct nfs4_getdeviceinfo_args *args,
                     struct compound_hdr *hdr)
@@ -1916,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
        *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
        /* Only whole file layouts */
        p = xdr_encode_hyper(p, 0); /* offset */
-        p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
+        p = xdr_encode_hyper(p, args->lastbytewritten + 1);     /* length */
        *p++ = cpu_to_be32(0); /* reclaim */
        p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
        *p++ = cpu_to_be32(1); /* newoffset = TRUE */
@@ -2604,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
        struct compound_hdr hdr = {
                .nops   = 0,
        };
-        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+        const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
        encode_compound_hdr(xdr, req, &hdr);
        encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2748,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
        };
-        const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+        const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
        encode_compound_hdr(xdr, req, &hdr);
        encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -2775,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
 }
 /*
+ * Encode GETDEVICELIST request
+ */
+static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
+                                       struct xdr_stream *xdr,
+                                       struct nfs4_getdevicelist_args *args)
+{
+        struct compound_hdr hdr = {
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+        };
+        encode_compound_hdr(xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, args->fh, &hdr);
+        encode_getdevicelist(xdr, args, &hdr);
+        encode_nops(&hdr);
+}
+/*
 * Encode GETDEVICEINFO request
 */
 static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -3011,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
                goto out_overflow;
        bmlen = be32_to_cpup(p);
-        bitmap[0] = bitmap[1] = 0;
+        bitmap[0] = bitmap[1] = bitmap[2] = 0;
        p = xdr_inline_decode(xdr, (bmlen << 2));
        if (unlikely(!p))
                goto out_overflow;
        if (bmlen > 0) {
                bitmap[0] = be32_to_cpup(p++);
-                if (bmlen > 1)
+                if (bmlen > 1) {
-                        bitmap[1] = be32_to_cpup(p);
+                        bitmap[1] = be32_to_cpup(p++);
+                        if (bmlen > 2)
+                                bitmap[2] = be32_to_cpup(p);
+                }
        }
        return 0;
 out_overflow:
@@ -3050,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
                        return ret;
                bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
        } else
-                bitmask[0] = bitmask[1] = 0;
+                bitmask[0] = bitmask[1] = bitmask[2] = 0;
-        dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
+        dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+                bitmask[0], bitmask[1], bitmask[2]);
        return 0;
 }
@@ -4105,7 +4202,7 @@ out_overflow:
 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
 {
        __be32 *savep;
-        uint32_t attrlen, bitmap[2] = {0};
+        uint32_t attrlen, bitmap[3] = {0};
        int status;
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4131,7 +4228,7 @@ xdr_error:
 static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
 {
        __be32 *savep;
-        uint32_t attrlen, bitmap[2] = {0};
+        uint32_t attrlen, bitmap[3] = {0};
        int status;
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4163,7 +4260,7 @@ xdr_error:
 static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
 {
        __be32 *savep;
-        uint32_t attrlen, bitmap[2] = {0};
+        uint32_t attrlen, bitmap[3] = {0};
        int status;
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4303,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
 {
        __be32 *savep;
        uint32_t attrlen,
-                 bitmap[2] = {0};
+                 bitmap[3] = {0};
        int status;
        status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4389,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
        return status;
 }
+/*
+ * The preferred block size for layout directed io.
+ *
+ * *res is set to 0 first.  If the LAYOUT_BLKSIZE bit is set in word 2 of
+ * @bitmap, one 32-bit value is read from the stream into *res and the
+ * bit is cleared.  Returns 0, or -EIO if the stream is too short.
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+                                      uint32_t *res)
+{
+        __be32 *p;
+        dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+        *res = 0;
+        /* Attribute not present in the reply: nothing to decode */
+        if (!(bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE))
+                return 0;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p)) {
+                print_overflow_msg(__func__, xdr);
+                return -EIO;
+        }
+        *res = be32_to_cpup(p);
+        bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+        return 0;
+}
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
        __be32 *savep;
-        uint32_t attrlen, bitmap[2];
+        uint32_t attrlen, bitmap[3];
        int status;
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4420,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
        status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
        if (status != 0)
                goto xdr_error;
+        status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
+        if (status)
+                goto xdr_error;
        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
@@ -4839,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 {
        __be32 *savep;
        uint32_t attrlen,
-                 bitmap[2] = {0};
+                 bitmap[3] = {0};
        struct kvec *iov = req->rq_rcv_buf.head;
        int status;
@@ -5268,6 +5390,53 @@ out_overflow:
 }
 #if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode the result of a GETDEVICELIST operation into @res.
+ *
+ * Fields are consumed in this order: an 8-byte cookie (skipped), an
+ * 8-byte verifier (decoded into a scratch buffer and not used further),
+ * a 32-bit device count, that many NFS4_DEVICEID4_SIZE-byte device ids,
+ * and a 32-bit eof flag.
+ *
+ * Returns 0 on success, the decode_op_hdr() status if the op header
+ * check fails, or -EIO if xdr_inline_decode() fails or the decoded
+ * device count exceeds NFS4_PNFS_GETDEVLIST_MAXNUM.
+ *
+ * TODO: Need to handle case when EOF != true;
+ */
+static int decode_getdevicelist(struct xdr_stream *xdr,
+                                struct pnfs_devicelist *res)
+{
+        __be32 *p;
+        int status;
+        unsigned int i;         /* unsigned, like the res->num_devs it is compared with */
+        struct nfs_writeverf verftemp;
+        status = decode_op_hdr(xdr, OP_GETDEVICELIST);
+        if (status)
+                return status;
+        p = xdr_inline_decode(xdr, 8 + 8 + 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        /* TODO: Skip cookie for now */
+        p += 2;
+        /* Read verifier */
+        p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
+        res->num_devs = be32_to_cpup(p);
+        dprintk("%s: num_dev %u\n", __func__, res->num_devs);
+        /* Reject the count before it is used to size the next read and to index dev_id[] */
+        if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
+                printk(KERN_ERR "%s too many result dev_num %u\n",
+                                __func__, res->num_devs);
+                return -EIO;
+        }
+        /* All device ids plus the trailing 4-byte eof word */
+        p = xdr_inline_decode(xdr,
+                              res->num_devs * NFS4_DEVICEID4_SIZE + 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        for (i = 0; i < res->num_devs; i++)
+                p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
+                                            NFS4_DEVICEID4_SIZE);
+        res->eof = be32_to_cpup(p);
+        return 0;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
                                struct pnfs_device *pdev)
@@ -5430,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
        int status;
        status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+        res->status = status;
        if (status)
                return status;
@@ -6542,6 +6712,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
 }
 /*
+ * Decode GETDEVICELIST response
+ *
+ * Decodes the compound header and then the SEQUENCE, PUTFH and
+ * GETDEVICELIST results in that order, stopping at the first step that
+ * returns a non-zero status.  The device list is decoded into
+ * res->devlist.  Returns the status of the last decode step that ran.
+ */
+static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
+                                      struct xdr_stream *xdr,
+                                      struct nfs4_getdevicelist_res *res)
+{
+        struct compound_hdr hdr;
+        int status;
+        dprintk("decoding getdevicelist!\n");
+        status = decode_compound_hdr(xdr, &hdr);
+        if (status != 0)
+                goto out;
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
+        if (status != 0)
+                goto out;
+        status = decode_putfh(xdr);
+        if (status != 0)
+                goto out;
+        status = decode_getdevicelist(xdr, res->devlist);
+out:
+        return status;
+}
+/*
 * Decode GETDEVINFO response
 */
 static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -6722,7 +6918,7 @@ out:
 int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
                       int plus)
 {
-        uint32_t bitmap[2] = {0};
+        uint32_t bitmap[3] = {0};
        uint32_t len;
        __be32 *p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
@@ -6908,6 +7104,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
        PROC(SECINFO_NO_NAME,   enc_secinfo_no_name,    dec_secinfo_no_name),
        PROC(TEST_STATEID,      enc_test_stateid,       dec_test_stateid),
        PROC(FREE_STATEID,      enc_free_stateid,       dec_free_stateid),
+        PROC(GETDEVICELIST,     enc_getdevicelist,      dec_getdevicelist),
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 38e5508555c..e550e8836c3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -76,8 +76,11 @@ find_pnfs_driver(u32 id)
 void
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 {
-        if (nfss->pnfs_curr_ld)
+        if (nfss->pnfs_curr_ld) {
+                if (nfss->pnfs_curr_ld->clear_layoutdriver)
+                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                module_put(nfss->pnfs_curr_ld->owner);
+        }
        nfss->pnfs_curr_ld = NULL;
 }
@@ -88,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
 void
-set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
+                      u32 id)
 {
        struct pnfs_layoutdriver_type *ld_type = NULL;
@@ -115,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
                goto out_no_driver;
        }
        server->pnfs_curr_ld = ld_type;
+        if (ld_type->set_layoutdriver
+            && ld_type->set_layoutdriver(server, mntfh)) {
+                printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
+                                __func__, id);
+                module_put(ld_type->owner);
+                goto out_no_driver;
+        }
        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;
@@ -190,6 +201,7 @@ static void
 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+        put_rpccred(lo->plh_lc_cred);
        return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
 }
@@ -224,6 +236,7 @@ static void
 init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 {
        INIT_LIST_HEAD(&lseg->pls_list);
+        INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
        smp_mb();
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
@@ -816,7 +829,9 @@ out:
 }
 static struct pnfs_layout_hdr *
-alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+alloc_init_layout_hdr(struct inode *ino,
+                      struct nfs_open_context *ctx,
+                      gfp_t gfp_flags)
 {
        struct pnfs_layout_hdr *lo;
@@ -828,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_recall);
        lo->plh_inode = ino;
+        lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
        return lo;
 }
 static struct pnfs_layout_hdr *
-pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
+pnfs_find_alloc_layout(struct inode *ino,
+                       struct nfs_open_context *ctx,
+                       gfp_t gfp_flags)
 {
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *new = NULL;
@@ -847,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
                        return nfsi->layout;
        }
        spin_unlock(&ino->i_lock);
-        new = alloc_init_layout_hdr(ino, gfp_flags);
+        new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
        spin_lock(&ino->i_lock);
        if (likely(nfsi->layout == NULL))       /* Won the race? */
@@ -940,7 +958,7 @@ pnfs_update_layout(struct inode *ino,
        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
                return NULL;
        spin_lock(&ino->i_lock);
-        lo = pnfs_find_alloc_layout(ino, gfp_flags);
+        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
                dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
                goto out_unlock;
@@ -1350,16 +1368,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
 /*
- * Currently there is only one (whole file) write lseg.
+ * There can be multiple RW segments.
+ *
+ * Walk the inode's layout segment list (plh_segs) and add to @listp,
+ * linked through pls_lc_list, every segment whose iomode is IOMODE_RW
+ * and which has the NFS_LSEG_LAYOUTCOMMIT flag set.
 */
-static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
+static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 {
-        struct pnfs_layout_segment *lseg, *rv = NULL;
+        struct pnfs_layout_segment *lseg;
-        list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
+        list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
-                if (lseg->pls_range.iomode == IOMODE_RW)
+                if (lseg->pls_range.iomode == IOMODE_RW &&
-                        rv = lseg;
+                    test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
-        return rv;
+                        list_add(&lseg->pls_lc_list, listp);
+        }
 }
 void
@@ -1371,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
        spin_lock(&nfsi->vfs_inode.i_lock);
        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
-                /* references matched in nfs4_layoutcommit_release */
-                get_lseg(wdata->lseg);
-                wdata->lseg->pls_lc_cred =
-                        get_rpccred(wdata->args.context->state->owner->so_cred);
                mark_as_dirty = true;
                dprintk("%s: Set layoutcommit for inode %lu ",
                        __func__, wdata->inode->i_ino);
        }
-        if (end_pos > wdata->lseg->pls_end_pos)
+        if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
-                wdata->lseg->pls_end_pos = end_pos;
+                /* references matched in nfs4_layoutcommit_release */
+                get_lseg(wdata->lseg);
+        }
+        if (end_pos > nfsi->layout->plh_lwb)
+                nfsi->layout->plh_lwb = end_pos;
        spin_unlock(&nfsi->vfs_inode.i_lock);
+        dprintk("%s: lseg %p end_pos %llu\n",
+                __func__, wdata->lseg, nfsi->layout->plh_lwb);
        /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
         * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
@@ -1390,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
+{
+        struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+        if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
+                nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
+}
 /*
 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
@@ -1403,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
        struct nfs4_layoutcommit_data *data;
        struct nfs_inode *nfsi = NFS_I(inode);
-        struct pnfs_layout_segment *lseg;
-        struct rpc_cred *cred;
        loff_t end_pos;
        int status = 0;
@@ -1421,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
                goto out;
        }
+        INIT_LIST_HEAD(&data->lseg_list);
        spin_lock(&inode->i_lock);
        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
                spin_unlock(&inode->i_lock);
                kfree(data);
                goto out;
        }
-        /*
-         * Currently only one (whole file) write lseg which is referenced
-         * in pnfs_set_layoutcommit and will be found.
-         */
-        lseg = pnfs_list_write_lseg(inode);
-        end_pos = lseg->pls_end_pos;
+        pnfs_list_write_lseg(inode, &data->lseg_list);
-        cred = lseg->pls_lc_cred;
-        lseg->pls_end_pos = 0;
+        end_pos = nfsi->layout->plh_lwb;
-        lseg->pls_lc_cred = NULL;
+        nfsi->layout->plh_lwb = 0;
        memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
                sizeof(nfsi->layout->plh_stateid.data));
        spin_unlock(&inode->i_lock);
        data->args.inode = inode;
-        data->lseg = lseg;
+        data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
-        data->cred = cred;
        nfs_fattr_init(&data->fattr);
        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
        data->res.fattr = &data->fattr;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 078670dfbe0..e0b5d80a43f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -36,16 +36,16 @@
 enum {
        NFS_LSEG_VALID = 0,     /* cleared when lseg is recalled/returned */
        NFS_LSEG_ROC,           /* roc bit received from server */
+        NFS_LSEG_LAYOUTCOMMIT,  /* layoutcommit bit set for layoutcommit */
 };
 struct pnfs_layout_segment {
        struct list_head pls_list;
+        struct list_head pls_lc_list;
        struct pnfs_layout_range pls_range;
        atomic_t pls_refcount;
        unsigned long pls_flags;
        struct pnfs_layout_hdr *pls_layout;
-        struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
-        loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
 };
 enum pnfs_try_status {
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {
        struct module *owner;
        unsigned flags;
+        int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
+        int (*clear_layoutdriver) (struct nfs_server *);
        struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
        void (*free_layout_hdr) (struct pnfs_layout_hdr *);
@@ -110,6 +113,8 @@ struct pnfs_layoutdriver_type {
                                     struct xdr_stream *xdr,
                                     const struct nfs4_layoutreturn_args *args);
+        void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
        void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
                                     struct xdr_stream *xdr,
                                     const struct nfs4_layoutcommit_args *args);
@@ -125,6 +130,8 @@ struct pnfs_layout_hdr {
        unsigned long           plh_block_lgets; /* block LAYOUTGET if >0 */
        u32                     plh_barrier; /* ignore lower seqids */
        unsigned long           plh_flags;
+        loff_t                  plh_lwb; /* last write byte for layoutcommit */
+        struct rpc_cred         *plh_lc_cred; /* layoutcommit cred */
        struct inode            *plh_inode;
 };
@@ -137,10 +144,21 @@ struct pnfs_device {
        unsigned int  pglen;
 };
+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+struct pnfs_devicelist {
+        unsigned int            eof;            /* eof word decoded from the GETDEVICELIST reply */
+        unsigned int            num_devs;       /* number of entries filled in dev_id[] */
+        struct nfs4_deviceid    dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];    /* decoded device ids */
+};
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 /* nfs4proc.c */
+extern int nfs4_proc_getdevicelist(struct nfs_server *server,
+                                   const struct nfs_fh *fh,
+                                   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
                                   struct pnfs_device *dev);
 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
@@ -153,7 +171,7 @@ void put_lseg(struct pnfs_layout_segment *lseg);
 bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
 bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
-void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -179,6 +197,7 @@ void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
 int pnfs_ld_write_done(struct nfs_write_data *);
@@ -360,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)
        return false;
 }
-static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+/* Empty stub. NOTE(review): appears to be the variant used when pNFS is compiled out - confirm. */
+static inline void set_pnfs_layoutdriver(struct nfs_server *s,
+                                         const struct nfs_fh *mntfh, u32 id)
 {
 }
author	Linus Torvalds <torvalds@linux-foundation.org>	2011-07-31 12:26:50 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-07-31 12:26:50 -0400
commit	24c3047095fa3954f114bfff2e37b8fcbb216396 (patch)
tree	a2263a4425d511ae619ca8b055705261dab9ec12 /fs
parent	6581058f44533f9d45548bcfe986c125376859e9 (diff)
parent	71cdd40fd498f12679070def668f6a4719ddbd1c (diff)