path: root/fs
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/v9fs.c | 2
-rw-r--r-- fs/9p/vfs_file.c | 2
-rw-r--r-- fs/Kconfig | 24
-rw-r--r-- fs/Kconfig.binfmt | 7
-rw-r--r-- fs/Makefile | 2
-rw-r--r-- fs/affs/affs.h | 2
-rw-r--r-- fs/affs/amigaffs.c | 13
-rw-r--r-- fs/affs/bitmap.c | 1
-rw-r--r-- fs/affs/dir.c | 11
-rw-r--r-- fs/affs/file.c | 49
-rw-r--r-- fs/affs/inode.c | 7
-rw-r--r-- fs/affs/namei.c | 47
-rw-r--r-- fs/affs/super.c | 69
-rw-r--r-- fs/afs/rxrpc.c | 14
-rw-r--r-- fs/afs/volume.c | 2
-rw-r--r-- fs/aio.c | 13
-rw-r--r-- fs/befs/linuxvfs.c | 6
-rw-r--r-- fs/binfmt_som.c | 299
-rw-r--r-- fs/block_dev.c | 77
-rw-r--r-- fs/btrfs/Kconfig | 1
-rw-r--r-- fs/btrfs/disk-io.c | 6
-rw-r--r-- fs/btrfs/extent_io.c | 2
-rw-r--r-- fs/btrfs/file.c | 3
-rw-r--r-- fs/btrfs/inode.c | 6
-rw-r--r-- fs/ceph/addr.c | 1
-rw-r--r-- fs/ceph/file.c | 2
-rw-r--r-- fs/ceph/inode.c | 2
-rw-r--r-- fs/ceph/locks.c | 63
-rw-r--r-- fs/ceph/mds_client.c | 4
-rw-r--r-- fs/ceph/super.c | 20
-rw-r--r-- fs/char_dev.c | 24
-rw-r--r-- fs/cifs/connect.c | 2
-rw-r--r-- fs/cifs/file.c | 35
-rw-r--r-- fs/cifs/inode.c | 2
-rw-r--r-- fs/coda/dir.c | 138
-rw-r--r-- fs/coda/inode.c | 2
-rw-r--r-- fs/configfs/configfs_internal.h | 2
-rw-r--r-- fs/configfs/inode.c | 17
-rw-r--r-- fs/configfs/mount.c | 11
-rw-r--r-- fs/dax.c | 534
-rw-r--r-- fs/dcache.c | 189
-rw-r--r-- fs/debugfs/inode.c | 291
-rw-r--r-- fs/dlm/netlink.c | 7
-rw-r--r-- fs/drop_caches.c | 14
-rw-r--r-- fs/ecryptfs/inode.c | 1
-rw-r--r-- fs/ecryptfs/main.c | 2
-rw-r--r-- fs/efivarfs/Kconfig | 1
-rw-r--r-- fs/efivarfs/super.c | 2
-rw-r--r-- fs/eventfd.c | 12
-rw-r--r-- fs/eventpoll.c | 4
-rw-r--r-- fs/exec.c | 10
-rw-r--r-- fs/exofs/inode.c | 3
-rw-r--r-- fs/exofs/super.c | 2
-rw-r--r-- fs/ext2/Kconfig | 11
-rw-r--r-- fs/ext2/Makefile | 1
-rw-r--r-- fs/ext2/ext2.h | 10
-rw-r--r-- fs/ext2/file.c | 44
-rw-r--r-- fs/ext2/ialloc.c | 2
-rw-r--r-- fs/ext2/inode.c | 38
-rw-r--r-- fs/ext2/namei.c | 13
-rw-r--r-- fs/ext2/super.c | 53
-rw-r--r-- fs/ext2/xip.c | 91
-rw-r--r-- fs/ext2/xip.h | 26
-rw-r--r-- fs/ext3/super.c | 2
-rw-r--r-- fs/ext4/ext4.h | 6
-rw-r--r-- fs/ext4/file.c | 50
-rw-r--r-- fs/ext4/indirect.c | 18
-rw-r--r-- fs/ext4/inode.c | 159
-rw-r--r-- fs/ext4/namei.c | 10
-rw-r--r-- fs/ext4/super.c | 93
-rw-r--r-- fs/f2fs/Kconfig | 10
-rw-r--r-- fs/f2fs/Makefile | 1
-rw-r--r-- fs/f2fs/acl.c | 6
-rw-r--r-- fs/f2fs/checkpoint.c | 95
-rw-r--r-- fs/f2fs/data.c | 218
-rw-r--r-- fs/f2fs/debug.c | 59
-rw-r--r-- fs/f2fs/dir.c | 3
-rw-r--r-- fs/f2fs/f2fs.h | 120
-rw-r--r-- fs/f2fs/file.c | 101
-rw-r--r-- fs/f2fs/gc.c | 38
-rw-r--r-- fs/f2fs/gc.h | 33
-rw-r--r-- fs/f2fs/inline.c | 32
-rw-r--r-- fs/f2fs/inode.c | 37
-rw-r--r-- fs/f2fs/namei.c | 2
-rw-r--r-- fs/f2fs/node.c | 154
-rw-r--r-- fs/f2fs/node.h | 45
-rw-r--r-- fs/f2fs/recovery.c | 11
-rw-r--r-- fs/f2fs/segment.c | 194
-rw-r--r-- fs/f2fs/segment.h | 29
-rw-r--r-- fs/f2fs/super.c | 75
-rw-r--r-- fs/f2fs/trace.c | 159
-rw-r--r-- fs/f2fs/trace.h | 46
-rw-r--r-- fs/fat/inode.c | 2
-rw-r--r-- fs/fs-writeback.c | 76
-rw-r--r-- fs/fs_pin.c | 96
-rw-r--r-- fs/fuse/file.c | 11
-rw-r--r-- fs/fuse/inode.c | 1
-rw-r--r-- fs/gfs2/acl.c | 2
-rw-r--r-- fs/gfs2/aops.c | 2
-rw-r--r-- fs/gfs2/dir.c | 3
-rw-r--r-- fs/gfs2/file.c | 5
-rw-r--r-- fs/gfs2/glock.c | 14
-rw-r--r-- fs/gfs2/inode.c | 3
-rw-r--r-- fs/gfs2/ops_fstype.c | 1
-rw-r--r-- fs/gfs2/quota.c | 11
-rw-r--r-- fs/gfs2/recovery.c | 2
-rw-r--r-- fs/gfs2/super.c | 2
-rw-r--r-- fs/gfs2/sys.c | 2
-rw-r--r-- fs/hugetlbfs/inode.c | 13
-rw-r--r-- fs/inode.c | 138
-rw-r--r-- fs/internal.h | 9
-rw-r--r-- fs/ioctl.c | 5
-rw-r--r-- fs/isofs/util.c | 18
-rw-r--r-- fs/jffs2/compr_rubin.c | 5
-rw-r--r-- fs/jffs2/scan.c | 5
-rw-r--r-- fs/jfs/endian24.h | 49
-rw-r--r-- fs/jfs/file.c | 2
-rw-r--r-- fs/jfs/jfs_dtree.c | 4
-rw-r--r-- fs/jfs/jfs_types.h | 55
-rw-r--r-- fs/jfs/jfs_xtree.h | 25
-rw-r--r-- fs/jfs/super.c | 3
-rw-r--r-- fs/kernfs/dir.c | 24
-rw-r--r-- fs/kernfs/file.c | 4
-rw-r--r-- fs/kernfs/inode.c | 13
-rw-r--r-- fs/kernfs/kernfs-internal.h | 1
-rw-r--r-- fs/kernfs/mount.c | 1
-rw-r--r-- fs/libfs.c | 2
-rw-r--r-- fs/lockd/mon.c | 13
-rw-r--r-- fs/lockd/svclock.c | 4
-rw-r--r-- fs/lockd/svcsubs.c | 26
-rw-r--r-- fs/lockd/xdr.c | 8
-rw-r--r-- fs/locks.c | 588
-rw-r--r-- fs/mount.h | 4
-rw-r--r-- fs/namei.c | 143
-rw-r--r-- fs/namespace.c | 50
-rw-r--r-- fs/ncpfs/dir.c | 98
-rw-r--r-- fs/ncpfs/inode.c | 3
-rw-r--r-- fs/ncpfs/ncp_fs_i.h | 1
-rw-r--r-- fs/ncpfs/ncplib_kernel.h | 30
-rw-r--r-- fs/nfs/Kconfig | 5
-rw-r--r-- fs/nfs/Makefile | 3
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r-- fs/nfs/callback.c | 8
-rw-r--r-- fs/nfs/delegation.c | 43
-rw-r--r-- fs/nfs/direct.c | 112
-rw-r--r-- fs/nfs/file.c | 1
-rw-r--r-- fs/nfs/filelayout/filelayout.c | 317
-rw-r--r-- fs/nfs/filelayout/filelayout.h | 40
-rw-r--r-- fs/nfs/filelayout/filelayoutdev.c | 469
-rw-r--r-- fs/nfs/flexfilelayout/Makefile | 5
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 1574
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.h | 155
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 552
-rw-r--r-- fs/nfs/idmap.c | 3
-rw-r--r-- fs/nfs/inode.c | 8
-rw-r--r-- fs/nfs/internal.h | 54
-rw-r--r-- fs/nfs/nfs2xdr.c | 10
-rw-r--r-- fs/nfs/nfs3_fs.h | 2
-rw-r--r-- fs/nfs/nfs3client.c | 41
-rw-r--r-- fs/nfs/nfs3proc.c | 9
-rw-r--r-- fs/nfs/nfs3super.c | 2
-rw-r--r-- fs/nfs/nfs3xdr.c | 3
-rw-r--r-- fs/nfs/nfs4_fs.h | 9
-rw-r--r-- fs/nfs/nfs4client.c | 7
-rw-r--r-- fs/nfs/nfs4proc.c | 315
-rw-r--r-- fs/nfs/nfs4state.c | 101
-rw-r--r-- fs/nfs/nfs4super.c | 4
-rw-r--r-- fs/nfs/nfs4xdr.c | 113
-rw-r--r-- fs/nfs/nfsroot.c | 4
-rw-r--r-- fs/nfs/objlayout/objio_osd.c | 5
-rw-r--r-- fs/nfs/pagelist.c | 300
-rw-r--r-- fs/nfs/pnfs.c | 471
-rw-r--r-- fs/nfs/pnfs.h | 135
-rw-r--r-- fs/nfs/pnfs_nfs.c | 840
-rw-r--r-- fs/nfs/read.c | 33
-rw-r--r-- fs/nfs/super.c | 33
-rw-r--r-- fs/nfs/write.c | 99
-rw-r--r-- fs/nfsd/Kconfig | 10
-rw-r--r-- fs/nfsd/Makefile | 8
-rw-r--r-- fs/nfsd/blocklayout.c | 189
-rw-r--r-- fs/nfsd/blocklayoutxdr.c | 157
-rw-r--r-- fs/nfsd/blocklayoutxdr.h | 62
-rw-r--r-- fs/nfsd/export.c | 8
-rw-r--r-- fs/nfsd/export.h | 2
-rw-r--r-- fs/nfsd/nfs4callback.c | 99
-rw-r--r-- fs/nfsd/nfs4layouts.c | 721
-rw-r--r-- fs/nfsd/nfs4proc.c | 310
-rw-r--r-- fs/nfsd/nfs4state.c | 97
-rw-r--r-- fs/nfsd/nfs4xdr.c | 362
-rw-r--r-- fs/nfsd/nfsctl.c | 9
-rw-r--r-- fs/nfsd/nfsd.h | 16
-rw-r--r-- fs/nfsd/nfsfh.h | 18
-rw-r--r-- fs/nfsd/nfssvc.c | 1
-rw-r--r-- fs/nfsd/pnfs.h | 86
-rw-r--r-- fs/nfsd/state.h | 43
-rw-r--r-- fs/nfsd/trace.c | 5
-rw-r--r-- fs/nfsd/trace.h | 54
-rw-r--r-- fs/nfsd/xdr4.h | 59
-rw-r--r-- fs/nfsd/xdr4cb.h | 7
-rw-r--r-- fs/nilfs2/file.c | 1
-rw-r--r-- fs/nilfs2/gcinode.c | 1
-rw-r--r-- fs/nilfs2/mdt.c | 6
-rw-r--r-- fs/nilfs2/page.c | 4
-rw-r--r-- fs/nilfs2/page.h | 3
-rw-r--r-- fs/nilfs2/super.c | 6
-rw-r--r-- fs/notify/Kconfig | 1
-rw-r--r-- fs/notify/fanotify/fanotify.c | 2
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 35
-rw-r--r-- fs/ntfs/file.c | 3
-rw-r--r-- fs/ocfs2/acl.c | 14
-rw-r--r-- fs/ocfs2/alloc.c | 18
-rw-r--r-- fs/ocfs2/aops.c | 242
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 3
-rw-r--r-- fs/ocfs2/cluster/tcp_internal.h | 12
-rw-r--r-- fs/ocfs2/dir.c | 10
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 6
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c | 4
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 14
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.h | 1
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 7
-rw-r--r-- fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r-- fs/ocfs2/dlmglue.c | 3
-rw-r--r-- fs/ocfs2/file.c | 80
-rw-r--r-- fs/ocfs2/file.h | 9
-rw-r--r-- fs/ocfs2/inode.c | 2
-rw-r--r-- fs/ocfs2/inode.h | 2
-rw-r--r-- fs/ocfs2/journal.c | 111
-rw-r--r-- fs/ocfs2/journal.h | 5
-rw-r--r-- fs/ocfs2/mmap.c | 1
-rw-r--r-- fs/ocfs2/namei.c | 284
-rw-r--r-- fs/ocfs2/namei.h | 8
-rw-r--r-- fs/ocfs2/ocfs2.h | 25
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 14
-rw-r--r-- fs/ocfs2/quota.h | 1
-rw-r--r-- fs/ocfs2/quota_local.c | 20
-rw-r--r-- fs/ocfs2/refcounttree.c | 2
-rw-r--r-- fs/ocfs2/reservations.c | 2
-rw-r--r-- fs/ocfs2/super.c | 51
-rw-r--r-- fs/ocfs2/xattr.c | 10
-rw-r--r-- fs/open.c | 15
-rw-r--r-- fs/proc/array.c | 44
-rw-r--r-- fs/proc/generic.c | 27
-rw-r--r-- fs/proc/inode.c | 2
-rw-r--r-- fs/proc/page.c | 16
-rw-r--r-- fs/proc/task_mmu.c | 250
-rw-r--r-- fs/proc/vmcore.c | 8
-rw-r--r-- fs/proc_namespace.c | 1
-rw-r--r-- fs/pstore/Kconfig | 10
-rw-r--r-- fs/pstore/Makefile | 2
-rw-r--r-- fs/pstore/inode.c | 26
-rw-r--r-- fs/pstore/internal.h | 6
-rw-r--r-- fs/pstore/platform.c | 5
-rw-r--r-- fs/pstore/pmsg.c | 114
-rw-r--r-- fs/pstore/ram.c | 53
-rw-r--r-- fs/quota/Kconfig | 1
-rw-r--r-- fs/quota/dquot.c | 107
-rw-r--r-- fs/quota/quota.c | 52
-rw-r--r-- fs/quota/quota_v1.c | 4
-rw-r--r-- fs/quota/quota_v2.c | 16
-rw-r--r-- fs/ramfs/file-nommu.c | 7
-rw-r--r-- fs/ramfs/inode.c | 21
-rw-r--r-- fs/read_write.c | 48
-rw-r--r-- fs/reiserfs/inode.c | 2
-rw-r--r-- fs/romfs/mmap-nommu.c | 10
-rw-r--r-- fs/romfs/super.c | 3
-rw-r--r-- fs/select.c | 2
-rw-r--r-- fs/seq_file.c | 32
-rw-r--r-- fs/splice.c | 23
-rw-r--r-- fs/super.c | 63
-rw-r--r-- fs/sync.c | 8
-rw-r--r-- fs/sysfs/file.c | 2
-rw-r--r-- fs/sysfs/group.c | 2
-rw-r--r-- fs/ubifs/debug.c | 4
-rw-r--r-- fs/ubifs/dir.c | 18
-rw-r--r-- fs/ubifs/file.c | 5
-rw-r--r-- fs/ubifs/replay.c | 19
-rw-r--r-- fs/ubifs/super.c | 6
-rw-r--r-- fs/ubifs/ubifs.h | 4
-rw-r--r-- fs/ubifs/xattr.c | 112
-rw-r--r-- fs/udf/Kconfig | 10
-rw-r--r-- fs/udf/inode.c | 32
-rw-r--r-- fs/udf/super.c | 5
-rw-r--r-- fs/ufs/super.c | 8
-rw-r--r-- fs/xfs/kmem.c | 10
-rw-r--r-- fs/xfs/kmem.h | 5
-rw-r--r-- fs/xfs/libxfs/xfs_attr_leaf.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 20
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_format.h | 24
-rw-r--r-- fs/xfs/libxfs/xfs_fs.h (renamed from fs/xfs/xfs_fs.h) | 0
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 320
-rw-r--r-- fs/xfs/libxfs/xfs_sb.h | 11
-rw-r--r-- fs/xfs/libxfs/xfs_shared.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_symlink_remote.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_trans_resv.c | 14
-rw-r--r-- fs/xfs/libxfs/xfs_trans_resv.h | 1
-rw-r--r-- fs/xfs/libxfs/xfs_types.h (renamed from fs/xfs/xfs_types.h) | 0
-rw-r--r-- fs/xfs/xfs_aops.c | 149
-rw-r--r-- fs/xfs/xfs_aops.h | 3
-rw-r--r-- fs/xfs/xfs_bmap_util.h | 37
-rw-r--r-- fs/xfs/xfs_buf.c | 13
-rw-r--r-- fs/xfs/xfs_buf_item.c | 6
-rw-r--r-- fs/xfs/xfs_dquot.h | 2
-rw-r--r-- fs/xfs/xfs_file.c | 67
-rw-r--r-- fs/xfs/xfs_fsops.c | 34
-rw-r--r-- fs/xfs/xfs_inode.c | 136
-rw-r--r-- fs/xfs/xfs_inode.h | 11
-rw-r--r-- fs/xfs/xfs_ioctl.c | 501
-rw-r--r-- fs/xfs/xfs_ioctl32.c | 2
-rw-r--r-- fs/xfs/xfs_iomap.c | 2
-rw-r--r-- fs/xfs/xfs_iomap.h | 2
-rw-r--r-- fs/xfs/xfs_iops.c | 21
-rw-r--r-- fs/xfs/xfs_log.c | 28
-rw-r--r-- fs/xfs/xfs_mount.c | 107
-rw-r--r-- fs/xfs/xfs_mount.h | 5
-rw-r--r-- fs/xfs/xfs_qm.c | 55
-rw-r--r-- fs/xfs/xfs_qm.h | 1
-rw-r--r-- fs/xfs/xfs_qm_syscalls.c | 88
-rw-r--r-- fs/xfs/xfs_quotaops.c | 59
-rw-r--r-- fs/xfs/xfs_super.c | 27
-rw-r--r-- fs/xfs/xfs_sysctl.c | 18
-rw-r--r-- fs/xfs/xfs_trans.c | 1
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 5
323 files changed, 12718 insertions, 5323 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 	init_rwsem(&v9ses->rename_sem);
 
-	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
 	if (rc) {
 		kfree(v9ses->aname);
 		kfree(v9ses->uname);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5594505e6e73..b40133796b87 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..ec35851e5b71 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -13,13 +13,6 @@ if BLOCK
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
-	bool
-	depends on EXT2_FS_XIP
-	default y
-
 source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
@@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 
+config FS_DAX
+	bool "Direct Access (DAX) support"
+	depends on MMU
+	depends on !(ARM || MIPS || SPARC)
+	help
+	  Direct Access (DAX) can be used on memory-backed block devices.
+	  If the block device supports DAX and the filesystem supports DAX,
+	  then you can avoid using the pagecache to buffer I/Os.  Turning
+	  on this option will compile in support for DAX; you will need to
+	  mount the filesystem using the -o dax option.
+
+	  If you do not have a block device that is capable of using this,
+	  or if unsure, say N.  Saying Y will increase the size of the kernel
+	  by about 5kB.
+
 endif # BLOCK
 
 # Posix ACL utility routines
@@ -165,6 +173,7 @@ config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
 source "fs/configfs/Kconfig"
+source "fs/efivarfs/Kconfig"
 
 endmenu
 
@@ -209,7 +218,6 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/efivarfs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
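For orientation, the FS_DAX help above says DAX support must exist in both the block device and the filesystem, and the mount needs -o dax. A minimal sketch of how a filesystem's direct-I/O entry point is then expected to route requests — assuming the IS_DAX() and dax_do_io() helpers added by fs/dax.c elsewhere in this series; example_get_block is a hypothetical stand-in for the filesystem's own get_block_t callback:

static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create);

static ssize_t example_direct_IO(int rw, struct kiocb *iocb,
				 struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (IS_DAX(inode))
		/* memory-backed device: copy in place, bypass the pagecache */
		return dax_do_io(rw, iocb, inode, iter, offset,
				 example_get_block, NULL, DIO_LOCKING);

	return blockdev_direct_IO(rw, iocb, inode, iter, offset,
				  example_get_block);
}

Note the decision is per-inode (the S_DAX flag) rather than per-mount, so DAX and pagecache-backed inodes can in principle coexist on one superblock.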
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c055d56ec63d..270c48148f79 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -149,13 +149,6 @@ config BINFMT_EM86
 	  later load the module when you want to use a Linux/Intel binary. The
 	  module will be called binfmt_em86.  If unsure, say Y.
 
-config BINFMT_SOM
-	tristate "Kernel support for SOM binaries"
-	depends on PARISC && HPUX
-	help
-	  SOM is a binary executable format inherited from HP/UX.  Say
-	  Y here to be able to load and execute SOM binaries directly.
-
 config BINFMT_MISC
 	tristate "Kernel support for MISC binaries"
 	---help---
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..a88ac4838c9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)		+= aio.o
+obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
-obj-$(CONFIG_BINFMT_SOM)	+= binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index ff44ff3ff015..c8764bd7497d 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -30,6 +30,8 @@
 #define AFFS_AC_SIZE	(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
 #define AFFS_AC_MASK	(AFFS_AC_SIZE-1)
 
+#define AFFSNAMEMAX 30U
+
 struct affs_ext_key {
 	u32	ext;	/* idx of the extended block */
 	u32	key;	/* block number */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index c852f2fa1710..388da1ea815d 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -30,7 +30,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
 	ino = bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
 
-	pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
+	pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino);
 
 	dir_bh = affs_bread(sb, dir->i_ino);
 	if (!dir_bh)
@@ -80,8 +80,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
 	sb = dir->i_sb;
 	rem_ino = rem_bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
-	pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
-		 __func__, (u32)dir->i_ino, rem_ino, offset);
+	pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino,
+		 rem_ino, offset);
 
 	bh = affs_bread(sb, dir->i_ino);
 	if (!bh)
@@ -483,11 +483,10 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 {
 	int	 i;
 
-	if (len > 30) {
+	if (len > AFFSNAMEMAX) {
 		if (notruncate)
 			return -ENAMETOOLONG;
-		else
-			len = 30;
+		len = AFFSNAMEMAX;
 	}
 	for (i = 0; i < len; i++) {
 		if (name[i] < ' ' || name[i] == ':'
@@ -508,7 +507,7 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 int
 affs_copy_name(unsigned char *bstr, struct dentry *dentry)
 {
-	int len = min(dentry->d_name.len, 30u);
+	u32 len = min(dentry->d_name.len, AFFSNAMEMAX);
 
 	*bstr++ = len;
 	memcpy(bstr, dentry->d_name.name, len);
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c8de51185c23..675148950fed 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -99,7 +99,6 @@ err_bh_read:
 
 err_range:
 	affs_error(sb, "affs_free_block","Block %u outside partition", block);
-	return;
 }
 
 /*
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 59f07bec92a6..ac4f318aafba 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -54,8 +54,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	u32	ino;
 	int	error = 0;
 
-	pr_debug("%s(ino=%lu,f_pos=%lx)\n",
-		 __func__, inode->i_ino, (unsigned long)ctx->pos);
+	pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
 
 	if (ctx->pos < 2) {
 		file->private_data = (void *)0;
@@ -115,11 +114,11 @@ inside:
 			break;
 		}
 
-		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
+		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0],
+			      (u8)AFFSNAMEMAX);
 		name = AFFS_TAIL(sb, fh_bh)->name + 1;
-		pr_debug("readdir(): dir_emit(\"%.*s\", "
-			 "ino=%u), hash=%d, f_pos=%x\n",
-			 namelen, name, ino, hash_pos, (u32)ctx->pos);
+		pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n",
+			 namelen, name, ino, hash_pos, ctx->pos);
 
 		if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 			goto done;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 8faa6593ca6d..d2468bf95669 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -180,8 +180,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 	ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
 	if (ext < AFFS_I(inode)->i_extcnt)
 		goto read_ext;
-	if (ext > AFFS_I(inode)->i_extcnt)
-		BUG();
+	BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 	bh = affs_alloc_extblock(inode, bh, ext);
 	if (IS_ERR(bh))
 		return bh;
@@ -198,8 +197,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 		struct buffer_head *prev_bh;
 
 		/* allocate a new extended block */
-		if (ext > AFFS_I(inode)->i_extcnt)
-			BUG();
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 
 		/* get previous extended block */
 		prev_bh = affs_get_extblock(inode, ext - 1);
@@ -299,8 +297,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	struct buffer_head	*ext_bh;
 	u32	 ext;
 
-	pr_debug("%s(%u, %lu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long)block);
+	pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino,
+		 (unsigned long long)block);
 
 	BUG_ON(block > (sector_t)0x7fffffffUL);
 
@@ -330,8 +328,9 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 
 	/* store new block */
 	if (bh_result->b_blocknr)
-		affs_warning(sb, "get_block", "block already set (%lx)",
-			     (unsigned long)bh_result->b_blocknr);
+		affs_warning(sb, "get_block",
+			     "block already set (%llx)",
+			     (unsigned long long)bh_result->b_blocknr);
 	AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
 	AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
 	affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
@@ -353,8 +352,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	return 0;
 
 err_big:
-	affs_error(inode->i_sb, "get_block", "strange block request %d",
-		   (int)block);
+	affs_error(inode->i_sb, "get_block", "strange block request %llu",
+		   (unsigned long long)block);
 	return -EIO;
 err_ext:
 	// unlock cache
@@ -399,6 +398,13 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
+	if (rw == WRITE) {
+		loff_t size = offset + count;
+
+		if (AFFS_I(inode)->mmu_private < size)
+			return 0;
+	}
+
 	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
 	if (ret < 0 && (rw & WRITE))
 		affs_write_failed(mapping, offset + count);
@@ -503,7 +509,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
 	u32 bidx, boff, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+	pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
 		 page->index, to);
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	kmap(page);
@@ -539,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 	u32 size, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
+	pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	bh = NULL;
 	size = AFFS_I(inode)->mmu_private;
@@ -608,7 +614,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
 	u32 to;
 	int err;
 
-	pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
+	pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
 	to = PAGE_CACHE_SIZE;
 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
 		to = inode->i_size & ~PAGE_CACHE_MASK;
@@ -631,8 +637,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	pgoff_t index;
 	int err = 0;
 
-	pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
-		 (unsigned long long)pos, (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	if (pos > AFFS_I(inode)->mmu_private) {
 		/* XXX: this probably leaves a too-big i_size in case of
 		 * failure. Should really be updating i_size at write_end time
@@ -681,9 +687,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 	 * due to write_begin.
 	 */
 
-	pr_debug("%s(%u, %llu, %llu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long long)pos,
-		 (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	data = page_address(page);
 
@@ -831,8 +836,8 @@ affs_truncate(struct inode *inode)
 	struct buffer_head	*ext_bh;
 	int	 i;
 
-	pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
-		 (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
+	pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n",
+		 inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size);
 
 	last_blk = 0;
 	ext = 0;
@@ -863,7 +868,7 @@ affs_truncate(struct inode *inode)
 		if (IS_ERR(ext_bh)) {
 			affs_warning(sb, "truncate",
 				     "unexpected read error for ext block %u (%ld)",
-				     (unsigned int)ext, PTR_ERR(ext_bh));
+				     ext, PTR_ERR(ext_bh));
 			return;
 		}
 		if (AFFS_I(inode)->i_lc) {
@@ -911,7 +916,7 @@ affs_truncate(struct inode *inode)
 		if (IS_ERR(bh)) {
 			affs_warning(sb, "truncate",
 				     "unexpected read error for last block %u (%ld)",
-				     (unsigned int)ext, PTR_ERR(bh));
+				     ext, PTR_ERR(bh));
 			return;
 		}
 		tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index d0609a282e1d..6f34510449e8 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -13,8 +13,6 @@
 #include <linux/gfp.h>
 #include "affs.h"
 
-extern const struct inode_operations affs_symlink_inode_operations;
-
 struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 {
 	struct affs_sb_info	*sbi = AFFS_SB(sb);
@@ -348,9 +346,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 	u32 block = 0;
 	int retval;
 
-	pr_debug("%s(dir=%u, inode=%u, \"%pd\", type=%d)\n",
-		 __func__, (u32)dir->i_ino,
-		 (u32)inode->i_ino, dentry, type);
+	pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__,
+		 dir->i_ino, inode->i_ino, dentry, type);
 
 	retval = -EIO;
 	bh = affs_bread(sb, inode->i_ino);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index bbc38530e924..ffb7bd82c2a5 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -64,15 +64,16 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
-	int i;
+	int retval;
+	u32 len;
 
-	i = affs_check_name(qstr->name, qstr->len, notruncate);
-	if (i)
-		return i;
+	retval = affs_check_name(qstr->name, qstr->len, notruncate);
+	if (retval)
+		return retval;
 
 	hash = init_name_hash();
-	i = min(qstr->len, 30u);
-	for (; i > 0; name++, i--)
+	len = min(qstr->len, AFFSNAMEMAX);
+	for (; len > 0; name++, len--)
 		hash = partial_name_hash(toupper(*name), hash);
 	qstr->hash = end_name_hash(hash);
 
@@ -114,10 +115,10 @@ static inline int __affs_compare_dentry(unsigned int len,
 	 * If the names are longer than the allowed 30 chars,
 	 * the excess is ignored, so their length may differ.
 	 */
-	if (len >= 30) {
-		if (name->len < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (name->len < AFFSNAMEMAX)
 			return 1;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != name->len)
 		return 1;
 
@@ -156,10 +157,10 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
 	const u8 *name = dentry->d_name.name;
 	int len = dentry->d_name.len;
 
-	if (len >= 30) {
-		if (*name2 < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (*name2 < AFFSNAMEMAX)
 			return 0;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != *name2)
 		return 0;
 
@@ -173,9 +174,9 @@ int
 affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
 {
 	toupper_t toupper = affs_get_toupper(sb);
-	int hash;
+	u32 hash;
 
-	hash = len = min(len, 30u);
+	hash = len = min(len, AFFSNAMEMAX);
 	for (; len > 0; len--)
 		hash = (hash * 13 + toupper(*name++)) & 0x7ff;
 
@@ -248,9 +249,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 int
 affs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%d, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -317,9 +317,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 int
 affs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%u, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -404,8 +403,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
 
-	pr_debug("%s(%u, %u, \"%pd\")\n",
-		 __func__, (u32)inode->i_ino, (u32)dir->i_ino,
+	pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino,
 		 dentry);
 
 	return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -419,9 +417,8 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
-	pr_debug("%s(old=%u,\"%pd\" to new=%u,\"%pd\")\n",
-		 __func__, (u32)old_dir->i_ino, old_dentry,
-		 (u32)new_dir->i_ino, new_dentry);
+	pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+		 old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
 
 	retval = affs_check_name(new_dentry->d_name.name,
 				 new_dentry->d_name.len,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index f754ab68a840..4cf0e9113fb6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -432,39 +432,39 @@ got_root:
 		sb->s_flags |= MS_RDONLY;
 	}
 	switch (chksum) {
-		case MUFS_FS:
-		case MUFS_INTLFFS:
-		case MUFS_DCFFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_INTLFFS:
-		case FS_DCFFS:
-			sbi->s_flags |= SF_INTL;
-			break;
-		case MUFS_FFS:
-			sbi->s_flags |= SF_MUFS;
-			break;
-		case FS_FFS:
-			break;
-		case MUFS_OFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_OFS:
-			sbi->s_flags |= SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		case MUFS_DCOFS:
-		case MUFS_INTLOFS:
-			sbi->s_flags |= SF_MUFS;
-		case FS_DCOFS:
-		case FS_INTLOFS:
-			sbi->s_flags |= SF_INTL | SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		default:
-			pr_err("Unknown filesystem on device %s: %08X\n",
-			       sb->s_id, chksum);
-			return -EINVAL;
+	case MUFS_FS:
+	case MUFS_INTLFFS:
+	case MUFS_DCFFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_INTLFFS:
+	case FS_DCFFS:
+		sbi->s_flags |= SF_INTL;
+		break;
+	case MUFS_FFS:
+		sbi->s_flags |= SF_MUFS;
+		break;
+	case FS_FFS:
+		break;
+	case MUFS_OFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_OFS:
+		sbi->s_flags |= SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	case MUFS_DCOFS:
+	case MUFS_INTLOFS:
+		sbi->s_flags |= SF_MUFS;
+	case FS_DCOFS:
+	case FS_INTLOFS:
+		sbi->s_flags |= SF_INTL | SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	default:
+		pr_err("Unknown filesystem on device %s: %08X\n",
+		       sb->s_id, chksum);
+		return -EINVAL;
 	}
 
 	if (mount_flags & SF_VERBOSE) {
@@ -584,7 +584,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail  = free;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
-	buf->f_namelen = 30;
+	buf->f_namelen = AFFSNAMEMAX;
 	return 0;
 }
 
@@ -602,6 +602,7 @@ static void affs_kill_sb(struct super_block *sb)
 		affs_free_bitmap(sb);
 		affs_brelse(sbi->s_root_bh);
 		kfree(sbi->s_prefix);
+		mutex_destroy(&sbi->s_bmlock);
 		kfree(sbi);
 	}
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 06e14bfb3496..dbc732e9a5c0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
 
 		_debug("- range %u-%u%s",
 		       offset, to, msg->msg_flags ? " [more]" : "");
-		iov_iter_init(&msg->msg_iter, WRITE,
-			      (struct iovec *) iov, 1, to - offset);
+		iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC,
+			      iov, 1, to - offset);
 
 		/* have to change the state *before* sending the last
 		 * packet as RxRPC might give us the reply before it
@@ -384,7 +384,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
 		      call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
@@ -770,7 +770,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 void afs_send_empty_reply(struct afs_call *call)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 
 	_enter("");
 
@@ -778,7 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	iov[0].iov_len		= 0;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0);	/* WTF? */
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0);	/* WTF? */
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -805,7 +805,7 @@ void afs_send_empty_reply(struct afs_call *call)
 void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 	int n;
 
 	_enter("");
@@ -814,7 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	iov[0].iov_len		= len;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
-	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;
 
diff --git a/fs/aio.c b/fs/aio.c
index c428871f1093..118a2e0088d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
-	.name           = "aiofs",
-	.state          = 0,
-	.capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
-
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
 	inode->i_mapping->private_data = ctx;
-	inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
 	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +220,6 @@ static int __init aio_setup(void)
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
 
-	if (bdi_init(&aio_fs_backing_dev_info))
-		panic("Failed to init aio fs backing dev info.");
-
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index edf47774b03d..e089f1985fca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -274,9 +274,9 @@ more:
 static struct inode *
 befs_alloc_inode(struct super_block *sb)
 {
 	struct befs_inode_info *bi;
-	bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep,
-							GFP_KERNEL);
+
+	bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
 	if (!bi)
 		return NULL;
 	return &bi->vfs_inode;
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
deleted file mode 100644
index 4e00ed68d4a6..000000000000
--- a/fs/binfmt_som.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * linux/fs/binfmt_som.c
- *
- * These are the functions used to load SOM format executables as used
- * by HP-UX.
- *
- * Copyright 1999 Matthew Wilcox <willy@bofh.ai>
- * based on binfmt_elf which is
- * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
- */
-
-#include <linux/module.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/binfmts.h>
-#include <linux/som.h>
-#include <linux/string.h>
-#include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/shm.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-
-#include <linux/elf.h>
-
-static int load_som_binary(struct linux_binprm * bprm);
-static int load_som_library(struct file *);
-
-/*
- * If we don't support core dumping, then supply a NULL so we
- * don't even try.
- */
-#if 0
-static int som_core_dump(struct coredump_params *cprm);
-#else
-#define som_core_dump	NULL
-#endif
-
-#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1))
-#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1))
-#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1))
-
-static struct linux_binfmt som_format = {
-	.module		= THIS_MODULE,
-	.load_binary	= load_som_binary,
-	.load_shlib	= load_som_library,
-	.core_dump	= som_core_dump,
-	.min_coredump	= SOM_PAGESIZE
-};
-
-/*
- * create_som_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static void create_som_tables(struct linux_binprm *bprm)
-{
-	char **argv, **envp;
-	int argc = bprm->argc;
-	int envc = bprm->envc;
-	unsigned long p;
-	unsigned long *sp;
-
-	/* Word-align the stack pointer */
-	sp = (unsigned long *)((bprm->p + 3) & ~3);
-
-	envp = (char **) sp;
-	sp += envc + 1;
-	argv = (char **) sp;
-	sp += argc + 1;
-
-	__put_user((unsigned long) envp,++sp);
-	__put_user((unsigned long) argv,++sp);
-
-	__put_user(argc, ++sp);
-
-	bprm->p = (unsigned long) sp;
-
-	p = current->mm->arg_start;
-	while (argc-- > 0) {
-		__put_user((char *)p,argv++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, argv);
-	current->mm->arg_end = current->mm->env_start = p;
-	while (envc-- > 0) {
-		__put_user((char *)p,envp++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, envp);
-	current->mm->env_end = p;
-}
-
-static int check_som_header(struct som_hdr *som_ex)
-{
-	int *buf = (int *)som_ex;
-	int i, ck;
-
-	if (som_ex->system_id != SOM_SID_PARISC_1_0 &&
-	    som_ex->system_id != SOM_SID_PARISC_1_1 &&
-	    som_ex->system_id != SOM_SID_PARISC_2_0)
-		return -ENOEXEC;
-
-	if (som_ex->a_magic != SOM_EXEC_NONSHARE &&
-	    som_ex->a_magic != SOM_EXEC_SHARE &&
-	    som_ex->a_magic != SOM_EXEC_DEMAND)
-		return -ENOEXEC;
-
-	if (som_ex->version_id != SOM_ID_OLD &&
-	    som_ex->version_id != SOM_ID_NEW)
-		return -ENOEXEC;
-
-	ck = 0;
-	for (i=0; i<32; i++)
-		ck ^= buf[i];
-	if (ck != 0)
-		return -ENOEXEC;
-
-	return 0;
-}
-
-static int map_som_binary(struct file *file,
-		const struct som_exec_auxhdr *hpuxhdr)
-{
-	unsigned long code_start, code_size, data_start, data_size;
-	unsigned long bss_start, som_brk;
-	int retval;
-	int prot = PROT_READ | PROT_EXEC;
-	int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
-
-	mm_segment_t old_fs = get_fs();
-	set_fs(get_ds());
-
-	code_start = SOM_PAGESTART(hpuxhdr->exec_tmem);
-	code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize);
-	current->mm->start_code = code_start;
-	current->mm->end_code = code_start + code_size;
-	retval = vm_mmap(file, code_start, code_size, prot,
-			flags, SOM_PAGESTART(hpuxhdr->exec_tfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	data_start = SOM_PAGESTART(hpuxhdr->exec_dmem);
-	data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize);
-	current->mm->start_data = data_start;
-	current->mm->end_data = bss_start = data_start + data_size;
-	retval = vm_mmap(file, data_start, data_size,
-			prot | PROT_WRITE, flags,
-			SOM_PAGESTART(hpuxhdr->exec_dfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize);
-	current->mm->start_brk = current->mm->brk = som_brk;
-	retval = vm_mmap(NULL, bss_start, som_brk - bss_start,
-			prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0);
-	if (retval > 0 || retval < -1024)
-		retval = 0;
-out:
-	set_fs(old_fs);
-	return retval;
-}
-
-
-/*
- * These are the functions used to load SOM executables and shared
- * libraries.  There is no binary dependent code anywhere else.
- */
-
-static int
-load_som_binary(struct linux_binprm * bprm)
-{
-	int retval;
-	unsigned int size;
-	unsigned long som_entry;
-	struct som_hdr *som_ex;
-	struct som_exec_auxhdr *hpuxhdr;
-	struct pt_regs *regs = current_pt_regs();
-
-	/* Get the exec-header */
-	som_ex = (struct som_hdr *) bprm->buf;
-
-	retval = check_som_header(som_ex);
-	if (retval != 0)
-		goto out;
-
-	/* Now read in the auxiliary header information */
-
-	retval = -ENOMEM;
-	size = som_ex->aux_header_size;
-	if (size > SOM_PAGESIZE)
-		goto out;
-	hpuxhdr = kmalloc(size, GFP_KERNEL);
-	if (!hpuxhdr)
-		goto out;
-
-	retval = kernel_read(bprm->file, som_ex->aux_header_location,
-			(char *) hpuxhdr, size);
-	if (retval != size) {
-		if (retval >= 0)
-			retval = -EIO;
-		goto out_free;
-	}
-
-	/* Flush all traces of the currently running executable */
-	retval = flush_old_exec(bprm);
-	if (retval)
-		goto out_free;
-
-	/* OK, This is the point of no return */
-	current->personality = PER_HPUX;
-	setup_new_exec(bprm);
-
-	/* Set the task size for HP-UX processes such that
-	 * the gateway page is outside the address space.
-	 * This can be fixed later, but for now, this is much
-	 * easier.
-	 */
-
-	current->thread.task_size = 0xc0000000;
-
-	/* Set map base to allow enough room for hp-ux heap growth */
-
-	current->thread.map_base = 0x80000000;
-
-	retval = map_som_binary(bprm->file, hpuxhdr);
-	if (retval < 0)
-		goto out_free;
-
-	som_entry = hpuxhdr->exec_entry;
-	kfree(hpuxhdr);
-
-	set_binfmt(&som_format);
-	install_exec_creds(bprm);
-	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-
-	create_som_tables(bprm);
-
-	current->mm->start_stack = bprm->p;
-
-#if 0
-	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
-	printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code);
-	printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code);
-	printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data);
-	printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack);
-	printk("(brk) %08lx\n" , (unsigned long) current->mm->brk);
-#endif
-
-	map_hpux_gateway_page(current,current->mm);
-
-	start_thread_som(regs, som_entry, bprm->p);
-	return 0;
-
-	/* error cleanup */
-out_free:
-	kfree(hpuxhdr);
-out:
-	return retval;
-}
-
-static int load_som_library(struct file *f)
-{
-/* No lib support in SOM yet.  gizza chance.. */
-	return -ENOEXEC;
-}
-	/* Install the SOM loader.
-	 * N.B. We *rely* on the table being the right size with the
-	 * right number of free slots...
-	 */
-
-static int __init init_som_binfmt(void)
-{
-	register_binfmt(&som_format);
-	return 0;
-}
-
-static void __exit exit_som_binfmt(void)
-{
-	/* Remove the SOM loader. */
-	unregister_binfmt(&som_format);
-}
-
-core_initcall(init_som_binfmt);
-module_exit(exit_som_binfmt);
-
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..975266be67d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
-/*
- * Move the inode from its current bdi to a new bdi. Make sure the inode
- * is clean before moving so that it doesn't linger on the old bdi.
- */
-static void bdev_inode_switch_bdi(struct inode *inode,
-			struct backing_dev_info *dst)
+static void bdev_write_inode(struct inode *inode)
 {
-	while (true) {
-		spin_lock(&inode->i_lock);
-		if (!(inode->i_state & I_DIRTY)) {
-			inode->i_data.backing_dev_info = dst;
-			spin_unlock(&inode->i_lock);
-			return;
-		}
+	spin_lock(&inode->i_lock);
+	while (inode->i_state & I_DIRTY) {
 		spin_unlock(&inode->i_lock);
 		WARN_ON_ONCE(write_inode_now(inode, true));
+		spin_lock(&inode->i_lock);
 	}
+	spin_unlock(&inode->i_lock);
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -429,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(bdev_write_page);
 
+/**
+ * bdev_direct_access() - Get the address for directly-accessibly memory
+ * @bdev: The device containing the memory
+ * @sector: The offset within the device
+ * @addr: Where to put the address of the memory
+ * @pfn: The Page Frame Number for the memory
+ * @size: The number of bytes requested
+ *
+ * If a block device is made up of directly addressable memory, this function
+ * will tell the caller the PFN and the address of the memory.  The address
+ * may be directly dereferenced within the kernel without the need to call
+ * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
+ * page tables.
+ *
+ * Return: negative errno if an error occurs, otherwise the number of bytes
+ * accessible at this address.
+ */
+long bdev_direct_access(struct block_device *bdev, sector_t sector,
+			void **addr, unsigned long *pfn, long size)
+{
+	long avail;
+	const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+	if (size < 0)
+		return size;
+	if (!ops->direct_access)
+		return -EOPNOTSUPP;
+	if ((sector + DIV_ROUND_UP(size, 512)) >
+					part_nr_sects_read(bdev->bd_part))
+		return -ERANGE;
+	sector += get_start_sect(bdev);
+	if (sector % (PAGE_SIZE / 512))
+		return -EINVAL;
+	avail = ops->direct_access(bdev, sector, addr, pfn, size);
+	if (!avail)
+		return -ERANGE;
+	return min(avail, size);
+}
+EXPORT_SYMBOL_GPL(bdev_direct_access);
+
 /*
  * pseudo-fs
  */
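A minimal caller sketch for the helper added above (hypothetical, not part of the patch): it assumes a page-aligned sector within the partition and a driver that actually implements ->direct_access, per the checks in bdev_direct_access().

static int example_read_dax_page(struct block_device *bdev,
				 sector_t sector, void *dst)
{
	void *addr;
	unsigned long pfn;
	long avail;

	avail = bdev_direct_access(bdev, sector, &addr, &pfn, PAGE_SIZE);
	if (avail < 0)
		return avail;		/* -EOPNOTSUPP, -ERANGE or -EINVAL */
	if (avail < PAGE_SIZE)
		return -ERANGE;		/* device ended inside the page */
	/* per the kernel-doc, addr may be dereferenced directly */
	memcpy(dst, addr, PAGE_SIZE);
	return 0;
}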
@@ -584,7 +616,6 @@ struct block_device *bdget(dev_t dev)
 		inode->i_bdev = bdev;
 		inode->i_data.a_ops = &def_blk_aops;
 		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-		inode->i_data.backing_dev_info = &default_backing_dev_info;
 		spin_lock(&bdev_lock);
 		list_add(&bdev->bd_list, &all_bdevs);
 		spin_unlock(&bdev_lock);
@@ -1145,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		if (!partno) {
-			struct backing_dev_info *bdi;
-
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!bdev->bd_part)
@@ -1172,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			}
 		}
 
-		if (!ret) {
+		if (!ret)
 			bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-			bdi = blk_get_backing_dev_info(bdev);
-			bdev_inode_switch_bdi(bdev->bd_inode, bdi);
-		}
 
 		/*
 		 * If the device is invalidated, rescan partition
@@ -1203,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		if (ret)
 			goto out_clear;
 		bdev->bd_contains = whole;
-		bdev_inode_switch_bdi(bdev->bd_inode,
-			whole->bd_inode->i_data.backing_dev_info);
 		bdev->bd_part = disk_get_part(disk, partno);
 		if (!(disk->flags & GENHD_FL_UP) ||
 		    !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1244,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_queue = NULL;
-	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
@@ -1464,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1464 WARN_ON_ONCE(bdev->bd_holders); 1487 WARN_ON_ONCE(bdev->bd_holders);
1465 sync_blockdev(bdev); 1488 sync_blockdev(bdev);
1466 kill_bdev(bdev); 1489 kill_bdev(bdev);
1467 /* ->release can cause the old bdi to disappear, 1490 /*
1468 * so must switch it out first 1491 * ->release can cause the queue to disappear, so flush all
1492 * dirty data before.
1469 */ 1493 */
1470 bdev_inode_switch_bdi(bdev->bd_inode, 1494 bdev_write_inode(bdev->bd_inode);
1471 &default_backing_dev_info);
1472 } 1495 }
1473 if (bdev->bd_contains == bdev) { 1496 if (bdev->bd_contains == bdev) {
1474 if (disk->fops->release) 1497 if (disk->fops->release)
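The bdev_write_inode() helper called here is outside this excerpt. A sketch consistent with the call site — keep writing the inode back until it is clean, so no dirty state can outlive ->release — might look like this (assumed, not taken from the hunk):

	static void bdev_write_inode(struct inode *inode)
	{
		spin_lock(&inode->i_lock);
		while (inode->i_state & I_DIRTY) {
			/* Drop i_lock around the actual writeback. */
			spin_unlock(&inode->i_lock);
			WARN_ON_ONCE(write_inode_now(inode, true));
			spin_lock(&inode->i_lock);
		}
		spin_unlock(&inode->i_lock);
	}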
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a66768ebc8d1..80e9c18ea64f 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -8,6 +8,7 @@ config BTRFS_FS
8 select LZO_DECOMPRESS 8 select LZO_DECOMPRESS
9 select RAID6_PQ 9 select RAID6_PQ
10 select XOR_BLOCKS 10 select XOR_BLOCKS
11 select SRCU
11 12
12 help 13 help
13 Btrfs is a general purpose copy-on-write filesystem with extents, 14 Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..1afb18226da8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1715,12 +1715,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1715{ 1715{
1716 int err; 1716 int err;
1717 1717
1718 bdi->capabilities = BDI_CAP_MAP_COPY; 1718 err = bdi_setup_and_register(bdi, "btrfs");
1719 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1720 if (err) 1719 if (err)
1721 return err; 1720 return err;
1722 1721
1723 bdi->ra_pages = default_backing_dev_info.ra_pages; 1722 bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
1724 bdi->congested_fn = btrfs_congested_fn; 1723 bdi->congested_fn = btrfs_congested_fn;
1725 bdi->congested_data = info; 1724 bdi->congested_data = info;
1726 return 0; 1725 return 0;
@@ -2319,7 +2318,6 @@ int open_ctree(struct super_block *sb,
2319 */ 2318 */
2320 fs_info->btree_inode->i_size = OFFSET_MAX; 2319 fs_info->btree_inode->i_size = OFFSET_MAX;
2321 fs_info->btree_inode->i_mapping->a_ops = &btree_aops; 2320 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2322 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
2323 2321
2324 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); 2322 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2325 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 2323 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
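Two interface changes meet in setup_bdi(): bdi_setup_and_register() has dropped its capabilities argument (BDI_CAP_MAP_COPY no longer exists), and default readahead is now derived from VM_MAX_READAHEAD rather than copied out of the deleted default_backing_dev_info. A filesystem needing nothing beyond the defaults reduces to a sketch like this (names illustrative):

	static int examplefs_setup_bdi(struct super_block *sb,
				       struct backing_dev_info *bdi)
	{
		int err;

		/* Two-argument form; the capabilities parameter is gone. */
		err = bdi_setup_and_register(bdi, "examplefs");
		if (err)
			return err;

		bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
		sb->s_bdi = bdi;
		return 0;
	}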
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 790dbae3343c..c73df6a7c9b6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1407,8 +1407,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1407 while (index <= end_index) { 1407 while (index <= end_index) {
1408 page = find_get_page(inode->i_mapping, index); 1408 page = find_get_page(inode->i_mapping, index);
1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1410 account_page_redirty(page);
1411 __set_page_dirty_nobuffers(page); 1410 __set_page_dirty_nobuffers(page);
1411 account_page_redirty(page);
1412 page_cache_release(page); 1412 page_cache_release(page);
1413 index++; 1413 index++;
1414 } 1414 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b78bbbac900d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1746 1746
1747 mutex_lock(&inode->i_mutex); 1747 mutex_lock(&inode->i_mutex);
1748 1748
1749 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1749 current->backing_dev_info = inode_to_bdi(inode);
1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1751 if (err) { 1751 if (err) {
1752 mutex_unlock(&inode->i_mutex); 1752 mutex_unlock(&inode->i_mutex);
@@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
2081 .fault = filemap_fault, 2081 .fault = filemap_fault,
2082 .map_pages = filemap_map_pages, 2082 .map_pages = filemap_map_pages,
2083 .page_mkwrite = btrfs_page_mkwrite, 2083 .page_mkwrite = btrfs_page_mkwrite,
2084 .remap_pages = generic_file_remap_pages,
2085}; 2084};
2086 2085
2087static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 2086static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
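Writers used to copy mapping->backing_dev_info into current->backing_dev_info; with that field removed, they ask inode_to_bdi() instead. Reconstructed from the call sites (not quoted from this diff), the helper resolves to the superblock's BDI, with a special case for block-device inodes, roughly:

	struct backing_dev_info *inode_to_bdi(struct inode *inode)
	{
		struct super_block *sb;

		if (!inode)
			return &noop_backing_dev_info;

		sb = inode->i_sb;
	#ifdef CONFIG_BLOCK
		/* Block-device inodes take the BDI from the request queue. */
		if (sb_is_blkdev_sb(sb))
			return blk_get_backing_dev_info(I_BDEV(inode));
	#endif
		return sb->s_bdi;
	}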
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bf326affb94..54bcf639d1cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3608,7 +3608,6 @@ cache_acl:
3608 switch (inode->i_mode & S_IFMT) { 3608 switch (inode->i_mode & S_IFMT) {
3609 case S_IFREG: 3609 case S_IFREG:
3610 inode->i_mapping->a_ops = &btrfs_aops; 3610 inode->i_mapping->a_ops = &btrfs_aops;
3611 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3612 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3611 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3613 inode->i_fop = &btrfs_file_operations; 3612 inode->i_fop = &btrfs_file_operations;
3614 inode->i_op = &btrfs_file_inode_operations; 3613 inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3622,6 @@ cache_acl:
3623 case S_IFLNK: 3622 case S_IFLNK:
3624 inode->i_op = &btrfs_symlink_inode_operations; 3623 inode->i_op = &btrfs_symlink_inode_operations;
3625 inode->i_mapping->a_ops = &btrfs_symlink_aops; 3624 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3626 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3627 break; 3625 break;
3628 default: 3626 default:
3629 inode->i_op = &btrfs_special_inode_operations; 3627 inode->i_op = &btrfs_special_inode_operations;
@@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
6088 inode->i_fop = &btrfs_file_operations; 6086 inode->i_fop = &btrfs_file_operations;
6089 inode->i_op = &btrfs_file_inode_operations; 6087 inode->i_op = &btrfs_file_inode_operations;
6090 inode->i_mapping->a_ops = &btrfs_aops; 6088 inode->i_mapping->a_ops = &btrfs_aops;
6091 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6092 6089
6093 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6090 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6094 if (err) 6091 if (err)
@@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9203 inode->i_fop = &btrfs_file_operations; 9200 inode->i_fop = &btrfs_file_operations;
9204 inode->i_op = &btrfs_file_inode_operations; 9201 inode->i_op = &btrfs_file_inode_operations;
9205 inode->i_mapping->a_ops = &btrfs_aops; 9202 inode->i_mapping->a_ops = &btrfs_aops;
9206 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9207 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9203 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9208 9204
9209 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9205 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9247 9243
9248 inode->i_op = &btrfs_symlink_inode_operations; 9244 inode->i_op = &btrfs_symlink_inode_operations;
9249 inode->i_mapping->a_ops = &btrfs_symlink_aops; 9245 inode->i_mapping->a_ops = &btrfs_symlink_aops;
9250 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9251 inode_set_bytes(inode, name_len); 9246 inode_set_bytes(inode, name_len);
9252 btrfs_i_size_write(inode, name_len); 9247 btrfs_i_size_write(inode, name_len);
9253 err = btrfs_update_inode(trans, root, inode); 9248 err = btrfs_update_inode(trans, root, inode);
@@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9459 inode->i_op = &btrfs_file_inode_operations; 9454 inode->i_op = &btrfs_file_inode_operations;
9460 9455
9461 inode->i_mapping->a_ops = &btrfs_aops; 9456 inode->i_mapping->a_ops = &btrfs_aops;
9462 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9463 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9457 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9464 9458
9465 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 9459 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7d05e37874d4..fd5599d32362 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1574,7 +1574,6 @@ out:
1574static struct vm_operations_struct ceph_vmops = { 1574static struct vm_operations_struct ceph_vmops = {
1575 .fault = ceph_filemap_fault, 1575 .fault = ceph_filemap_fault,
1576 .page_mkwrite = ceph_page_mkwrite, 1576 .page_mkwrite = ceph_page_mkwrite,
1577 .remap_pages = generic_file_remap_pages,
1578}; 1577};
1579 1578
1580int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1579int ceph_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 848969ee24db..a3d774b35149 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -952,7 +952,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
952 mutex_lock(&inode->i_mutex); 952 mutex_lock(&inode->i_mutex);
953 953
954 /* We can write back this queue in page reclaim */ 954 /* We can write back this queue in page reclaim */
955 current->backing_dev_info = file->f_mapping->backing_dev_info; 955 current->backing_dev_info = inode_to_bdi(inode);
956 956
957 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 957 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
958 if (err) 958 if (err)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be3af18e4cf1..119c43c80638 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
783 } 783 }
784 784
785 inode->i_mapping->a_ops = &ceph_aops; 785 inode->i_mapping->a_ops = &ceph_aops;
786 inode->i_mapping->backing_dev_info =
787 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
788 786
789 switch (inode->i_mode & S_IFMT) { 787 switch (inode->i_mode & S_IFMT) {
790 case S_IFIFO: 788 case S_IFIFO:
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index c35c5c614e38..4347039ecc18 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -239,23 +239,26 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
239 return err; 239 return err;
240} 240}
241 241
242/** 242/*
243 * Must be called with lock_flocks() already held. Fills in the passed 243 * Fills in the passed counter variables, so you can prepare pagelist metadata
244 * counter variables, so you can prepare pagelist metadata before calling 244 * before calling ceph_encode_locks.
245 * ceph_encode_locks.
246 */ 245 */
247void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) 246void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
248{ 247{
249 struct file_lock *lock; 248 struct file_lock *lock;
249 struct file_lock_context *ctx;
250 250
251 *fcntl_count = 0; 251 *fcntl_count = 0;
252 *flock_count = 0; 252 *flock_count = 0;
253 253
254 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 254 ctx = inode->i_flctx;
255 if (lock->fl_flags & FL_POSIX) 255 if (ctx) {
256 spin_lock(&ctx->flc_lock);
257 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
256 ++(*fcntl_count); 258 ++(*fcntl_count);
257 else if (lock->fl_flags & FL_FLOCK) 259 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
258 ++(*flock_count); 260 ++(*flock_count);
261 spin_unlock(&ctx->flc_lock);
259 } 262 }
260 dout("counted %d flock locks and %d fcntl locks", 263 dout("counted %d flock locks and %d fcntl locks",
261 *flock_count, *fcntl_count); 264 *flock_count, *fcntl_count);
@@ -271,6 +274,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
271 int num_fcntl_locks, int num_flock_locks) 274 int num_fcntl_locks, int num_flock_locks)
272{ 275{
273 struct file_lock *lock; 276 struct file_lock *lock;
277 struct file_lock_context *ctx = inode->i_flctx;
274 int err = 0; 278 int err = 0;
275 int seen_fcntl = 0; 279 int seen_fcntl = 0;
276 int seen_flock = 0; 280 int seen_flock = 0;
@@ -279,33 +283,34 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
279 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 283 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
280 num_fcntl_locks); 284 num_fcntl_locks);
281 285
282 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 286 if (!ctx)
283 if (lock->fl_flags & FL_POSIX) { 287 return 0;
284 ++seen_fcntl; 288
285 if (seen_fcntl > num_fcntl_locks) { 289 spin_lock(&ctx->flc_lock);
 286 err = -ENOSPC; 290 list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
287 goto fail; 291 ++seen_fcntl;
288 } 292 if (seen_fcntl > num_fcntl_locks) {
289 err = lock_to_ceph_filelock(lock, &flocks[l]); 293 err = -ENOSPC;
290 if (err) 294 goto fail;
291 goto fail;
292 ++l;
293 } 295 }
296 err = lock_to_ceph_filelock(lock, &flocks[l]);
297 if (err)
298 goto fail;
299 ++l;
294 } 300 }
295 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 301 list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
296 if (lock->fl_flags & FL_FLOCK) { 302 ++seen_flock;
297 ++seen_flock; 303 if (seen_flock > num_flock_locks) {
298 if (seen_flock > num_flock_locks) { 304 err = -ENOSPC;
299 err = -ENOSPC; 305 goto fail;
300 goto fail;
301 }
302 err = lock_to_ceph_filelock(lock, &flocks[l]);
303 if (err)
304 goto fail;
305 ++l;
306 } 306 }
307 err = lock_to_ceph_filelock(lock, &flocks[l]);
308 if (err)
309 goto fail;
310 ++l;
307 } 311 }
308fail: 312fail:
313 spin_unlock(&ctx->flc_lock);
309 return err; 314 return err;
310} 315}
311 316
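These conversions (and the cifs ones below) target the new per-inode file_lock_context, which replaces the single inode->i_flock chain with per-type lists under one spinlock. Roughly (field set as assumed by the code above):

	struct file_lock_context {
		spinlock_t		flc_lock;	/* guards the three lists */
		struct list_head	flc_flock;	/* flock(2) locks */
		struct list_head	flc_posix;	/* POSIX/fcntl locks */
		struct list_head	flc_lease;	/* leases */
	};

Individual locks hang off these lists via file_lock.fl_list, hence the list_for_each_entry(..., fl_list) walks that replace the old fl_next chaining.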
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4c1e36a171af..71c073f38e54 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2764,20 +2764,16 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2764 struct ceph_filelock *flocks; 2764 struct ceph_filelock *flocks;
2765 2765
2766encode_again: 2766encode_again:
2767 spin_lock(&inode->i_lock);
2768 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2767 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2769 spin_unlock(&inode->i_lock);
2770 flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2768 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2771 sizeof(struct ceph_filelock), GFP_NOFS); 2769 sizeof(struct ceph_filelock), GFP_NOFS);
2772 if (!flocks) { 2770 if (!flocks) {
2773 err = -ENOMEM; 2771 err = -ENOMEM;
2774 goto out_free; 2772 goto out_free;
2775 } 2773 }
2776 spin_lock(&inode->i_lock);
2777 err = ceph_encode_locks_to_buffer(inode, flocks, 2774 err = ceph_encode_locks_to_buffer(inode, flocks,
2778 num_fcntl_locks, 2775 num_fcntl_locks,
2779 num_flock_locks); 2776 num_flock_locks);
2780 spin_unlock(&inode->i_lock);
2781 if (err) { 2777 if (err) {
2782 kfree(flocks); 2778 kfree(flocks);
2783 if (err == -ENOSPC) 2779 if (err == -ENOSPC)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8f8983f38b82..a63997b8bcff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s)
40 40
41 dout("put_super\n"); 41 dout("put_super\n");
42 ceph_mdsc_close_sessions(fsc->mdsc); 42 ceph_mdsc_close_sessions(fsc->mdsc);
43
44 /*
45 * ensure we release the bdi before put_anon_super releases
46 * the device name.
47 */
48 if (s->s_bdi == &fsc->backing_dev_info) {
49 bdi_unregister(&fsc->backing_dev_info);
50 s->s_bdi = NULL;
51 }
52
53 return;
54} 43}
55 44
56static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 45static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -914,7 +903,7 @@ static int ceph_register_bdi(struct super_block *sb,
914 >> PAGE_SHIFT; 903 >> PAGE_SHIFT;
915 else 904 else
916 fsc->backing_dev_info.ra_pages = 905 fsc->backing_dev_info.ra_pages =
917 default_backing_dev_info.ra_pages; 906 VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
918 907
919 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", 908 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
920 atomic_long_inc_return(&bdi_seq)); 909 atomic_long_inc_return(&bdi_seq));
@@ -1006,11 +995,16 @@ out_final:
1006static void ceph_kill_sb(struct super_block *s) 995static void ceph_kill_sb(struct super_block *s)
1007{ 996{
1008 struct ceph_fs_client *fsc = ceph_sb_to_client(s); 997 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
998 dev_t dev = s->s_dev;
999
1009 dout("kill_sb %p\n", s); 1000 dout("kill_sb %p\n", s);
1001
1010 ceph_mdsc_pre_umount(fsc->mdsc); 1002 ceph_mdsc_pre_umount(fsc->mdsc);
1011 kill_anon_super(s); /* will call put_super after sb is r/o */ 1003 generic_shutdown_super(s);
1012 ceph_mdsc_destroy(fsc); 1004 ceph_mdsc_destroy(fsc);
1005
1013 destroy_fs_client(fsc); 1006 destroy_fs_client(fsc);
1007 free_anon_bdev(dev);
1014} 1008}
1015 1009
1016static struct file_system_type ceph_fs_type = { 1010static struct file_system_type ceph_fs_type = {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
24 24
25#include "internal.h" 25#include "internal.h"
26 26
27/*
28 * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
29 * devices
30 * - permits shared-mmap for read, write and/or exec
31 * - does not permit private mmap in NOMMU mode (can't do COW)
32 * - no readahead or I/O queue unplugging required
33 */
34struct backing_dev_info directly_mappable_cdev_bdi = {
35 .name = "char",
36 .capabilities = (
37#ifdef CONFIG_MMU
38 /* permit private copies of the data to be taken */
39 BDI_CAP_MAP_COPY |
40#endif
41 /* permit direct mmap, for read, write or exec */
42 BDI_CAP_MAP_DIRECT |
43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
46};
47
48static struct kobj_map *cdev_map; 27static struct kobj_map *cdev_map;
49 28
50static DEFINE_MUTEX(chrdevs_lock); 29static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
575void __init chrdev_init(void) 554void __init chrdev_init(void)
576{ 555{
577 cdev_map = kobj_map_init(base_probe, &chrdevs_lock); 556 cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
578 if (bdi_init(&directly_mappable_cdev_bdi))
579 panic("Failed to init directly mappable cdev bdi");
580} 557}
581 558
582 559
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
590EXPORT_SYMBOL(cdev_add); 567EXPORT_SYMBOL(cdev_add);
591EXPORT_SYMBOL(__register_chrdev); 568EXPORT_SYMBOL(__register_chrdev);
592EXPORT_SYMBOL(__unregister_chrdev); 569EXPORT_SYMBOL(__unregister_chrdev);
593EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3446 int referral_walks_count = 0; 3446 int referral_walks_count = 0;
3447#endif 3447#endif
3448 3448
3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
3450 if (rc) 3450 if (rc)
3451 return rc; 3451 return rc;
3452 3452
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 74f12877493a..a94b3e673182 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1113,11 +1113,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1113 return rc; 1113 return rc;
1114} 1114}
1115 1115
1116/* copied from fs/locks.c with a name change */
1117#define cifs_for_each_lock(inode, lockp) \
1118 for (lockp = &inode->i_flock; *lockp != NULL; \
1119 lockp = &(*lockp)->fl_next)
1120
1121struct lock_to_push { 1116struct lock_to_push {
1122 struct list_head llist; 1117 struct list_head llist;
1123 __u64 offset; 1118 __u64 offset;
@@ -1132,8 +1127,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1132{ 1127{
1133 struct inode *inode = cfile->dentry->d_inode; 1128 struct inode *inode = cfile->dentry->d_inode;
1134 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1129 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1135 struct file_lock *flock, **before; 1130 struct file_lock *flock;
1136 unsigned int count = 0, i = 0; 1131 struct file_lock_context *flctx = inode->i_flctx;
1132 unsigned int count = 0, i;
1137 int rc = 0, xid, type; 1133 int rc = 0, xid, type;
1138 struct list_head locks_to_send, *el; 1134 struct list_head locks_to_send, *el;
1139 struct lock_to_push *lck, *tmp; 1135 struct lock_to_push *lck, *tmp;
@@ -1141,12 +1137,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1141 1137
1142 xid = get_xid(); 1138 xid = get_xid();
1143 1139
1144 spin_lock(&inode->i_lock); 1140 if (!flctx)
1145 cifs_for_each_lock(inode, before) { 1141 goto out;
1146 if ((*before)->fl_flags & FL_POSIX) 1142
1147 count++; 1143 spin_lock(&flctx->flc_lock);
1144 list_for_each(el, &flctx->flc_posix) {
1145 count++;
1148 } 1146 }
1149 spin_unlock(&inode->i_lock); 1147 spin_unlock(&flctx->flc_lock);
1150 1148
1151 INIT_LIST_HEAD(&locks_to_send); 1149 INIT_LIST_HEAD(&locks_to_send);
1152 1150
@@ -1155,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1155 * added to the list while we are holding cinode->lock_sem that 1153 * added to the list while we are holding cinode->lock_sem that
1156 * protects locking operations of this inode. 1154 * protects locking operations of this inode.
1157 */ 1155 */
1158 for (; i < count; i++) { 1156 for (i = 0; i < count; i++) {
1159 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); 1157 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
1160 if (!lck) { 1158 if (!lck) {
1161 rc = -ENOMEM; 1159 rc = -ENOMEM;
@@ -1165,11 +1163,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1165 } 1163 }
1166 1164
1167 el = locks_to_send.next; 1165 el = locks_to_send.next;
1168 spin_lock(&inode->i_lock); 1166 spin_lock(&flctx->flc_lock);
1169 cifs_for_each_lock(inode, before) { 1167 list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
1170 flock = *before;
1171 if ((flock->fl_flags & FL_POSIX) == 0)
1172 continue;
1173 if (el == &locks_to_send) { 1168 if (el == &locks_to_send) {
1174 /* 1169 /*
1175 * The list ended. We don't have enough allocated 1170 * The list ended. We don't have enough allocated
@@ -1189,9 +1184,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1189 lck->length = length; 1184 lck->length = length;
1190 lck->type = type; 1185 lck->type = type;
1191 lck->offset = flock->fl_start; 1186 lck->offset = flock->fl_start;
1192 el = el->next;
1193 } 1187 }
1194 spin_unlock(&inode->i_lock); 1188 spin_unlock(&flctx->flc_lock);
1195 1189
1196 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { 1190 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
1197 int stored_rc; 1191 int stored_rc;
@@ -3248,7 +3242,6 @@ static struct vm_operations_struct cifs_file_vm_ops = {
3248 .fault = filemap_fault, 3242 .fault = filemap_fault,
3249 .map_pages = filemap_map_pages, 3243 .map_pages = filemap_map_pages,
3250 .page_mkwrite = cifs_page_mkwrite, 3244 .page_mkwrite = cifs_page_mkwrite,
3251 .remap_pages = generic_file_remap_pages,
3252}; 3245};
3253 3246
3254int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 3247int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
937 inode->i_flags |= S_NOATIME | S_NOCMTIME; 937 inode->i_flags |= S_NOATIME | S_NOCMTIME;
938 if (inode->i_state & I_NEW) { 938 if (inode->i_state & I_NEW) {
939 inode->i_ino = hash; 939 inode->i_ino = hash;
940 if (S_ISREG(inode->i_mode))
941 inode->i_data.backing_dev_info = sb->s_bdi;
942#ifdef CONFIG_CIFS_FSCACHE 940#ifdef CONFIG_CIFS_FSCACHE
943 /* initialize per-inode cache cookie pointer */ 941 /* initialize per-inode cache cookie pointer */
944 CIFS_I(inode)->fscache = NULL; 942 CIFS_I(inode)->fscache = NULL;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 86c893884eb9..281ee011bb6a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -28,29 +28,6 @@
28 28
29#include "coda_int.h" 29#include "coda_int.h"
30 30
31/* dir inode-ops */
32static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl);
33static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags);
34static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
35 struct dentry *entry);
36static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
37static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
38 const char *symname);
39static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
40static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
41static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
42 struct inode *new_inode, struct dentry *new_dentry);
43
44/* dir file-ops */
45static int coda_readdir(struct file *file, struct dir_context *ctx);
46
47/* dentry ops */
48static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
49static int coda_dentry_delete(const struct dentry *);
50
51/* support routines */
52static int coda_venus_readdir(struct file *, struct dir_context *);
53
54/* same as fs/bad_inode.c */ 31/* same as fs/bad_inode.c */
55static int coda_return_EIO(void) 32static int coda_return_EIO(void)
56{ 33{
@@ -58,38 +35,6 @@ static int coda_return_EIO(void)
58} 35}
59#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 36#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
60 37
61const struct dentry_operations coda_dentry_operations =
62{
63 .d_revalidate = coda_dentry_revalidate,
64 .d_delete = coda_dentry_delete,
65};
66
67const struct inode_operations coda_dir_inode_operations =
68{
69 .create = coda_create,
70 .lookup = coda_lookup,
71 .link = coda_link,
72 .unlink = coda_unlink,
73 .symlink = coda_symlink,
74 .mkdir = coda_mkdir,
75 .rmdir = coda_rmdir,
76 .mknod = CODA_EIO_ERROR,
77 .rename = coda_rename,
78 .permission = coda_permission,
79 .getattr = coda_getattr,
80 .setattr = coda_setattr,
81};
82
83const struct file_operations coda_dir_operations = {
84 .llseek = generic_file_llseek,
85 .read = generic_read_dir,
86 .iterate = coda_readdir,
87 .open = coda_open,
88 .release = coda_release,
89 .fsync = coda_fsync,
90};
91
92
93/* inode operations for directories */ 38/* inode operations for directories */
94/* access routines: lookup, readlink, permission */ 39/* access routines: lookup, readlink, permission */
95static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) 40static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags)
@@ -374,33 +319,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
374 return error; 319 return error;
375} 320}
376 321
377
378/* file operations for directories */
379static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
380{
381 struct coda_file_info *cfi;
382 struct file *host_file;
383 int ret;
384
385 cfi = CODA_FTOC(coda_file);
386 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
387 host_file = cfi->cfi_container;
388
389 if (host_file->f_op->iterate) {
390 struct inode *host_inode = file_inode(host_file);
391 mutex_lock(&host_inode->i_mutex);
392 ret = -ENOENT;
393 if (!IS_DEADDIR(host_inode)) {
394 ret = host_file->f_op->iterate(host_file, ctx);
395 file_accessed(host_file);
396 }
397 mutex_unlock(&host_inode->i_mutex);
398 return ret;
399 }
400 /* Venus: we must read Venus dirents from a file */
401 return coda_venus_readdir(coda_file, ctx);
402}
403
404static inline unsigned int CDT2DT(unsigned char cdt) 322static inline unsigned int CDT2DT(unsigned char cdt)
405{ 323{
406 unsigned int dt; 324 unsigned int dt;
@@ -495,6 +413,33 @@ out:
495 return 0; 413 return 0;
496} 414}
497 415
416/* file operations for directories */
417static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
418{
419 struct coda_file_info *cfi;
420 struct file *host_file;
421 int ret;
422
423 cfi = CODA_FTOC(coda_file);
424 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
425 host_file = cfi->cfi_container;
426
427 if (host_file->f_op->iterate) {
428 struct inode *host_inode = file_inode(host_file);
429
430 mutex_lock(&host_inode->i_mutex);
431 ret = -ENOENT;
432 if (!IS_DEADDIR(host_inode)) {
433 ret = host_file->f_op->iterate(host_file, ctx);
434 file_accessed(host_file);
435 }
436 mutex_unlock(&host_inode->i_mutex);
437 return ret;
438 }
439 /* Venus: we must read Venus dirents from a file */
440 return coda_venus_readdir(coda_file, ctx);
441}
442
498/* called when a cache lookup succeeds */ 443/* called when a cache lookup succeeds */
499static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) 444static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
500{ 445{
@@ -603,3 +548,32 @@ int coda_revalidate_inode(struct inode *inode)
603 } 548 }
604 return 0; 549 return 0;
605} 550}
551
552const struct dentry_operations coda_dentry_operations = {
553 .d_revalidate = coda_dentry_revalidate,
554 .d_delete = coda_dentry_delete,
555};
556
557const struct inode_operations coda_dir_inode_operations = {
558 .create = coda_create,
559 .lookup = coda_lookup,
560 .link = coda_link,
561 .unlink = coda_unlink,
562 .symlink = coda_symlink,
563 .mkdir = coda_mkdir,
564 .rmdir = coda_rmdir,
565 .mknod = CODA_EIO_ERROR,
566 .rename = coda_rename,
567 .permission = coda_permission,
568 .getattr = coda_getattr,
569 .setattr = coda_setattr,
570};
571
572const struct file_operations coda_dir_operations = {
573 .llseek = generic_file_llseek,
574 .read = generic_read_dir,
575 .iterate = coda_readdir,
576 .open = coda_open,
577 .release = coda_release,
578 .fsync = coda_fsync,
579};
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
183 goto unlock_out; 183 goto unlock_out;
184 } 184 }
185 185
186 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 186 error = bdi_setup_and_register(&vc->bdi, "coda");
187 if (error) 187 if (error)
188 goto unlock_out; 188 goto unlock_out;
189 189
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
70 70
71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); 71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); 72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
73extern int configfs_inode_init(void);
74extern void configfs_inode_exit(void);
75 73
76extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); 74extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
77extern int configfs_make_dirent(struct configfs_dirent *, 75extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
50 .write_end = simple_write_end, 50 .write_end = simple_write_end,
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
55 .ra_pages = 0, /* No readahead */
56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
57};
58
59static const struct inode_operations configfs_inode_operations ={ 53static const struct inode_operations configfs_inode_operations ={
60 .setattr = configfs_setattr, 54 .setattr = configfs_setattr,
61}; 55};
@@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
137 if (inode) { 131 if (inode) {
138 inode->i_ino = get_next_ino(); 132 inode->i_ino = get_next_ino();
139 inode->i_mapping->a_ops = &configfs_aops; 133 inode->i_mapping->a_ops = &configfs_aops;
140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
141 inode->i_op = &configfs_inode_operations; 134 inode->i_op = &configfs_inode_operations;
142 135
143 if (sd->s_iattr) { 136 if (sd->s_iattr) {
@@ -283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
283 } 276 }
284 mutex_unlock(&dir->d_inode->i_mutex); 277 mutex_unlock(&dir->d_inode->i_mutex);
285} 278}
286
287int __init configfs_inode_init(void)
288{
289 return bdi_init(&configfs_backing_dev_info);
290}
291
292void configfs_inode_exit(void)
293{
294 bdi_destroy(&configfs_backing_dev_info);
295}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
145 if (!config_kobj) 145 if (!config_kobj)
146 goto out2; 146 goto out2;
147 147
148 err = configfs_inode_init();
149 if (err)
150 goto out3;
151
152 err = register_filesystem(&configfs_fs_type); 148 err = register_filesystem(&configfs_fs_type);
153 if (err) 149 if (err)
154 goto out4; 150 goto out3;
155 151
156 return 0; 152 return 0;
157out4:
158 pr_err("Unable to register filesystem!\n");
159 configfs_inode_exit();
160out3: 153out3:
154 pr_err("Unable to register filesystem!\n");
161 kobject_put(config_kobj); 155 kobject_put(config_kobj);
162out2: 156out2:
163 kmem_cache_destroy(configfs_dir_cachep); 157 kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
172 kobject_put(config_kobj); 166 kobject_put(config_kobj);
173 kmem_cache_destroy(configfs_dir_cachep); 167 kmem_cache_destroy(configfs_dir_cachep);
174 configfs_dir_cachep = NULL; 168 configfs_dir_cachep = NULL;
175 configfs_inode_exit();
176} 169}
177 170
178MODULE_AUTHOR("Oracle"); 171MODULE_AUTHOR("Oracle");
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..ed1619ec6537
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,534 @@
1/*
2 * fs/dax.c - Direct Access filesystem code
3 * Copyright (c) 2013-2014 Intel Corporation
4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/atomic.h>
18#include <linux/blkdev.h>
19#include <linux/buffer_head.h>
20#include <linux/fs.h>
21#include <linux/genhd.h>
22#include <linux/highmem.h>
23#include <linux/memcontrol.h>
24#include <linux/mm.h>
25#include <linux/mutex.h>
26#include <linux/sched.h>
27#include <linux/uio.h>
28#include <linux/vmstat.h>
29
30int dax_clear_blocks(struct inode *inode, sector_t block, long size)
31{
32 struct block_device *bdev = inode->i_sb->s_bdev;
33 sector_t sector = block << (inode->i_blkbits - 9);
34
35 might_sleep();
36 do {
37 void *addr;
38 unsigned long pfn;
39 long count;
40
41 count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
42 if (count < 0)
43 return count;
44 BUG_ON(size < count);
45 while (count > 0) {
46 unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
47 if (pgsz > count)
48 pgsz = count;
49 if (pgsz < PAGE_SIZE)
50 memset(addr, 0, pgsz);
51 else
52 clear_page(addr);
53 addr += pgsz;
54 size -= pgsz;
55 count -= pgsz;
56 BUG_ON(pgsz & 511);
57 sector += pgsz / 512;
58 cond_resched();
59 }
60 } while (size);
61
62 return 0;
63}
64EXPORT_SYMBOL_GPL(dax_clear_blocks);
65
66static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
67{
68 unsigned long pfn;
69 sector_t sector = bh->b_blocknr << (blkbits - 9);
70 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
71}
72
73static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
74 loff_t end)
75{
76 loff_t final = end - pos + first; /* The final byte of the buffer */
77
78 if (first > 0)
79 memset(addr, 0, first);
80 if (final < size)
81 memset(addr + final, 0, size - final);
82}
83
84static bool buffer_written(struct buffer_head *bh)
85{
86 return buffer_mapped(bh) && !buffer_unwritten(bh);
87}
88
89/*
90 * When ext4 encounters a hole, it returns without modifying the buffer_head
91 * which means that we can't trust b_size. To cope with this, we set b_state
92 * to 0 before calling get_block and, if any bit is set, we know we can trust
93 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
 94 * and could save us the repeated get_block calls if it told us.
95 */
96static bool buffer_size_valid(struct buffer_head *bh)
97{
98 return bh->b_state != 0;
99}
100
101static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
102 loff_t start, loff_t end, get_block_t get_block,
103 struct buffer_head *bh)
104{
105 ssize_t retval = 0;
106 loff_t pos = start;
107 loff_t max = start;
108 loff_t bh_max = start;
109 void *addr;
110 bool hole = false;
111
112 if (rw != WRITE)
113 end = min(end, i_size_read(inode));
114
115 while (pos < end) {
116 unsigned len;
117 if (pos == max) {
118 unsigned blkbits = inode->i_blkbits;
119 sector_t block = pos >> blkbits;
120 unsigned first = pos - (block << blkbits);
121 long size;
122
123 if (pos == bh_max) {
124 bh->b_size = PAGE_ALIGN(end - pos);
125 bh->b_state = 0;
126 retval = get_block(inode, block, bh,
127 rw == WRITE);
128 if (retval)
129 break;
130 if (!buffer_size_valid(bh))
131 bh->b_size = 1 << blkbits;
132 bh_max = pos - first + bh->b_size;
133 } else {
134 unsigned done = bh->b_size -
135 (bh_max - (pos - first));
136 bh->b_blocknr += done >> blkbits;
137 bh->b_size -= done;
138 }
139
140 hole = (rw != WRITE) && !buffer_written(bh);
141 if (hole) {
142 addr = NULL;
143 size = bh->b_size - first;
144 } else {
145 retval = dax_get_addr(bh, &addr, blkbits);
146 if (retval < 0)
147 break;
148 if (buffer_unwritten(bh) || buffer_new(bh))
149 dax_new_buf(addr, retval, first, pos,
150 end);
151 addr += first;
152 size = retval - first;
153 }
154 max = min(pos + size, end);
155 }
156
157 if (rw == WRITE)
158 len = copy_from_iter(addr, max - pos, iter);
159 else if (!hole)
160 len = copy_to_iter(addr, max - pos, iter);
161 else
162 len = iov_iter_zero(max - pos, iter);
163
164 if (!len)
165 break;
166
167 pos += len;
168 addr += len;
169 }
170
171 return (pos == start) ? retval : pos - start;
172}
173
174/**
175 * dax_do_io - Perform I/O to a DAX file
176 * @rw: READ to read or WRITE to write
177 * @iocb: The control block for this I/O
178 * @inode: The file which the I/O is directed at
179 * @iter: The addresses to do I/O from or to
180 * @pos: The file offset where the I/O starts
181 * @get_block: The filesystem method used to translate file offsets to blocks
182 * @end_io: A filesystem callback for I/O completion
183 * @flags: See below
184 *
185 * This function uses the same locking scheme as do_blockdev_direct_IO:
186 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
187 * caller for writes. For reads, we take and release the i_mutex ourselves.
188 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
189 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
190 * is in progress.
191 */
192ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
193 struct iov_iter *iter, loff_t pos,
194 get_block_t get_block, dio_iodone_t end_io, int flags)
195{
196 struct buffer_head bh;
197 ssize_t retval = -EINVAL;
198 loff_t end = pos + iov_iter_count(iter);
199
200 memset(&bh, 0, sizeof(bh));
201
202 if ((flags & DIO_LOCKING) && (rw == READ)) {
203 struct address_space *mapping = inode->i_mapping;
204 mutex_lock(&inode->i_mutex);
205 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
206 if (retval) {
207 mutex_unlock(&inode->i_mutex);
208 goto out;
209 }
210 }
211
212 /* Protects against truncate */
213 atomic_inc(&inode->i_dio_count);
214
215 retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
216
217 if ((flags & DIO_LOCKING) && (rw == READ))
218 mutex_unlock(&inode->i_mutex);
219
220 if ((retval > 0) && end_io)
221 end_io(iocb, pos, retval, bh.b_private);
222
223 inode_dio_done(inode);
224 out:
225 return retval;
226}
227EXPORT_SYMBOL_GPL(dax_do_io);
228
229/*
230 * The user has performed a load from a hole in the file. Allocating
231 * a new page in the file would cause excessive storage usage for
232 * workloads with sparse files. We allocate a page cache page instead.
233 * We'll kick it out of the page cache if it's ever written to,
234 * otherwise it will simply fall out of the page cache under memory
235 * pressure without ever having been dirtied.
236 */
237static int dax_load_hole(struct address_space *mapping, struct page *page,
238 struct vm_fault *vmf)
239{
240 unsigned long size;
241 struct inode *inode = mapping->host;
242 if (!page)
243 page = find_or_create_page(mapping, vmf->pgoff,
244 GFP_KERNEL | __GFP_ZERO);
245 if (!page)
246 return VM_FAULT_OOM;
247 /* Recheck i_size under page lock to avoid truncate race */
248 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
249 if (vmf->pgoff >= size) {
250 unlock_page(page);
251 page_cache_release(page);
252 return VM_FAULT_SIGBUS;
253 }
254
255 vmf->page = page;
256 return VM_FAULT_LOCKED;
257}
258
259static int copy_user_bh(struct page *to, struct buffer_head *bh,
260 unsigned blkbits, unsigned long vaddr)
261{
262 void *vfrom, *vto;
263 if (dax_get_addr(bh, &vfrom, blkbits) < 0)
264 return -EIO;
265 vto = kmap_atomic(to);
266 copy_user_page(vto, vfrom, vaddr, to);
267 kunmap_atomic(vto);
268 return 0;
269}
270
271static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
272 struct vm_area_struct *vma, struct vm_fault *vmf)
273{
274 struct address_space *mapping = inode->i_mapping;
275 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
276 unsigned long vaddr = (unsigned long)vmf->virtual_address;
277 void *addr;
278 unsigned long pfn;
279 pgoff_t size;
280 int error;
281
282 i_mmap_lock_read(mapping);
283
284 /*
285 * Check truncate didn't happen while we were allocating a block.
286 * If it did, this block may or may not be still allocated to the
287 * file. We can't tell the filesystem to free it because we can't
288 * take i_mutex here. In the worst case, the file still has blocks
289 * allocated past the end of the file.
290 */
291 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
292 if (unlikely(vmf->pgoff >= size)) {
293 error = -EIO;
294 goto out;
295 }
296
297 error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
298 if (error < 0)
299 goto out;
300 if (error < PAGE_SIZE) {
301 error = -EIO;
302 goto out;
303 }
304
305 if (buffer_unwritten(bh) || buffer_new(bh))
306 clear_page(addr);
307
308 error = vm_insert_mixed(vma, vaddr, pfn);
309
310 out:
311 i_mmap_unlock_read(mapping);
312
313 if (bh->b_end_io)
314 bh->b_end_io(bh, 1);
315
316 return error;
317}
318
319static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
320 get_block_t get_block)
321{
322 struct file *file = vma->vm_file;
323 struct address_space *mapping = file->f_mapping;
324 struct inode *inode = mapping->host;
325 struct page *page;
326 struct buffer_head bh;
327 unsigned long vaddr = (unsigned long)vmf->virtual_address;
328 unsigned blkbits = inode->i_blkbits;
329 sector_t block;
330 pgoff_t size;
331 int error;
332 int major = 0;
333
334 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
335 if (vmf->pgoff >= size)
336 return VM_FAULT_SIGBUS;
337
338 memset(&bh, 0, sizeof(bh));
339 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
340 bh.b_size = PAGE_SIZE;
341
342 repeat:
343 page = find_get_page(mapping, vmf->pgoff);
344 if (page) {
345 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
346 page_cache_release(page);
347 return VM_FAULT_RETRY;
348 }
349 if (unlikely(page->mapping != mapping)) {
350 unlock_page(page);
351 page_cache_release(page);
352 goto repeat;
353 }
354 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
355 if (unlikely(vmf->pgoff >= size)) {
356 /*
357 * We have a struct page covering a hole in the file
358 * from a read fault and we've raced with a truncate
359 */
360 error = -EIO;
361 goto unlock_page;
362 }
363 }
364
365 error = get_block(inode, block, &bh, 0);
366 if (!error && (bh.b_size < PAGE_SIZE))
367 error = -EIO; /* fs corruption? */
368 if (error)
369 goto unlock_page;
370
371 if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
372 if (vmf->flags & FAULT_FLAG_WRITE) {
373 error = get_block(inode, block, &bh, 1);
374 count_vm_event(PGMAJFAULT);
375 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
376 major = VM_FAULT_MAJOR;
377 if (!error && (bh.b_size < PAGE_SIZE))
378 error = -EIO;
379 if (error)
380 goto unlock_page;
381 } else {
382 return dax_load_hole(mapping, page, vmf);
383 }
384 }
385
386 if (vmf->cow_page) {
387 struct page *new_page = vmf->cow_page;
388 if (buffer_written(&bh))
389 error = copy_user_bh(new_page, &bh, blkbits, vaddr);
390 else
391 clear_user_highpage(new_page, vaddr);
392 if (error)
393 goto unlock_page;
394 vmf->page = page;
395 if (!page) {
396 i_mmap_lock_read(mapping);
397 /* Check we didn't race with truncate */
398 size = (i_size_read(inode) + PAGE_SIZE - 1) >>
399 PAGE_SHIFT;
400 if (vmf->pgoff >= size) {
401 i_mmap_unlock_read(mapping);
402 error = -EIO;
403 goto out;
404 }
405 }
406 return VM_FAULT_LOCKED;
407 }
408
409 /* Check we didn't race with a read fault installing a new page */
410 if (!page && major)
411 page = find_lock_page(mapping, vmf->pgoff);
412
413 if (page) {
414 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
415 PAGE_CACHE_SIZE, 0);
416 delete_from_page_cache(page);
417 unlock_page(page);
418 page_cache_release(page);
419 }
420
421 error = dax_insert_mapping(inode, &bh, vma, vmf);
422
423 out:
424 if (error == -ENOMEM)
425 return VM_FAULT_OOM | major;
426 /* -EBUSY is fine, somebody else faulted on the same PTE */
427 if ((error < 0) && (error != -EBUSY))
428 return VM_FAULT_SIGBUS | major;
429 return VM_FAULT_NOPAGE | major;
430
431 unlock_page:
432 if (page) {
433 unlock_page(page);
434 page_cache_release(page);
435 }
436 goto out;
437}
438
439/**
440 * dax_fault - handle a page fault on a DAX file
441 * @vma: The virtual memory area where the fault occurred
442 * @vmf: The description of the fault
443 * @get_block: The filesystem method used to translate file offsets to blocks
444 *
445 * When a page fault occurs, filesystems may call this helper in their
446 * fault handler for DAX files.
447 */
448int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
449 get_block_t get_block)
450{
451 int result;
452 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
453
454 if (vmf->flags & FAULT_FLAG_WRITE) {
455 sb_start_pagefault(sb);
456 file_update_time(vma->vm_file);
457 }
458 result = do_dax_fault(vma, vmf, get_block);
459 if (vmf->flags & FAULT_FLAG_WRITE)
460 sb_end_pagefault(sb);
461
462 return result;
463}
464EXPORT_SYMBOL_GPL(dax_fault);
465
466/**
467 * dax_zero_page_range - zero a range within a page of a DAX file
468 * @inode: The file being truncated
469 * @from: The file offset that is being truncated to
470 * @length: The number of bytes to zero
471 * @get_block: The filesystem method used to translate file offsets to blocks
472 *
473 * This function can be called by a filesystem when it is zeroing part of a
474 * page in a DAX file. This is intended for hole-punch operations. If
475 * you are truncating a file, the helper function dax_truncate_page() may be
476 * more convenient.
477 *
478 * We work in terms of PAGE_CACHE_SIZE here for commonality with
479 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
480 * took care of disposing of the unnecessary blocks. Even if the filesystem
481 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
482 * since the file might be mmapped.
483 */
484int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
485 get_block_t get_block)
486{
487 struct buffer_head bh;
488 pgoff_t index = from >> PAGE_CACHE_SHIFT;
489 unsigned offset = from & (PAGE_CACHE_SIZE-1);
490 int err;
491
492 /* Block boundary? Nothing to do */
493 if (!length)
494 return 0;
495 BUG_ON((offset + length) > PAGE_CACHE_SIZE);
496
497 memset(&bh, 0, sizeof(bh));
498 bh.b_size = PAGE_CACHE_SIZE;
499 err = get_block(inode, index, &bh, 0);
500 if (err < 0)
501 return err;
502 if (buffer_written(&bh)) {
503 void *addr;
504 err = dax_get_addr(&bh, &addr, inode->i_blkbits);
505 if (err < 0)
506 return err;
507 memset(addr + offset, 0, length);
508 }
509
510 return 0;
511}
512EXPORT_SYMBOL_GPL(dax_zero_page_range);
513
514/**
515 * dax_truncate_page - handle a partial page being truncated in a DAX file
516 * @inode: The file being truncated
517 * @from: The file offset that is being truncated to
518 * @get_block: The filesystem method used to translate file offsets to blocks
519 *
520 * Similar to block_truncate_page(), this function can be called by a
521 * filesystem when it is truncating a DAX file to handle the partial page.
522 *
523 * We work in terms of PAGE_CACHE_SIZE here for commonality with
524 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
525 * took care of disposing of the unnecessary blocks. Even if the filesystem
526 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
527 * since the file might be mmapped.
528 */
529int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
530{
531 unsigned length = PAGE_CACHE_ALIGN(from) - from;
532 return dax_zero_page_range(inode, from, length, get_block);
533}
534EXPORT_SYMBOL_GPL(dax_truncate_page);
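Taken together, these exports are the whole DAX surface a filesystem needs: dax_do_io() for the read/write path, dax_fault() for mmap, and dax_zero_page_range()/dax_truncate_page() for truncation. A hedged sketch of the wiring, loosely following the ext2 hookup elsewhere in this series (the example_* names and the get_block callback are placeholders):

	static int example_dax_fault(struct vm_area_struct *vma,
				     struct vm_fault *vmf)
	{
		/* example_get_block is the filesystem's get_block_t. */
		return dax_fault(vma, vmf, example_get_block);
	}

	static const struct vm_operations_struct example_dax_vm_ops = {
		.fault		= example_dax_fault,
		.page_mkwrite	= example_dax_fault,	/* write faults allocate */
	};

	static int example_file_mmap(struct file *file, struct vm_area_struct *vma)
	{
		if (!IS_DAX(file_inode(file)))
			return generic_file_mmap(file, vma);

		file_accessed(file);
		vma->vm_ops = &example_dax_vm_ops;
		/* dax_insert_mapping() uses vm_insert_mixed(), so: */
		vma->vm_flags |= VM_MIXEDMAP;
		return 0;
	}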
diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..dc400fd29f4d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -38,6 +38,8 @@
38#include <linux/prefetch.h> 38#include <linux/prefetch.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/list_lru.h> 40#include <linux/list_lru.h>
41#include <linux/kasan.h>
42
41#include "internal.h" 43#include "internal.h"
42#include "mount.h" 44#include "mount.h"
43 45
@@ -400,19 +402,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
400 * LRU lists entirely, while shrink_move moves it to the indicated 402 * LRU lists entirely, while shrink_move moves it to the indicated
401 * private list. 403 * private list.
402 */ 404 */
403static void d_lru_isolate(struct dentry *dentry) 405static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
404{ 406{
405 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); 407 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
406 dentry->d_flags &= ~DCACHE_LRU_LIST; 408 dentry->d_flags &= ~DCACHE_LRU_LIST;
407 this_cpu_dec(nr_dentry_unused); 409 this_cpu_dec(nr_dentry_unused);
408 list_del_init(&dentry->d_lru); 410 list_lru_isolate(lru, &dentry->d_lru);
409} 411}
410 412
411static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) 413static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
414 struct list_head *list)
412{ 415{
413 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); 416 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
414 dentry->d_flags |= DCACHE_SHRINK_LIST; 417 dentry->d_flags |= DCACHE_SHRINK_LIST;
415 list_move_tail(&dentry->d_lru, list); 418 list_lru_isolate_move(lru, &dentry->d_lru, list);
416} 419}
417 420
418/* 421/*
@@ -508,7 +511,7 @@ static void __dentry_kill(struct dentry *dentry)
508 * dentry_iput drops the locks, at which point nobody (except 511 * dentry_iput drops the locks, at which point nobody (except
509 * transient RCU lookups) can reach this dentry. 512 * transient RCU lookups) can reach this dentry.
510 */ 513 */
511 BUG_ON((int)dentry->d_lockref.count > 0); 514 BUG_ON(dentry->d_lockref.count > 0);
512 this_cpu_dec(nr_dentry); 515 this_cpu_dec(nr_dentry);
513 if (dentry->d_op && dentry->d_op->d_release) 516 if (dentry->d_op && dentry->d_op->d_release)
514 dentry->d_op->d_release(dentry); 517 dentry->d_op->d_release(dentry);
@@ -561,7 +564,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
561 struct dentry *parent = dentry->d_parent; 564 struct dentry *parent = dentry->d_parent;
562 if (IS_ROOT(dentry)) 565 if (IS_ROOT(dentry))
563 return NULL; 566 return NULL;
564 if (unlikely((int)dentry->d_lockref.count < 0)) 567 if (unlikely(dentry->d_lockref.count < 0))
565 return NULL; 568 return NULL;
566 if (likely(spin_trylock(&parent->d_lock))) 569 if (likely(spin_trylock(&parent->d_lock)))
567 return parent; 570 return parent;
@@ -590,6 +593,110 @@ again:
590 return parent; 593 return parent;
591} 594}
592 595
596/*
597 * Try to do a lockless dput(), and return whether that was successful.
598 *
599 * If unsuccessful, we return false, having already taken the dentry lock.
600 *
601 * The caller needs to hold the RCU read lock, so that the dentry is
602 * guaranteed to stay around even if the refcount goes down to zero!
603 */
604static inline bool fast_dput(struct dentry *dentry)
605{
606 int ret;
607 unsigned int d_flags;
608
609 /*
 610 * If we have a d_op->d_delete() operation, we should not
 611 * let the dentry count go to zero, so use "put_or_lock".
612 */
613 if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
614 return lockref_put_or_lock(&dentry->d_lockref);
615
616 /*
617 * .. otherwise, we can try to just decrement the
618 * lockref optimistically.
619 */
620 ret = lockref_put_return(&dentry->d_lockref);
621
622 /*
623 * If the lockref_put_return() failed due to the lock being held
624 * by somebody else, the fast path has failed. We will need to
625 * get the lock, and then check the count again.
626 */
627 if (unlikely(ret < 0)) {
628 spin_lock(&dentry->d_lock);
629 if (dentry->d_lockref.count > 1) {
630 dentry->d_lockref.count--;
631 spin_unlock(&dentry->d_lock);
632 return 1;
633 }
634 return 0;
635 }
636
637 /*
638 * If we weren't the last ref, we're done.
639 */
640 if (ret)
641 return 1;
642
643 /*
644 * Careful, careful. The reference count went down
645 * to zero, but we don't hold the dentry lock, so
646 * somebody else could get it again, and do another
647 * dput(), and we need to not race with that.
648 *
649 * However, there is a very special and common case
650 * where we don't care, because there is nothing to
651 * do: the dentry is still hashed, it does not have
652 * a 'delete' op, and it's referenced and already on
653 * the LRU list.
654 *
655 * NOTE! Since we aren't locked, these values are
656 * not "stable". However, it is sufficient that at
657 * some point after we dropped the reference the
658 * dentry was hashed and the flags had the proper
659 * value. Other dentry users may have re-gotten
 660 * a reference to the dentry and changed that, but
661 * our work is done - we can leave the dentry
662 * around with a zero refcount.
663 */
664 smp_rmb();
665 d_flags = ACCESS_ONCE(dentry->d_flags);
666 d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
667
668 /* Nothing to do? Dropping the reference was all we needed? */
669 if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
670 return 1;
671
672 /*
673 * Not the fast normal case? Get the lock. We've already decremented
674 * the refcount, but we'll need to re-check the situation after
675 * getting the lock.
676 */
677 spin_lock(&dentry->d_lock);
678
679 /*
680 * Did somebody else grab a reference to it in the meantime, and
681 * we're no longer the last user after all? Alternatively, somebody
682 * else could have killed it and marked it dead. Either way, we
683 * don't need to do anything else.
684 */
685 if (dentry->d_lockref.count) {
686 spin_unlock(&dentry->d_lock);
687 return 1;
688 }
689
690 /*
691 * Re-get the reference we optimistically dropped. We hold the
692 * lock, and we just tested that it was zero, so we can just
693 * set it to 1.
694 */
695 dentry->d_lockref.count = 1;
696 return 0;
697}
698
699
593/* 700/*
594 * This is dput 701 * This is dput
595 * 702 *
@@ -622,8 +729,14 @@ void dput(struct dentry *dentry)
622 return; 729 return;
623 730
624repeat: 731repeat:
625 if (lockref_put_or_lock(&dentry->d_lockref)) 732 rcu_read_lock();
733 if (likely(fast_dput(dentry))) {
734 rcu_read_unlock();
626 return; 735 return;
736 }
737
738 /* Slow case: now with the dentry lock held */
739 rcu_read_unlock();
627 740
628 /* Unreachable? Get rid of it */ 741 /* Unreachable? Get rid of it */
629 if (unlikely(d_unhashed(dentry))) 742 if (unlikely(d_unhashed(dentry)))
@@ -810,7 +923,7 @@ static void shrink_dentry_list(struct list_head *list)
810 * We found an inuse dentry which was not removed from 923 * We found an inuse dentry which was not removed from
811 * the LRU because of laziness during lookup. Do not free it. 924 * the LRU because of laziness during lookup. Do not free it.
812 */ 925 */
813 if ((int)dentry->d_lockref.count > 0) { 926 if (dentry->d_lockref.count > 0) {
814 spin_unlock(&dentry->d_lock); 927 spin_unlock(&dentry->d_lock);
815 if (parent) 928 if (parent)
816 spin_unlock(&parent->d_lock); 929 spin_unlock(&parent->d_lock);
@@ -869,8 +982,8 @@ static void shrink_dentry_list(struct list_head *list)
869 } 982 }
870} 983}
871 984
872static enum lru_status 985static enum lru_status dentry_lru_isolate(struct list_head *item,
873dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 986 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
874{ 987{
875 struct list_head *freeable = arg; 988 struct list_head *freeable = arg;
876 struct dentry *dentry = container_of(item, struct dentry, d_lru); 989 struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -890,7 +1003,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
890 * another pass through the LRU. 1003 * another pass through the LRU.
891 */ 1004 */
892 if (dentry->d_lockref.count) { 1005 if (dentry->d_lockref.count) {
893 d_lru_isolate(dentry); 1006 d_lru_isolate(lru, dentry);
894 spin_unlock(&dentry->d_lock); 1007 spin_unlock(&dentry->d_lock);
895 return LRU_REMOVED; 1008 return LRU_REMOVED;
896 } 1009 }
@@ -921,7 +1034,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
921 return LRU_ROTATE; 1034 return LRU_ROTATE;
922 } 1035 }
923 1036
924 d_lru_shrink_move(dentry, freeable); 1037 d_lru_shrink_move(lru, dentry, freeable);
925 spin_unlock(&dentry->d_lock); 1038 spin_unlock(&dentry->d_lock);
926 1039
927 return LRU_REMOVED; 1040 return LRU_REMOVED;
@@ -930,30 +1043,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
930/** 1043/**
931 * prune_dcache_sb - shrink the dcache 1044 * prune_dcache_sb - shrink the dcache
932 * @sb: superblock 1045 * @sb: superblock
933 * @nr_to_scan : number of entries to try to free 1046 * @sc: shrink control, passed to list_lru_shrink_walk()
934 * @nid: which node to scan for freeable entities
935 * 1047 *
936 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is 1048 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
937 * done when we need more memory an called from the superblock shrinker 1049 * is done when we need more memory and called from the superblock shrinker
938 * function. 1050 * function.
939 * 1051 *
940 * This function may fail to free any resources if all the dentries are in 1052 * This function may fail to free any resources if all the dentries are in
941 * use. 1053 * use.
942 */ 1054 */
943long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 1055long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
944 int nid)
945{ 1056{
946 LIST_HEAD(dispose); 1057 LIST_HEAD(dispose);
947 long freed; 1058 long freed;
948 1059
949 freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, 1060 freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
950 &dispose, &nr_to_scan); 1061 dentry_lru_isolate, &dispose);
951 shrink_dentry_list(&dispose); 1062 shrink_dentry_list(&dispose);
952 return freed; 1063 return freed;
953} 1064}
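
The extra struct list_lru_one * argument threaded through dentry_lru_isolate() above comes from the list_lru API growing per-memcg lists: every walk callback now receives the list the item currently sits on and must pass it down to the move/isolate helpers. A hedged sketch of the updated callback shape for some other cache, where my_obj, my_isolate and my_lru_move are hypothetical names (dcache's real helpers are d_lru_isolate() and d_lru_shrink_move(), as seen above):

	static enum lru_status my_isolate(struct list_head *item,
			struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
	{
		struct list_head *freeable = arg;
		struct my_obj *obj = container_of(item, struct my_obj, lru);

		/* Don't block the walk on a contended object. */
		if (!spin_trylock(&obj->lock))
			return LRU_SKIP;

		/* The lru argument is what the per-memcg accounting keys
		 * off; my_lru_move() stands in for the subsystem's own
		 * helper that moves the object to the dispose list. */
		my_lru_move(lru, obj, freeable);
		spin_unlock(&obj->lock);
		return LRU_REMOVED;
	}
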
954 1065
955static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, 1066static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
956 spinlock_t *lru_lock, void *arg) 1067 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
957{ 1068{
958 struct list_head *freeable = arg; 1069 struct list_head *freeable = arg;
959 struct dentry *dentry = container_of(item, struct dentry, d_lru); 1070 struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -966,7 +1077,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
966 if (!spin_trylock(&dentry->d_lock)) 1077 if (!spin_trylock(&dentry->d_lock))
967 return LRU_SKIP; 1078 return LRU_SKIP;
968 1079
969 d_lru_shrink_move(dentry, freeable); 1080 d_lru_shrink_move(lru, dentry, freeable);
970 spin_unlock(&dentry->d_lock); 1081 spin_unlock(&dentry->d_lock);
971 1082
972 return LRU_REMOVED; 1083 return LRU_REMOVED;
@@ -1430,6 +1541,9 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
1430 } 1541 }
1431 atomic_set(&p->u.count, 1); 1542 atomic_set(&p->u.count, 1);
1432 dname = p->name; 1543 dname = p->name;
1544 if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
1545 kasan_unpoison_shadow(dname,
1546 round_up(name->len + 1, sizeof(unsigned long)));
1433 } else { 1547 } else {
1434 dname = dentry->d_iname; 1548 dname = dentry->d_iname;
1435 } 1549 }
@@ -2187,37 +2301,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
2187} 2301}
2188EXPORT_SYMBOL(d_hash_and_lookup); 2302EXPORT_SYMBOL(d_hash_and_lookup);
2189 2303
2190/**
2191 * d_validate - verify dentry provided from insecure source (deprecated)
2192 * @dentry: The dentry alleged to be valid child of @dparent
2193 * @dparent: The parent dentry (known to be valid)
2194 *
2195 * An insecure source has sent us a dentry, here we verify it and dget() it.
2196 * This is used by ncpfs in its readdir implementation.
2197 * Zero is returned in the dentry is invalid.
2198 *
2199 * This function is slow for big directories, and deprecated, do not use it.
2200 */
2201int d_validate(struct dentry *dentry, struct dentry *dparent)
2202{
2203 struct dentry *child;
2204
2205 spin_lock(&dparent->d_lock);
2206 list_for_each_entry(child, &dparent->d_subdirs, d_child) {
2207 if (dentry == child) {
2208 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2209 __dget_dlock(dentry);
2210 spin_unlock(&dentry->d_lock);
2211 spin_unlock(&dparent->d_lock);
2212 return 1;
2213 }
2214 }
2215 spin_unlock(&dparent->d_lock);
2216
2217 return 0;
2218}
2219EXPORT_SYMBOL(d_validate);
2220
2221/* 2304/*
2222 * When a file is deleted, we have two options: 2305 * When a file is deleted, we have two options:
2223 * - turn this dentry into a negative dentry 2306 * - turn this dentry into a negative dentry
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 05f2960ed7c3..45b18a5e225c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount;
34static int debugfs_mount_count; 34static int debugfs_mount_count;
35static bool debugfs_registered; 35static bool debugfs_registered;
36 36
37static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev, 37static struct inode *debugfs_get_inode(struct super_block *sb)
38 void *data, const struct file_operations *fops)
39
40{ 38{
41 struct inode *inode = new_inode(sb); 39 struct inode *inode = new_inode(sb);
42
43 if (inode) { 40 if (inode) {
44 inode->i_ino = get_next_ino(); 41 inode->i_ino = get_next_ino();
45 inode->i_mode = mode;
46 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 42 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
47 switch (mode & S_IFMT) {
48 default:
49 init_special_inode(inode, mode, dev);
50 break;
51 case S_IFREG:
52 inode->i_fop = fops ? fops : &debugfs_file_operations;
53 inode->i_private = data;
54 break;
55 case S_IFLNK:
56 inode->i_op = &debugfs_link_operations;
57 inode->i_private = data;
58 break;
59 case S_IFDIR:
60 inode->i_op = &simple_dir_inode_operations;
61 inode->i_fop = &simple_dir_operations;
62
63 /* directory inodes start off with i_nlink == 2
64 * (for "." entry) */
65 inc_nlink(inode);
66 break;
67 }
68 } 43 }
69 return inode; 44 return inode;
70} 45}
71 46
72/* SMP-safe */
73static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
74 umode_t mode, dev_t dev, void *data,
75 const struct file_operations *fops)
76{
77 struct inode *inode;
78 int error = -EPERM;
79
80 if (dentry->d_inode)
81 return -EEXIST;
82
83 inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
84 if (inode) {
85 d_instantiate(dentry, inode);
86 dget(dentry);
87 error = 0;
88 }
89 return error;
90}
91
92static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
93{
94 int res;
95
96 mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
97 res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL);
98 if (!res) {
99 inc_nlink(dir);
100 fsnotify_mkdir(dir, dentry);
101 }
102 return res;
103}
104
105static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
106 void *data)
107{
108 mode = (mode & S_IALLUGO) | S_IFLNK;
109 return debugfs_mknod(dir, dentry, mode, 0, data, NULL);
110}
111
112static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
113 void *data, const struct file_operations *fops)
114{
115 int res;
116
117 mode = (mode & S_IALLUGO) | S_IFREG;
118 res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
119 if (!res)
120 fsnotify_create(dir, dentry);
121 return res;
122}
123
124static inline int debugfs_positive(struct dentry *dentry) 47static inline int debugfs_positive(struct dentry *dentry)
125{ 48{
126 return dentry->d_inode && !d_unhashed(dentry); 49 return dentry->d_inode && !d_unhashed(dentry);
@@ -252,6 +175,18 @@ static const struct super_operations debugfs_super_operations = {
252 .show_options = debugfs_show_options, 175 .show_options = debugfs_show_options,
253}; 176};
254 177
178static struct vfsmount *debugfs_automount(struct path *path)
179{
180 struct vfsmount *(*f)(void *);
181 f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
182 return f(path->dentry->d_inode->i_private);
183}
184
185static const struct dentry_operations debugfs_dops = {
186 .d_delete = always_delete_dentry,
187 .d_automount = debugfs_automount,
188};
189
255static int debug_fill_super(struct super_block *sb, void *data, int silent) 190static int debug_fill_super(struct super_block *sb, void *data, int silent)
256{ 191{
257 static struct tree_descr debug_files[] = {{""}}; 192 static struct tree_descr debug_files[] = {{""}};
@@ -276,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
276 goto fail; 211 goto fail;
277 212
278 sb->s_op = &debugfs_super_operations; 213 sb->s_op = &debugfs_super_operations;
214 sb->s_d_op = &debugfs_dops;
279 215
280 debugfs_apply_options(sb); 216 debugfs_apply_options(sb);
281 217
@@ -302,11 +238,9 @@ static struct file_system_type debug_fs_type = {
302}; 238};
303MODULE_ALIAS_FS("debugfs"); 239MODULE_ALIAS_FS("debugfs");
304 240
305static struct dentry *__create_file(const char *name, umode_t mode, 241static struct dentry *start_creating(const char *name, struct dentry *parent)
306 struct dentry *parent, void *data,
307 const struct file_operations *fops)
308{ 242{
309 struct dentry *dentry = NULL; 243 struct dentry *dentry;
310 int error; 244 int error;
311 245
312 pr_debug("debugfs: creating file '%s'\n",name); 246 pr_debug("debugfs: creating file '%s'\n",name);
@@ -314,7 +248,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
314 error = simple_pin_fs(&debug_fs_type, &debugfs_mount, 248 error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
315 &debugfs_mount_count); 249 &debugfs_mount_count);
316 if (error) 250 if (error)
317 goto exit; 251 return ERR_PTR(error);
318 252
319 /* If the parent is not specified, we create it in the root. 253 /* If the parent is not specified, we create it in the root.
320 * We need the root dentry to do this, which is in the super 254 * We need the root dentry to do this, which is in the super
@@ -326,31 +260,26 @@ static struct dentry *__create_file(const char *name, umode_t mode,
326 260
327 mutex_lock(&parent->d_inode->i_mutex); 261 mutex_lock(&parent->d_inode->i_mutex);
328 dentry = lookup_one_len(name, parent, strlen(name)); 262 dentry = lookup_one_len(name, parent, strlen(name));
329 if (!IS_ERR(dentry)) { 263 if (!IS_ERR(dentry) && dentry->d_inode) {
330 switch (mode & S_IFMT) {
331 case S_IFDIR:
332 error = debugfs_mkdir(parent->d_inode, dentry, mode);
333
334 break;
335 case S_IFLNK:
336 error = debugfs_link(parent->d_inode, dentry, mode,
337 data);
338 break;
339 default:
340 error = debugfs_create(parent->d_inode, dentry, mode,
341 data, fops);
342 break;
343 }
344 dput(dentry); 264 dput(dentry);
345 } else 265 dentry = ERR_PTR(-EEXIST);
346 error = PTR_ERR(dentry);
347 mutex_unlock(&parent->d_inode->i_mutex);
348
349 if (error) {
350 dentry = NULL;
351 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
352 } 266 }
353exit: 267 if (IS_ERR(dentry))
268 mutex_unlock(&parent->d_inode->i_mutex);
269 return dentry;
270}
271
272static struct dentry *failed_creating(struct dentry *dentry)
273{
274 mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
275 dput(dentry);
276 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
277 return NULL;
278}
279
280static struct dentry *end_creating(struct dentry *dentry)
281{
282 mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
354 return dentry; 283 return dentry;
355} 284}
356 285
@@ -384,19 +313,71 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
384 struct dentry *parent, void *data, 313 struct dentry *parent, void *data,
385 const struct file_operations *fops) 314 const struct file_operations *fops)
386{ 315{
387 switch (mode & S_IFMT) { 316 struct dentry *dentry;
388 case S_IFREG: 317 struct inode *inode;
389 case 0: 318
390 break; 319 if (!(mode & S_IFMT))
391 default: 320 mode |= S_IFREG;
392 BUG(); 321 BUG_ON(!S_ISREG(mode));
393 } 322 dentry = start_creating(name, parent);
323
324 if (IS_ERR(dentry))
325 return NULL;
394 326
395 return __create_file(name, mode, parent, data, fops); 327 inode = debugfs_get_inode(dentry->d_sb);
328 if (unlikely(!inode))
329 return failed_creating(dentry);
330
331 inode->i_mode = mode;
332 inode->i_fop = fops ? fops : &debugfs_file_operations;
333 inode->i_private = data;
334 d_instantiate(dentry, inode);
335 fsnotify_create(dentry->d_parent->d_inode, dentry);
336 return end_creating(dentry);
396} 337}
397EXPORT_SYMBOL_GPL(debugfs_create_file); 338EXPORT_SYMBOL_GPL(debugfs_create_file);
398 339
399/** 340/**
341 * debugfs_create_file_size - create a file in the debugfs filesystem
342 * @name: a pointer to a string containing the name of the file to create.
343 * @mode: the permission that the file should have.
344 * @parent: a pointer to the parent dentry for this file. This should be a
345 * directory dentry if set. If this parameter is NULL, then the
346 * file will be created in the root of the debugfs filesystem.
347 * @data: a pointer to something that the caller will want to get to later
348 * on. The inode.i_private pointer will point to this value on
349 * the open() call.
350 * @fops: a pointer to a struct file_operations that should be used for
351 * this file.
352 * @file_size: initial file size
353 *
354 * This is the basic "create a file" function for debugfs. It allows for a
355 * wide range of flexibility in creating a file, or a directory (if you want
356 * to create a directory, the debugfs_create_dir() function should
357 * be used instead.)
358 *
359 * This function will return a pointer to a dentry if it succeeds. This
360 * pointer must be passed to the debugfs_remove() function when the file is
361 * to be removed (no automatic cleanup happens if your module is unloaded,
362 * you are responsible here.) If an error occurs, %NULL will be returned.
363 *
364 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
365 * returned.
366 */
367struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
368 struct dentry *parent, void *data,
369 const struct file_operations *fops,
370 loff_t file_size)
371{
372 struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
373
374 if (de)
375 de->d_inode->i_size = file_size;
376 return de;
377}
378EXPORT_SYMBOL_GPL(debugfs_create_file_size);
379
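
A hedged sketch of a caller for the new helper: a module creates a directory and a fixed-size, read-only file backed by a static buffer. debugfs_create_dir(), simple_read_from_buffer() and debugfs_remove_recursive() are existing kernel APIs; everything named mydrv_* is hypothetical.

	#include <linux/module.h>
	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/uaccess.h>

	static char mydrv_regs[64];		/* hypothetical register snapshot */
	static struct dentry *mydrv_dir;

	static ssize_t mydrv_read(struct file *file, char __user *buf,
				  size_t count, loff_t *ppos)
	{
		return simple_read_from_buffer(buf, count, ppos, mydrv_regs,
					       sizeof(mydrv_regs));
	}

	static const struct file_operations mydrv_fops = {
		.owner = THIS_MODULE,
		.read  = mydrv_read,
	};

	static int __init mydrv_init(void)
	{
		mydrv_dir = debugfs_create_dir("mydrv", NULL);
		if (!mydrv_dir)
			return -ENODEV;
		/* i_size is set up front, so stat() reports 64 bytes
		 * without a read having to run first. */
		debugfs_create_file_size("regs", 0444, mydrv_dir, NULL,
					 &mydrv_fops, sizeof(mydrv_regs));
		return 0;
	}

	static void __exit mydrv_exit(void)
	{
		debugfs_remove_recursive(mydrv_dir);
	}

	module_init(mydrv_init);
	module_exit(mydrv_exit);
	MODULE_LICENSE("GPL");
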
380/**
400 * debugfs_create_dir - create a directory in the debugfs filesystem 381 * debugfs_create_dir - create a directory in the debugfs filesystem
401 * @name: a pointer to a string containing the name of the directory to 382 * @name: a pointer to a string containing the name of the directory to
402 * create. 383 * create.
@@ -416,12 +397,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
416 */ 397 */
417struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) 398struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
418{ 399{
419 return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 400 struct dentry *dentry = start_creating(name, parent);
420 parent, NULL, NULL); 401 struct inode *inode;
402
403 if (IS_ERR(dentry))
404 return NULL;
405
406 inode = debugfs_get_inode(dentry->d_sb);
407 if (unlikely(!inode))
408 return failed_creating(dentry);
409
410 inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
411 inode->i_op = &simple_dir_inode_operations;
412 inode->i_fop = &simple_dir_operations;
413
414 /* directory inodes start off with i_nlink == 2 (for "." entry) */
415 inc_nlink(inode);
416 d_instantiate(dentry, inode);
417 inc_nlink(dentry->d_parent->d_inode);
418 fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
419 return end_creating(dentry);
421} 420}
422EXPORT_SYMBOL_GPL(debugfs_create_dir); 421EXPORT_SYMBOL_GPL(debugfs_create_dir);
423 422
424/** 423/**
424 * debugfs_create_automount - create automount point in the debugfs filesystem
425 * @name: a pointer to a string containing the name of the file to create.
426 * @parent: a pointer to the parent dentry for this file. This should be a
427 * directory dentry if set. If this parameter is NULL, then the
428 * file will be created in the root of the debugfs filesystem.
429 * @f: function to be called when pathname resolution steps on that one.
430 * @data: opaque argument to pass to f().
431 *
432 * @f should return what ->d_automount() would.
433 */
434struct dentry *debugfs_create_automount(const char *name,
435 struct dentry *parent,
436 struct vfsmount *(*f)(void *),
437 void *data)
438{
439 struct dentry *dentry = start_creating(name, parent);
440 struct inode *inode;
441
442 if (IS_ERR(dentry))
443 return NULL;
444
445 inode = debugfs_get_inode(dentry->d_sb);
446 if (unlikely(!inode))
447 return failed_creating(dentry);
448
449 inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
450 inode->i_flags |= S_AUTOMOUNT;
451 inode->i_private = data;
452 dentry->d_fsdata = (void *)f;
453 d_instantiate(dentry, inode);
454 return end_creating(dentry);
455}
456EXPORT_SYMBOL(debugfs_create_automount);
457
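
A hedged sketch of a debugfs_create_automount() caller: the callback hands back what ->d_automount() would whenever path resolution crosses the point. mydrv_get_mount() is a hypothetical stand-in for however the subsystem actually produces its vfsmount.

	/* Hypothetical: the subsystem helper that produces the mount. */
	extern struct vfsmount *mydrv_get_mount(void *data);

	static struct vfsmount *mydrv_automount(void *data)
	{
		/* Return what ->d_automount() would (see the kerneldoc above). */
		return mydrv_get_mount(data);
	}

	static int __init mydrv_automount_init(void)
	{
		struct dentry *de;

		de = debugfs_create_automount("instances", NULL,
					      mydrv_automount, NULL);
		return de ? 0 : -ENODEV;
	}
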
458/**
425 * debugfs_create_symlink- create a symbolic link in the debugfs filesystem 459 * debugfs_create_symlink- create a symbolic link in the debugfs filesystem
426 * @name: a pointer to a string containing the name of the symbolic link to 460 * @name: a pointer to a string containing the name of the symbolic link to
427 * create. 461 * create.
@@ -447,17 +481,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
447struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, 481struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
448 const char *target) 482 const char *target)
449{ 483{
450 struct dentry *result; 484 struct dentry *dentry;
451 char *link; 485 struct inode *inode;
452 486 char *link = kstrdup(target, GFP_KERNEL);
453 link = kstrdup(target, GFP_KERNEL);
454 if (!link) 487 if (!link)
455 return NULL; 488 return NULL;
456 489
457 result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL); 490 dentry = start_creating(name, parent);
458 if (!result) 491 if (IS_ERR(dentry)) {
459 kfree(link); 492 kfree(link);
460 return result; 493 return NULL;
494 }
495
496 inode = debugfs_get_inode(dentry->d_sb);
497 if (unlikely(!inode)) {
498 kfree(link);
499 return failed_creating(dentry);
500 }
501 inode->i_mode = S_IFLNK | S_IRWXUGO;
502 inode->i_op = &debugfs_link_operations;
503 inode->i_private = link;
504 d_instantiate(dentry, inode);
505 return end_creating(dentry);
461} 506}
462EXPORT_SYMBOL_GPL(debugfs_create_symlink); 507EXPORT_SYMBOL_GPL(debugfs_create_symlink);
463 508
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7cfbaf8d0e2..1e6e227134d7 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb)
56{ 56{
57 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 57 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
58 void *data = genlmsg_data(genlhdr); 58 void *data = genlmsg_data(genlhdr);
59 int rv;
60 59
61 rv = genlmsg_end(skb, data); 60 genlmsg_end(skb, data);
62 if (rv < 0) {
63 nlmsg_free(skb);
64 return rv;
65 }
66 61
67 return genlmsg_unicast(&init_net, skb, listener_nlportid); 62 return genlmsg_unicast(&init_net, skb, listener_nlportid);
68} 63}
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2bc2c87f35e7..5718cb9f7273 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
37 iput(toput_inode); 37 iput(toput_inode);
38} 38}
39 39
40static void drop_slab(void)
41{
42 int nr_objects;
43
44 do {
45 int nid;
46
47 nr_objects = 0;
48 for_each_online_node(nid)
49 nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
50 1000, 1000);
51 } while (nr_objects > 10);
52}
53
54int drop_caches_sysctl_handler(struct ctl_table *table, int write, 40int drop_caches_sysctl_handler(struct ctl_table *table, int write,
55 void __user *buffer, size_t *length, loff_t *ppos) 41 void __user *buffer, size_t *length, loff_t *ppos)
56{ 42{
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
67 inode->i_ino = lower_inode->i_ino; 67 inode->i_ino = lower_inode->i_ino;
68 inode->i_version++; 68 inode->i_version++;
69 inode->i_mapping->a_ops = &ecryptfs_aops; 69 inode->i_mapping->a_ops = &ecryptfs_aops;
70 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
71 70
72 if (S_ISLNK(inode->i_mode)) 71 if (S_ISLNK(inode->i_mode))
73 inode->i_op = &ecryptfs_symlink_iops; 72 inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
520 goto out; 520 goto out;
521 } 521 }
522 522
523 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); 523 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
524 if (rc) 524 if (rc)
525 goto out1; 525 goto out1;
526 526
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
index 367bbb10c543..c2499ef174a2 100644
--- a/fs/efivarfs/Kconfig
+++ b/fs/efivarfs/Kconfig
@@ -1,6 +1,7 @@
1config EFIVAR_FS 1config EFIVAR_FS
2 tristate "EFI Variable filesystem" 2 tristate "EFI Variable filesystem"
3 depends on EFI 3 depends on EFI
4 default m
4 help 5 help
5 efivarfs is a replacement filesystem for the old EFI 6 efivarfs is a replacement filesystem for the old EFI
6 variable support via sysfs, as it doesn't suffer from the 7 variable support via sysfs, as it doesn't suffer from the
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6dad1176ec52..ddbce42548c9 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
140 140
141 name[len] = '-'; 141 name[len] = '-';
142 142
143 efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); 143 efi_guid_to_str(&entry->var.VendorGuid, name + len + 1);
144 144
145 name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; 145 name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
146 146
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 4b0a226024fa..8d0c0df01854 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -118,18 +118,18 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
118{ 118{
119 struct eventfd_ctx *ctx = file->private_data; 119 struct eventfd_ctx *ctx = file->private_data;
120 unsigned int events = 0; 120 unsigned int events = 0;
121 unsigned long flags; 121 u64 count;
122 122
123 poll_wait(file, &ctx->wqh, wait); 123 poll_wait(file, &ctx->wqh, wait);
124 smp_rmb();
125 count = ctx->count;
124 126
125 spin_lock_irqsave(&ctx->wqh.lock, flags); 127 if (count > 0)
126 if (ctx->count > 0)
127 events |= POLLIN; 128 events |= POLLIN;
128 if (ctx->count == ULLONG_MAX) 129 if (count == ULLONG_MAX)
129 events |= POLLERR; 130 events |= POLLERR;
130 if (ULLONG_MAX - 1 > ctx->count) 131 if (ULLONG_MAX - 1 > count)
131 events |= POLLOUT; 132 events |= POLLOUT;
132 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
133 133
134 return events; 134 return events;
135} 135}
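
The semantics made lockless above are visible from user space: POLLIN once the counter is non-zero, POLLOUT while another write could still succeed, and POLLERR only at the ULLONG_MAX overflow value. A small self-contained demo, assuming Linux's eventfd(2):

	#include <sys/eventfd.h>
	#include <poll.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		uint64_t v = 3;
		struct pollfd pfd;

		if (efd < 0)
			return 1;
		if (write(efd, &v, sizeof(v)) != sizeof(v))	/* counter: 0 -> 3 */
			return 1;

		pfd.fd = efd;
		pfd.events = POLLIN | POLLOUT;
		pfd.revents = 0;
		poll(&pfd, 1, 0);
		/* Expect POLLIN (count > 0) and POLLOUT (room for more). */
		printf("POLLIN=%d POLLOUT=%d\n",
		       !!(pfd.revents & POLLIN), !!(pfd.revents & POLLOUT));

		if (read(efd, &v, sizeof(v)) == sizeof(v))	/* drains to 0 */
			printf("drained %llu\n", (unsigned long long)v);
		close(efd);
		return 0;
	}
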
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d77f94491352..1e009cad8d5c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1639,9 +1639,9 @@ fetch_events:
1639 1639
1640 spin_lock_irqsave(&ep->lock, flags); 1640 spin_lock_irqsave(&ep->lock, flags);
1641 } 1641 }
1642 __remove_wait_queue(&ep->wq, &wait);
1643 1642
1644 set_current_state(TASK_RUNNING); 1643 __remove_wait_queue(&ep->wq, &wait);
1644 __set_current_state(TASK_RUNNING);
1645 } 1645 }
1646check_events: 1646check_events:
1647 /* Is it worth to try to dig for events ? */ 1647 /* Is it worth to try to dig for events ? */
diff --git a/fs/exec.c b/fs/exec.c
index ad8798e26be9..c7f9b733406d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -794,8 +794,14 @@ exit:
794 794
795struct file *open_exec(const char *name) 795struct file *open_exec(const char *name)
796{ 796{
797 struct filename tmp = { .name = name }; 797 struct filename *filename = getname_kernel(name);
798 return do_open_execat(AT_FDCWD, &tmp, 0); 798 struct file *f = ERR_CAST(filename);
799
800 if (!IS_ERR(filename)) {
801 f = do_open_execat(AT_FDCWD, filename, 0);
802 putname(filename);
803 }
804 return f;
799} 805}
800EXPORT_SYMBOL(open_exec); 806EXPORT_SYMBOL(open_exec);
801 807
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..a198e94813fe 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
985 .direct_IO = exofs_direct_IO, 985 .direct_IO = exofs_direct_IO,
986 986
987 /* With these NULL has special meaning or default is not exported */ 987 /* With these NULL has special meaning or default is not exported */
988 .get_xip_mem = NULL,
989 .migratepage = NULL, 988 .migratepage = NULL,
990 .launder_page = NULL, 989 .launder_page = NULL,
991 .is_partially_uptodate = NULL, 990 .is_partially_uptodate = NULL,
@@ -1214,7 +1213,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1214 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); 1213 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
1215 } 1214 }
1216 1215
1217 inode->i_mapping->backing_dev_info = sb->s_bdi;
1218 if (S_ISREG(inode->i_mode)) { 1216 if (S_ISREG(inode->i_mode)) {
1219 inode->i_op = &exofs_file_inode_operations; 1217 inode->i_op = &exofs_file_inode_operations;
1220 inode->i_fop = &exofs_file_operations; 1218 inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1312,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
1314 1312
1315 set_obj_2bcreated(oi); 1313 set_obj_2bcreated(oi);
1316 1314
1317 inode->i_mapping->backing_dev_info = sb->s_bdi;
1318 inode_init_owner(inode, dir, mode); 1315 inode_init_owner(inode, dir, mode);
1319 inode->i_ino = sbi->s_nextid++; 1316 inode->i_ino = sbi->s_nextid++;
1320 inode->i_blkbits = EXOFS_BLKSHIFT; 1317 inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
836 goto free_sbi; 836 goto free_sbi;
837 } 837 }
838 838
839 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); 839 ret = bdi_setup_and_register(&sbi->bdi, "exofs");
840 if (ret) { 840 if (ret) {
841 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); 841 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
842 dput(sb->s_root); 842 dput(sb->s_root);
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 14a6780fd034..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
42 42
43 If you are not using a security module that requires using 43 If you are not using a security module that requires using
44 extended attributes for file security labels, say N. 44 extended attributes for file security labels, say N.
45
46config EXT2_FS_XIP
47 bool "Ext2 execute in place support"
48 depends on EXT2_FS && MMU
49 help
50 Execute in place can be used on memory-backed block devices. If you
51 enable this option, you can select to mount block devices which are
52 capable of this feature without using the page cache.
53
54 If you do not use a block device that is capable of using this,
55 or if unsure, say N.
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index f42af45cfd88..445b0e996a12 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
10ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 10ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o 11ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
12ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o 12ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o
13ext2-$(CONFIG_EXT2_FS_XIP) += xip.o
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e4279ead4a05..678f9ab08c48 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -380,10 +380,15 @@ struct ext2_inode {
380#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ 380#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */
381#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ 381#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */
382#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ 382#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */
383#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ 383#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */
384#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ 384#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
385#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ 385#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
386#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ 386#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
387#ifdef CONFIG_FS_DAX
388#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */
389#else
390#define EXT2_MOUNT_DAX 0
391#endif
387 392
388 393
389#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt 394#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
@@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
788 int datasync); 793 int datasync);
789extern const struct inode_operations ext2_file_inode_operations; 794extern const struct inode_operations ext2_file_inode_operations;
790extern const struct file_operations ext2_file_operations; 795extern const struct file_operations ext2_file_operations;
791extern const struct file_operations ext2_xip_file_operations; 796extern const struct file_operations ext2_dax_file_operations;
792 797
793/* inode.c */ 798/* inode.c */
794extern const struct address_space_operations ext2_aops; 799extern const struct address_space_operations ext2_aops;
795extern const struct address_space_operations ext2_aops_xip;
796extern const struct address_space_operations ext2_nobh_aops; 800extern const struct address_space_operations ext2_nobh_aops;
797 801
798/* namei.c */ 802/* namei.c */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..e31701713516 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
25#include "xattr.h" 25#include "xattr.h"
26#include "acl.h" 26#include "acl.h"
27 27
28#ifdef CONFIG_FS_DAX
29static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
30{
31 return dax_fault(vma, vmf, ext2_get_block);
32}
33
34static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
35{
36 return dax_mkwrite(vma, vmf, ext2_get_block);
37}
38
39static const struct vm_operations_struct ext2_dax_vm_ops = {
40 .fault = ext2_dax_fault,
41 .page_mkwrite = ext2_dax_mkwrite,
42};
43
44static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
45{
46 if (!IS_DAX(file_inode(file)))
47 return generic_file_mmap(file, vma);
48
49 file_accessed(file);
50 vma->vm_ops = &ext2_dax_vm_ops;
51 vma->vm_flags |= VM_MIXEDMAP;
52 return 0;
53}
54#else
55#define ext2_file_mmap generic_file_mmap
56#endif
57
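
From user space, a DAX mapping is set up exactly like any other file mapping; the difference is that, on an ext2 filesystem mounted with the dax option, faults in the mapping go through ext2_dax_fault() straight to the device rather than through the page cache. A minimal sketch using plain POSIX calls, assuming a hypothetical path to an existing file at least one page long:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical file on an ext2 fs mounted with -o dax. */
		int fd = open("/mnt/dax/test", O_RDWR);
		if (fd < 0)
			return 1;

		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;

		/* On a DAX mount this store faults via ext2_dax_fault();
		 * on a normal mount it goes through the page cache. */
		strcpy(p, "hello");
		msync(p, 4096, MS_SYNC);
		munmap(p, 4096);
		close(fd);
		return 0;
	}
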
28/* 58/*
29 * Called when filp is released. This happens when all file descriptors 59 * Called when filp is released. This happens when all file descriptors
30 * for a single struct file are closed. Note that different open() calls 60 * for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
70#ifdef CONFIG_COMPAT 100#ifdef CONFIG_COMPAT
71 .compat_ioctl = ext2_compat_ioctl, 101 .compat_ioctl = ext2_compat_ioctl,
72#endif 102#endif
73 .mmap = generic_file_mmap, 103 .mmap = ext2_file_mmap,
74 .open = dquot_file_open, 104 .open = dquot_file_open,
75 .release = ext2_release_file, 105 .release = ext2_release_file,
76 .fsync = ext2_fsync, 106 .fsync = ext2_fsync,
@@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = {
78 .splice_write = iter_file_splice_write, 108 .splice_write = iter_file_splice_write,
79}; 109};
80 110
81#ifdef CONFIG_EXT2_FS_XIP 111#ifdef CONFIG_FS_DAX
82const struct file_operations ext2_xip_file_operations = { 112const struct file_operations ext2_dax_file_operations = {
83 .llseek = generic_file_llseek, 113 .llseek = generic_file_llseek,
84 .read = xip_file_read, 114 .read = new_sync_read,
85 .write = xip_file_write, 115 .write = new_sync_write,
116 .read_iter = generic_file_read_iter,
117 .write_iter = generic_file_write_iter,
86 .unlocked_ioctl = ext2_ioctl, 118 .unlocked_ioctl = ext2_ioctl,
87#ifdef CONFIG_COMPAT 119#ifdef CONFIG_COMPAT
88 .compat_ioctl = ext2_compat_ioctl, 120 .compat_ioctl = ext2_compat_ioctl,
89#endif 121#endif
90 .mmap = xip_file_mmap, 122 .mmap = ext2_file_mmap,
91 .open = dquot_file_open, 123 .open = dquot_file_open,
92 .release = ext2_release_file, 124 .release = ext2_release_file,
93 .fsync = ext2_fsync, 125 .fsync = ext2_fsync,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
170 struct ext2_group_desc * gdp; 170 struct ext2_group_desc * gdp;
171 struct backing_dev_info *bdi; 171 struct backing_dev_info *bdi;
172 172
173 bdi = inode->i_mapping->backing_dev_info; 173 bdi = inode_to_bdi(inode);
174 if (bdi_read_congested(bdi)) 174 if (bdi_read_congested(bdi))
175 return; 175 return;
176 if (bdi_write_congested(bdi)) 176 if (bdi_write_congested(bdi))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36d35c36311d..6434bc000125 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -34,7 +34,6 @@
34#include <linux/aio.h> 34#include <linux/aio.h>
35#include "ext2.h" 35#include "ext2.h"
36#include "acl.h" 36#include "acl.h"
37#include "xip.h"
38#include "xattr.h" 37#include "xattr.h"
39 38
40static int __ext2_write_inode(struct inode *inode, int do_sync); 39static int __ext2_write_inode(struct inode *inode, int do_sync);
@@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode,
731 goto cleanup; 730 goto cleanup;
732 } 731 }
733 732
734 if (ext2_use_xip(inode->i_sb)) { 733 if (IS_DAX(inode)) {
735 /* 734 /*
736 * we need to clear the block 735 * block must be initialised before we put it in the tree
736 * so that it's not found by another thread before it's
737 * initialised
737 */ 738 */
738 err = ext2_clear_xip_target (inode, 739 err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
739 le32_to_cpu(chain[depth-1].key)); 740 1 << inode->i_blkbits);
740 if (err) { 741 if (err) {
741 mutex_unlock(&ei->truncate_mutex); 742 mutex_unlock(&ei->truncate_mutex);
742 goto cleanup; 743 goto cleanup;
@@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
859 size_t count = iov_iter_count(iter); 860 size_t count = iov_iter_count(iter);
860 ssize_t ret; 861 ssize_t ret;
861 862
862 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); 863 if (IS_DAX(inode))
864 ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
865 NULL, DIO_LOCKING);
866 else
867 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
868 ext2_get_block);
863 if (ret < 0 && (rw & WRITE)) 869 if (ret < 0 && (rw & WRITE))
864 ext2_write_failed(mapping, offset + count); 870 ext2_write_failed(mapping, offset + count);
865 return ret; 871 return ret;
@@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = {
885 .error_remove_page = generic_error_remove_page, 891 .error_remove_page = generic_error_remove_page,
886}; 892};
887 893
888const struct address_space_operations ext2_aops_xip = {
889 .bmap = ext2_bmap,
890 .get_xip_mem = ext2_get_xip_mem,
891};
892
893const struct address_space_operations ext2_nobh_aops = { 894const struct address_space_operations ext2_nobh_aops = {
894 .readpage = ext2_readpage, 895 .readpage = ext2_readpage,
895 .readpages = ext2_readpages, 896 .readpages = ext2_readpages,
@@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
1201 1202
1202 inode_dio_wait(inode); 1203 inode_dio_wait(inode);
1203 1204
1204 if (mapping_is_xip(inode->i_mapping)) 1205 if (IS_DAX(inode))
1205 error = xip_truncate_page(inode->i_mapping, newsize); 1206 error = dax_truncate_page(inode, newsize, ext2_get_block);
1206 else if (test_opt(inode->i_sb, NOBH)) 1207 else if (test_opt(inode->i_sb, NOBH))
1207 error = nobh_truncate_page(inode->i_mapping, 1208 error = nobh_truncate_page(inode->i_mapping,
1208 newsize, ext2_get_block); 1209 newsize, ext2_get_block);
@@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode)
1273{ 1274{
1274 unsigned int flags = EXT2_I(inode)->i_flags; 1275 unsigned int flags = EXT2_I(inode)->i_flags;
1275 1276
1276 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 1277 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
1278 S_DIRSYNC | S_DAX);
1277 if (flags & EXT2_SYNC_FL) 1279 if (flags & EXT2_SYNC_FL)
1278 inode->i_flags |= S_SYNC; 1280 inode->i_flags |= S_SYNC;
1279 if (flags & EXT2_APPEND_FL) 1281 if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode)
1284 inode->i_flags |= S_NOATIME; 1286 inode->i_flags |= S_NOATIME;
1285 if (flags & EXT2_DIRSYNC_FL) 1287 if (flags & EXT2_DIRSYNC_FL)
1286 inode->i_flags |= S_DIRSYNC; 1288 inode->i_flags |= S_DIRSYNC;
1289 if (test_opt(inode->i_sb, DAX))
1290 inode->i_flags |= S_DAX;
1287} 1291}
1288 1292
1289/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ 1293/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1384 1388
1385 if (S_ISREG(inode->i_mode)) { 1389 if (S_ISREG(inode->i_mode)) {
1386 inode->i_op = &ext2_file_inode_operations; 1390 inode->i_op = &ext2_file_inode_operations;
1387 if (ext2_use_xip(inode->i_sb)) { 1391 if (test_opt(inode->i_sb, DAX)) {
1388 inode->i_mapping->a_ops = &ext2_aops_xip; 1392 inode->i_mapping->a_ops = &ext2_aops;
1389 inode->i_fop = &ext2_xip_file_operations; 1393 inode->i_fop = &ext2_dax_file_operations;
1390 } else if (test_opt(inode->i_sb, NOBH)) { 1394 } else if (test_opt(inode->i_sb, NOBH)) {
1391 inode->i_mapping->a_ops = &ext2_nobh_aops; 1395 inode->i_mapping->a_ops = &ext2_nobh_aops;
1392 inode->i_fop = &ext2_file_operations; 1396 inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c268d0af1db9..148f6e3789ea 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -35,7 +35,6 @@
35#include "ext2.h" 35#include "ext2.h"
36#include "xattr.h" 36#include "xattr.h"
37#include "acl.h" 37#include "acl.h"
38#include "xip.h"
39 38
40static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) 39static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
41{ 40{
@@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
105 return PTR_ERR(inode); 104 return PTR_ERR(inode);
106 105
107 inode->i_op = &ext2_file_inode_operations; 106 inode->i_op = &ext2_file_inode_operations;
108 if (ext2_use_xip(inode->i_sb)) { 107 if (test_opt(inode->i_sb, DAX)) {
109 inode->i_mapping->a_ops = &ext2_aops_xip; 108 inode->i_mapping->a_ops = &ext2_aops;
110 inode->i_fop = &ext2_xip_file_operations; 109 inode->i_fop = &ext2_dax_file_operations;
111 } else if (test_opt(inode->i_sb, NOBH)) { 110 } else if (test_opt(inode->i_sb, NOBH)) {
112 inode->i_mapping->a_ops = &ext2_nobh_aops; 111 inode->i_mapping->a_ops = &ext2_nobh_aops;
113 inode->i_fop = &ext2_file_operations; 112 inode->i_fop = &ext2_file_operations;
@@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
126 return PTR_ERR(inode); 125 return PTR_ERR(inode);
127 126
128 inode->i_op = &ext2_file_inode_operations; 127 inode->i_op = &ext2_file_inode_operations;
129 if (ext2_use_xip(inode->i_sb)) { 128 if (test_opt(inode->i_sb, DAX)) {
130 inode->i_mapping->a_ops = &ext2_aops_xip; 129 inode->i_mapping->a_ops = &ext2_aops;
131 inode->i_fop = &ext2_xip_file_operations; 130 inode->i_fop = &ext2_dax_file_operations;
132 } else if (test_opt(inode->i_sb, NOBH)) { 131 } else if (test_opt(inode->i_sb, NOBH)) {
133 inode->i_mapping->a_ops = &ext2_nobh_aops; 132 inode->i_mapping->a_ops = &ext2_nobh_aops;
134 inode->i_fop = &ext2_file_operations; 133 inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ae55fddc26a9..d0e746e96511 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -35,7 +35,6 @@
35#include "ext2.h" 35#include "ext2.h"
36#include "xattr.h" 36#include "xattr.h"
37#include "acl.h" 37#include "acl.h"
38#include "xip.h"
39 38
40static void ext2_sync_super(struct super_block *sb, 39static void ext2_sync_super(struct super_block *sb,
41 struct ext2_super_block *es, int wait); 40 struct ext2_super_block *es, int wait);
@@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
292 seq_puts(seq, ",grpquota"); 291 seq_puts(seq, ",grpquota");
293#endif 292#endif
294 293
295#if defined(CONFIG_EXT2_FS_XIP) 294#ifdef CONFIG_FS_DAX
296 if (sbi->s_mount_opt & EXT2_MOUNT_XIP) 295 if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
297 seq_puts(seq, ",xip"); 296 seq_puts(seq, ",xip");
297 if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
298 seq_puts(seq, ",dax");
298#endif 299#endif
299 300
300 if (!test_opt(sb, RESERVATION)) 301 if (!test_opt(sb, RESERVATION))
@@ -403,7 +404,7 @@ enum {
403 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, 404 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
404 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, 405 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
405 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, 406 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
406 Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, 407 Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
407 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation 408 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
408}; 409};
409 410
@@ -432,6 +433,7 @@ static const match_table_t tokens = {
432 {Opt_acl, "acl"}, 433 {Opt_acl, "acl"},
433 {Opt_noacl, "noacl"}, 434 {Opt_noacl, "noacl"},
434 {Opt_xip, "xip"}, 435 {Opt_xip, "xip"},
436 {Opt_dax, "dax"},
435 {Opt_grpquota, "grpquota"}, 437 {Opt_grpquota, "grpquota"},
436 {Opt_ignore, "noquota"}, 438 {Opt_ignore, "noquota"},
437 {Opt_quota, "quota"}, 439 {Opt_quota, "quota"},
@@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb)
559 break; 561 break;
560#endif 562#endif
561 case Opt_xip: 563 case Opt_xip:
562#ifdef CONFIG_EXT2_FS_XIP 564 ext2_msg(sb, KERN_INFO, "use dax instead of xip");
563 set_opt (sbi->s_mount_opt, XIP); 565 set_opt(sbi->s_mount_opt, XIP);
566 /* Fall through */
567 case Opt_dax:
568#ifdef CONFIG_FS_DAX
569 set_opt(sbi->s_mount_opt, DAX);
564#else 570#else
565 ext2_msg(sb, KERN_INFO, "xip option not supported"); 571 ext2_msg(sb, KERN_INFO, "dax option not supported");
566#endif 572#endif
567 break; 573 break;
568 574
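
The xip handling above is the standard deprecated-alias pattern: the old token logs a notice and deliberately falls through to the new one, so "-o xip" keeps working while steering users toward "-o dax". A toy user-space miniature of the same alias-and-fall-through idea (nothing here is kernel code):

	#include <stdio.h>
	#include <string.h>

	#define OPT_DAX 0x1u

	static int parse_opt(const char *tok, unsigned int *flags)
	{
		if (!strcmp(tok, "xip")) {
			fprintf(stderr, "note: use dax instead of xip\n");
			tok = "dax";	/* fall through to the new name */
		}
		if (!strcmp(tok, "dax")) {
			*flags |= OPT_DAX;
			return 0;
		}
		return -1;
	}

	int main(void)
	{
		unsigned int flags = 0;

		parse_opt("xip", &flags);	/* old spelling still works */
		printf("flags=%#x\n", flags);
		return 0;
	}
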
@@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
877 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 883 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
878 MS_POSIXACL : 0); 884 MS_POSIXACL : 0);
879 885
880 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
881 EXT2_MOUNT_XIP if not */
882
883 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && 886 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
884 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || 887 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
885 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 888 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
909 912
910 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 913 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
911 914
912 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { 915 if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
913 if (!silent) 916 if (blocksize != PAGE_SIZE) {
914 ext2_msg(sb, KERN_ERR, 917 ext2_msg(sb, KERN_ERR,
915 "error: unsupported blocksize for xip"); 918 "error: unsupported blocksize for dax");
916 goto failed_mount; 919 goto failed_mount;
920 }
921 if (!sb->s_bdev->bd_disk->fops->direct_access) {
922 ext2_msg(sb, KERN_ERR,
923 "error: device does not support dax");
924 goto failed_mount;
925 }
917 } 926 }
918 927
919 /* If the blocksize doesn't match, re-read the thing.. */ 928 /* If the blocksize doesn't match, re-read the thing.. */
@@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1259{ 1268{
1260 struct ext2_sb_info * sbi = EXT2_SB(sb); 1269 struct ext2_sb_info * sbi = EXT2_SB(sb);
1261 struct ext2_super_block * es; 1270 struct ext2_super_block * es;
1262 unsigned long old_mount_opt = sbi->s_mount_opt;
1263 struct ext2_mount_options old_opts; 1271 struct ext2_mount_options old_opts;
1264 unsigned long old_sb_flags; 1272 unsigned long old_sb_flags;
1265 int err; 1273 int err;
@@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1284 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1292 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1285 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1293 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1286 1294
1287 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
1288 EXT2_MOUNT_XIP if not */
1289
1290 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
1291 ext2_msg(sb, KERN_WARNING,
1292 "warning: unsupported blocksize for xip");
1293 err = -EINVAL;
1294 goto restore_opts;
1295 }
1296
1297 es = sbi->s_es; 1295 es = sbi->s_es;
1298 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { 1296 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
1299 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1297 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1300 "xip flag with busy inodes while remounting"); 1298 "dax flag with busy inodes while remounting");
1301 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1299 sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
1302 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1303 } 1300 }
1304 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1301 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1305 spin_unlock(&sbi->s_lock); 1302 spin_unlock(&sbi->s_lock);
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
deleted file mode 100644
index e98171a11cfe..000000000000
--- a/fs/ext2/xip.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/*
2 * linux/fs/ext2/xip.c
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#include <linux/mm.h>
9#include <linux/fs.h>
10#include <linux/genhd.h>
11#include <linux/buffer_head.h>
12#include <linux/blkdev.h>
13#include "ext2.h"
14#include "xip.h"
15
16static inline int
17__inode_direct_access(struct inode *inode, sector_t block,
18 void **kaddr, unsigned long *pfn)
19{
20 struct block_device *bdev = inode->i_sb->s_bdev;
21 const struct block_device_operations *ops = bdev->bd_disk->fops;
22 sector_t sector;
23
24 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
25
26 BUG_ON(!ops->direct_access);
27 return ops->direct_access(bdev, sector, kaddr, pfn);
28}
29
30static inline int
31__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
32 sector_t *result)
33{
34 struct buffer_head tmp;
35 int rc;
36
37 memset(&tmp, 0, sizeof(struct buffer_head));
38 tmp.b_size = 1 << inode->i_blkbits;
39 rc = ext2_get_block(inode, pgoff, &tmp, create);
40 *result = tmp.b_blocknr;
41
42 /* did we get a sparse block (hole in the file)? */
43 if (!tmp.b_blocknr && !rc) {
44 BUG_ON(create);
45 rc = -ENODATA;
46 }
47
48 return rc;
49}
50
51int
52ext2_clear_xip_target(struct inode *inode, sector_t block)
53{
54 void *kaddr;
55 unsigned long pfn;
56 int rc;
57
58 rc = __inode_direct_access(inode, block, &kaddr, &pfn);
59 if (!rc)
60 clear_page(kaddr);
61 return rc;
62}
63
64void ext2_xip_verify_sb(struct super_block *sb)
65{
66 struct ext2_sb_info *sbi = EXT2_SB(sb);
67
68 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
69 !sb->s_bdev->bd_disk->fops->direct_access) {
70 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
71 ext2_msg(sb, KERN_WARNING,
72 "warning: ignoring xip option - "
73 "not supported by bdev");
74 }
75}
76
77int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
78 void **kmem, unsigned long *pfn)
79{
80 int rc;
81 sector_t block;
82
83 /* first, retrieve the sector number */
84 rc = __ext2_get_block(mapping->host, pgoff, create, &block);
85 if (rc)
86 return rc;
87
88 /* retrieve address of the target data */
89 rc = __inode_direct_access(mapping->host, block, kmem, pfn);
90 return rc;
91}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
deleted file mode 100644
index 18b34d2f31b3..000000000000
--- a/fs/ext2/xip.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2 * linux/fs/ext2/xip.h
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#ifdef CONFIG_EXT2_FS_XIP
9extern void ext2_xip_verify_sb (struct super_block *);
10extern int ext2_clear_xip_target (struct inode *, sector_t);
11
12static inline int ext2_use_xip (struct super_block *sb)
13{
14 struct ext2_sb_info *sbi = EXT2_SB(sb);
15 return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
16}
17int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
18 void **, unsigned long *);
19#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
20#else
21#define mapping_is_xip(map) 0
22#define ext2_xip_verify_sb(sb) do { } while (0)
23#define ext2_use_xip(sb) 0
24#define ext2_clear_xip_target(inode, chain) 0
25#define ext2_get_xip_mem NULL
26#endif
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9b4e7d750d4f..d4dbf3c259b3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static void ext3_put_super (struct super_block * sb)
466 } 466 }
467 sb->s_fs_info = NULL; 467 sb->s_fs_info = NULL;
468 kfree(sbi->s_blockgroup_lock); 468 kfree(sbi->s_blockgroup_lock);
469 mutex_destroy(&sbi->s_orphan_lock);
470 mutex_destroy(&sbi->s_resize_lock);
469 kfree(sbi); 471 kfree(sbi);
470} 472}
471 473
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a75fba67bb1f..982d934fd9ac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -965,6 +965,11 @@ struct ext4_inode_info {
965#define EXT4_MOUNT_ERRORS_MASK 0x00070 965#define EXT4_MOUNT_ERRORS_MASK 0x00070
966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ 966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ 967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
968#ifdef CONFIG_FS_DAX
969#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
970#else
971#define EXT4_MOUNT_DAX 0
972#endif
968#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ 973#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
969#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ 974#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
970#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ 975#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations;
2578/* file.c */ 2583/* file.c */
2579extern const struct inode_operations ext4_file_inode_operations; 2584extern const struct inode_operations ext4_file_inode_operations;
2580extern const struct file_operations ext4_file_operations; 2585extern const struct file_operations ext4_file_operations;
2586extern const struct file_operations ext4_dax_file_operations;
2581extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); 2587extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
2582 2588
2583/* inline.c */ 2589/* inline.c */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8c0af3..33a09da16c9c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
95 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
96 struct mutex *aio_mutex = NULL; 96 struct mutex *aio_mutex = NULL;
97 struct blk_plug plug; 97 struct blk_plug plug;
98 int o_direct = file->f_flags & O_DIRECT; 98 int o_direct = io_is_direct(file);
99 int overwrite = 0; 99 int overwrite = 0;
100 size_t length = iov_iter_count(from); 100 size_t length = iov_iter_count(from);
101 ssize_t ret; 101 ssize_t ret;
@@ -191,17 +191,41 @@ errout:
191 return ret; 191 return ret;
192} 192}
193 193
194#ifdef CONFIG_FS_DAX
195static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
196{
197 return dax_fault(vma, vmf, ext4_get_block);
198 /* Is this the right get_block? */
199}
200
201static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
202{
203 return dax_mkwrite(vma, vmf, ext4_get_block);
204}
205
206static const struct vm_operations_struct ext4_dax_vm_ops = {
207 .fault = ext4_dax_fault,
208 .page_mkwrite = ext4_dax_mkwrite,
209};
210#else
211#define ext4_dax_vm_ops ext4_file_vm_ops
212#endif
213
194static const struct vm_operations_struct ext4_file_vm_ops = { 214static const struct vm_operations_struct ext4_file_vm_ops = {
195 .fault = filemap_fault, 215 .fault = filemap_fault,
196 .map_pages = filemap_map_pages, 216 .map_pages = filemap_map_pages,
197 .page_mkwrite = ext4_page_mkwrite, 217 .page_mkwrite = ext4_page_mkwrite,
198 .remap_pages = generic_file_remap_pages,
199}; 218};
200 219
201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 220static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
202{ 221{
203 file_accessed(file); 222 file_accessed(file);
204 vma->vm_ops = &ext4_file_vm_ops; 223 if (IS_DAX(file_inode(file))) {
224 vma->vm_ops = &ext4_dax_vm_ops;
225 vma->vm_flags |= VM_MIXEDMAP;
226 } else {
227 vma->vm_ops = &ext4_file_vm_ops;
228 }
205 return 0; 229 return 0;
206} 230}
207 231
@@ -600,6 +624,26 @@ const struct file_operations ext4_file_operations = {
600 .fallocate = ext4_fallocate, 624 .fallocate = ext4_fallocate,
601}; 625};
602 626
627#ifdef CONFIG_FS_DAX
628const struct file_operations ext4_dax_file_operations = {
629 .llseek = ext4_llseek,
630 .read = new_sync_read,
631 .write = new_sync_write,
632 .read_iter = generic_file_read_iter,
633 .write_iter = ext4_file_write_iter,
634 .unlocked_ioctl = ext4_ioctl,
635#ifdef CONFIG_COMPAT
636 .compat_ioctl = ext4_compat_ioctl,
637#endif
638 .mmap = ext4_file_mmap,
639 .open = ext4_file_open,
640 .release = ext4_release_file,
641 .fsync = ext4_sync_file,
642 /* Splice not yet supported with DAX */
643 .fallocate = ext4_fallocate,
644};
645#endif
646
603const struct inode_operations ext4_file_inode_operations = { 647const struct inode_operations ext4_file_inode_operations = {
604 .setattr = ext4_setattr, 648 .setattr = ext4_setattr,
605 .getattr = ext4_getattr, 649 .getattr = ext4_getattr,
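The file.c hunks above split mmap behavior per inode: DAX inodes get fault handlers that go through dax_fault()/dax_mkwrite() instead of the page cache, and the VMA is flagged VM_MIXEDMAP because DAX installs raw pfn mappings. A minimal userspace sketch of what this enables (the mount point and file name are hypothetical, and a DAX-capable device is assumed):

	/* Hypothetical smoke test: stores through an mmap of a file on an
	 * ext4 filesystem mounted with -o dax reach the device directly,
	 * with no page-cache copy in between. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/pmem/testfile", O_RDWR);
		if (fd < 0) { perror("open"); return 1; }

		/* the write fault below is served by ext4_dax_fault()/
		 * ext4_dax_mkwrite() rather than ext4_page_mkwrite() */
		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) { perror("mmap"); return 1; }

		memcpy(p, "hello", 5);
		munmap(p, 4096);
		close(fd);
		return 0;
	}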
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36b369697a13..6b9878a24182 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
689 inode_dio_done(inode); 689 inode_dio_done(inode);
690 goto locked; 690 goto locked;
691 } 691 }
692 ret = __blockdev_direct_IO(rw, iocb, inode, 692 if (IS_DAX(inode))
693 inode->i_sb->s_bdev, iter, offset, 693 ret = dax_do_io(rw, iocb, inode, iter, offset,
694 ext4_get_block, NULL, NULL, 0); 694 ext4_get_block, NULL, 0);
695 else
696 ret = __blockdev_direct_IO(rw, iocb, inode,
697 inode->i_sb->s_bdev, iter, offset,
698 ext4_get_block, NULL, NULL, 0);
695 inode_dio_done(inode); 699 inode_dio_done(inode);
696 } else { 700 } else {
697locked: 701locked:
698 ret = blockdev_direct_IO(rw, iocb, inode, iter, 702 if (IS_DAX(inode))
699 offset, ext4_get_block); 703 ret = dax_do_io(rw, iocb, inode, iter, offset,
704 ext4_get_block, NULL, DIO_LOCKING);
705 else
706 ret = blockdev_direct_IO(rw, iocb, inode, iter,
707 offset, ext4_get_block);
700 708
701 if (unlikely((rw & WRITE) && ret < 0)) { 709 if (unlikely((rw & WRITE) && ret < 0)) {
702 loff_t isize = i_size_read(inode); 710 loff_t isize = i_size_read(inode);
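Both direct-I/O paths in indirect.c now fork on IS_DAX(): for DAX inodes, dax_do_io() copies straight to or from the device's memory, while the classic path still builds bios via __blockdev_direct_IO(). The repeated dispatch is shaped roughly like the following helper (a sketch, not part of the patch; note the locked path in the patch actually goes through the blockdev_direct_IO() wrapper, which supplies its own flags):

	static ssize_t ext4_ind_dio(int rw, struct kiocb *iocb,
				    struct inode *inode, struct iov_iter *iter,
				    loff_t offset, int flags)
	{
		if (IS_DAX(inode))
			return dax_do_io(rw, iocb, inode, iter, offset,
					 ext4_get_block, NULL, flags);
		return __blockdev_direct_IO(rw, iocb, inode,
					    inode->i_sb->s_bdev, iter, offset,
					    ext4_get_block, NULL, NULL, flags);
	}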
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..85404f15e53a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
657 return retval; 657 return retval;
658} 658}
659 659
660static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
661{
662 struct inode *inode = bh->b_assoc_map->host;
663 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
664 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
665 int err;
666 if (!uptodate)
667 return;
668 WARN_ON(!buffer_unwritten(bh));
669 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
670}
671
660/* Maximum number of blocks we map for direct IO at once. */ 672/* Maximum number of blocks we map for direct IO at once. */
661#define DIO_MAX_BLOCKS 4096 673#define DIO_MAX_BLOCKS 4096
662 674
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
694 706
695 map_bh(bh, inode->i_sb, map.m_pblk); 707 map_bh(bh, inode->i_sb, map.m_pblk);
696 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 708 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
709 if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
710 bh->b_assoc_map = inode->i_mapping;
711 bh->b_private = (void *)(unsigned long)iblock;
712 bh->b_end_io = ext4_end_io_unwritten;
713 }
697 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 714 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
698 set_buffer_defer_completion(bh); 715 set_buffer_defer_completion(bh);
699 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 716 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3010 get_block_func = ext4_get_block_write; 3027 get_block_func = ext4_get_block_write;
3011 dio_flags = DIO_LOCKING; 3028 dio_flags = DIO_LOCKING;
3012 } 3029 }
3013 ret = __blockdev_direct_IO(rw, iocb, inode, 3030 if (IS_DAX(inode))
3014 inode->i_sb->s_bdev, iter, 3031 ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
3015 offset, 3032 ext4_end_io_dio, dio_flags);
3016 get_block_func, 3033 else
3017 ext4_end_io_dio, 3034 ret = __blockdev_direct_IO(rw, iocb, inode,
3018 NULL, 3035 inode->i_sb->s_bdev, iter, offset,
3019 dio_flags); 3036 get_block_func,
3037 ext4_end_io_dio, NULL, dio_flags);
3020 3038
3021 /* 3039 /*
3022 * Put our reference to io_end. This can free the io_end structure e.g. 3040 * Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
3180 inode->i_mapping->a_ops = &ext4_aops; 3198 inode->i_mapping->a_ops = &ext4_aops;
3181} 3199}
3182 3200
3183/* 3201static int __ext4_block_zero_page_range(handle_t *handle,
3184 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3185 * starting from file offset 'from'. The range to be zero'd must
3186 * be contained within one block. If the specified range exceeds
3187 * the end of the block it will be shortened to the end of the block
3188 * that corresponds to 'from'
3189 */
3190static int ext4_block_zero_page_range(handle_t *handle,
3191 struct address_space *mapping, loff_t from, loff_t length) 3202 struct address_space *mapping, loff_t from, loff_t length)
3192{ 3203{
3193 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3204 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3194 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3205 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3195 unsigned blocksize, max, pos; 3206 unsigned blocksize, pos;
3196 ext4_lblk_t iblock; 3207 ext4_lblk_t iblock;
3197 struct inode *inode = mapping->host; 3208 struct inode *inode = mapping->host;
3198 struct buffer_head *bh; 3209 struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
3205 return -ENOMEM; 3216 return -ENOMEM;
3206 3217
3207 blocksize = inode->i_sb->s_blocksize; 3218 blocksize = inode->i_sb->s_blocksize;
3208 max = blocksize - (offset & (blocksize - 1));
3209
3210 /*
3211 * correct length if it does not fall between
3212 * 'from' and the end of the block
3213 */
3214 if (length > max || length < 0)
3215 length = max;
3216 3219
3217 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3220 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3218 3221
@@ -3278,6 +3281,33 @@ unlock:
3278} 3281}
3279 3282
3280/* 3283/*
3284 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3285 * starting from file offset 'from'. The range to be zero'd must
3286 * be contained within one block. If the specified range exceeds
3287 * the end of the block it will be shortened to the end of the block
3288 * that corresponds to 'from'
3289 */
3290static int ext4_block_zero_page_range(handle_t *handle,
3291 struct address_space *mapping, loff_t from, loff_t length)
3292{
3293 struct inode *inode = mapping->host;
3294 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3295 unsigned blocksize = inode->i_sb->s_blocksize;
3296 unsigned max = blocksize - (offset & (blocksize - 1));
3297
3298 /*
3299 * correct length if it does not fall between
3300 * 'from' and the end of the block
3301 */
3302 if (length > max || length < 0)
3303 length = max;
3304
3305 if (IS_DAX(inode))
3306 return dax_zero_page_range(inode, from, length, ext4_get_block);
3307 return __ext4_block_zero_page_range(handle, mapping, from, length);
3308}
3309
3310/*
3281 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3311 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3282 * up to the end of the block which corresponds to `from'. 3312 * up to the end of the block which corresponds to `from'.
3283 * This is required during truncate. We need to physically zero the tail end 3313 * This is required during truncate. We need to physically zero the tail end
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
3798 new_fl |= S_NOATIME; 3828 new_fl |= S_NOATIME;
3799 if (flags & EXT4_DIRSYNC_FL) 3829 if (flags & EXT4_DIRSYNC_FL)
3800 new_fl |= S_DIRSYNC; 3830 new_fl |= S_DIRSYNC;
3831 if (test_opt(inode->i_sb, DAX))
3832 new_fl |= S_DAX;
3801 inode_set_flags(inode, new_fl, 3833 inode_set_flags(inode, new_fl,
3802 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 3834 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
3803} 3835}
3804 3836
3805/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 3837/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4052 4084
4053 if (S_ISREG(inode->i_mode)) { 4085 if (S_ISREG(inode->i_mode)) {
4054 inode->i_op = &ext4_file_inode_operations; 4086 inode->i_op = &ext4_file_inode_operations;
4055 inode->i_fop = &ext4_file_operations; 4087 if (test_opt(inode->i_sb, DAX))
4088 inode->i_fop = &ext4_dax_file_operations;
4089 else
4090 inode->i_fop = &ext4_file_operations;
4056 ext4_set_aops(inode); 4091 ext4_set_aops(inode);
4057 } else if (S_ISDIR(inode->i_mode)) { 4092 } else if (S_ISDIR(inode->i_mode)) {
4058 inode->i_op = &ext4_dir_inode_operations; 4093 inode->i_op = &ext4_dir_inode_operations;
@@ -4139,6 +4174,65 @@ static int ext4_inode_blocks_set(handle_t *handle,
4139 return 0; 4174 return 0;
4140} 4175}
4141 4176
4177struct other_inode {
4178 unsigned long orig_ino;
4179 struct ext4_inode *raw_inode;
4180};
4181
4182static int other_inode_match(struct inode * inode, unsigned long ino,
4183 void *data)
4184{
4185 struct other_inode *oi = (struct other_inode *) data;
4186
4187 if ((inode->i_ino != ino) ||
4188 (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4189 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
4190 ((inode->i_state & I_DIRTY_TIME) == 0))
4191 return 0;
4192 spin_lock(&inode->i_lock);
4193 if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4194 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
4195 (inode->i_state & I_DIRTY_TIME)) {
4196 struct ext4_inode_info *ei = EXT4_I(inode);
4197
4198 inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
4199 spin_unlock(&inode->i_lock);
4200
4201 spin_lock(&ei->i_raw_lock);
4202 EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
4203 EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
4204 EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
4205 ext4_inode_csum_set(inode, oi->raw_inode, ei);
4206 spin_unlock(&ei->i_raw_lock);
4207 trace_ext4_other_inode_update_time(inode, oi->orig_ino);
4208 return -1;
4209 }
4210 spin_unlock(&inode->i_lock);
4211 return -1;
4212}
4213
4214/*
4215 * Opportunistically update the timestamp fields of other inodes in
4216 * the same inode table block.
4217 */
4218static void ext4_update_other_inodes_time(struct super_block *sb,
4219 unsigned long orig_ino, char *buf)
4220{
4221 struct other_inode oi;
4222 unsigned long ino;
4223 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4224 int inode_size = EXT4_INODE_SIZE(sb);
4225
4226 oi.orig_ino = orig_ino;
4227 ino = orig_ino & ~(inodes_per_block - 1);
4228 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
4229 if (ino == orig_ino)
4230 continue;
4231 oi.raw_inode = (struct ext4_inode *) buf;
4232 (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
4233 }
4234}
4235
4142/* 4236/*
4143 * Post the struct inode info into an on-disk inode location in the 4237 * Post the struct inode info into an on-disk inode location in the
4144 * buffer-cache. This gobbles the caller's reference to the 4238 * buffer-cache. This gobbles the caller's reference to the
@@ -4248,10 +4342,11 @@ static int ext4_do_update_inode(handle_t *handle,
4248 cpu_to_le16(ei->i_extra_isize); 4342 cpu_to_le16(ei->i_extra_isize);
4249 } 4343 }
4250 } 4344 }
4251
4252 ext4_inode_csum_set(inode, raw_inode, ei); 4345 ext4_inode_csum_set(inode, raw_inode, ei);
4253
4254 spin_unlock(&ei->i_raw_lock); 4346 spin_unlock(&ei->i_raw_lock);
4347 if (inode->i_sb->s_flags & MS_LAZYTIME)
4348 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
4349 bh->b_data);
4255 4350
4256 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4351 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4257 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 4352 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -4534,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4534 * Truncate pagecache after we've waited for commit 4629 * Truncate pagecache after we've waited for commit
4535 * in data=journal mode to make pages freeable. 4630 * in data=journal mode to make pages freeable.
4536 */ 4631 */
4537 truncate_pagecache(inode, inode->i_size); 4632 truncate_pagecache(inode, inode->i_size);
4538 } 4633 }
4539 /* 4634 /*
4540 * We want to call ext4_truncate() even if attr->ia_size == 4635 * We want to call ext4_truncate() even if attr->ia_size ==
@@ -4840,11 +4935,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4840 * If the inode is marked synchronous, we don't honour that here - doing 4935 * If the inode is marked synchronous, we don't honour that here - doing
4841 * so would cause a commit on atime updates, which we don't bother doing. 4936 * so would cause a commit on atime updates, which we don't bother doing.
4842 * We handle synchronous inodes at the highest possible level. 4937 * We handle synchronous inodes at the highest possible level.
4938 *
4939 * If only the I_DIRTY_TIME flag is set, we can skip everything. If
4940 * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
4941 * to copy into the on-disk inode structure are the timestamp fields.
4843 */ 4942 */
4844void ext4_dirty_inode(struct inode *inode, int flags) 4943void ext4_dirty_inode(struct inode *inode, int flags)
4845{ 4944{
4846 handle_t *handle; 4945 handle_t *handle;
4847 4946
4947 if (flags == I_DIRTY_TIME)
4948 return;
4848 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4949 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4849 if (IS_ERR(handle)) 4950 if (IS_ERR(handle))
4850 goto out; 4951 goto out;
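Two lazytime pieces land in inode.c: ext4_dirty_inode() returns early when only I_DIRTY_TIME is set, so a pure timestamp update starts no journal handle, and ext4_update_other_inodes_time() piggybacks the expired timestamps of neighboring inodes onto an inode-table block that is being written anyway. The rounding it uses to find the first inode of that block, worked with illustrative numbers (inodes_per_block is a power of two on ext4):

	/* Illustration of: ino = orig_ino & ~(inodes_per_block - 1); */
	unsigned long inodes_per_block = 16;	/* e.g. 4096-byte block / 256-byte inode */
	unsigned long orig_ino = 4242;
	unsigned long first = orig_ino & ~(inodes_per_block - 1);
	/* first == 4240: inodes 4240..4255 share one itable block, and
	 * find_inode_nowait() is probed for every slot except orig_ino */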
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..28fe71a2904c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
2235 err = PTR_ERR(inode); 2235 err = PTR_ERR(inode);
2236 if (!IS_ERR(inode)) { 2236 if (!IS_ERR(inode)) {
2237 inode->i_op = &ext4_file_inode_operations; 2237 inode->i_op = &ext4_file_inode_operations;
2238 inode->i_fop = &ext4_file_operations; 2238 if (test_opt(inode->i_sb, DAX))
2239 inode->i_fop = &ext4_dax_file_operations;
2240 else
2241 inode->i_fop = &ext4_file_operations;
2239 ext4_set_aops(inode); 2242 ext4_set_aops(inode);
2240 err = ext4_add_nondir(handle, dentry, inode); 2243 err = ext4_add_nondir(handle, dentry, inode);
2241 if (!err && IS_DIRSYNC(dir)) 2244 if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
2299 err = PTR_ERR(inode); 2302 err = PTR_ERR(inode);
2300 if (!IS_ERR(inode)) { 2303 if (!IS_ERR(inode)) {
2301 inode->i_op = &ext4_file_inode_operations; 2304 inode->i_op = &ext4_file_inode_operations;
2302 inode->i_fop = &ext4_file_operations; 2305 if (test_opt(inode->i_sb, DAX))
2306 inode->i_fop = &ext4_dax_file_operations;
2307 else
2308 inode->i_fop = &ext4_file_operations;
2303 ext4_set_aops(inode); 2309 ext4_set_aops(inode);
2304 d_tmpfile(dentry, inode); 2310 d_tmpfile(dentry, inode);
2305 err = ext4_orphan_add(handle, inode); 2311 err = ext4_orphan_add(handle, inode);
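With these hunks the test_opt(sb, DAX) choice of i_fop appears three times: ext4_iget() plus ext4_create() and ext4_tmpfile() above. A hypothetical helper (not part of the patch) would keep the copies from drifting:

	/* Sketch only: centralize the DAX/non-DAX file_operations choice.
	 * Call sites become: inode->i_fop = ext4_pick_file_ops(inode->i_sb); */
	static inline const struct file_operations *
	ext4_pick_file_ops(struct super_block *sb)
	{
		if (test_opt(sb, DAX))
			return &ext4_dax_file_operations;
		return &ext4_file_operations;
	}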
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 74c5f53595fb..1adac6868e6f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
334static int block_device_ejected(struct super_block *sb) 334static int block_device_ejected(struct super_block *sb)
335{ 335{
336 struct inode *bd_inode = sb->s_bdev->bd_inode; 336 struct inode *bd_inode = sb->s_bdev->bd_inode;
337 struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; 337 struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
338 338
339 return bdi->dev == NULL; 339 return bdi->dev == NULL;
340} 340}
@@ -1046,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
1046static int ext4_write_info(struct super_block *sb, int type); 1046static int ext4_write_info(struct super_block *sb, int type);
1047static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1047static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1048 struct path *path); 1048 struct path *path);
1049static int ext4_quota_on_sysfile(struct super_block *sb, int type,
1050 int format_id);
1051static int ext4_quota_off(struct super_block *sb, int type); 1049static int ext4_quota_off(struct super_block *sb, int type);
1052static int ext4_quota_off_sysfile(struct super_block *sb, int type);
1053static int ext4_quota_on_mount(struct super_block *sb, int type); 1050static int ext4_quota_on_mount(struct super_block *sb, int type);
1054static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1051static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1055 size_t len, loff_t off); 1052 size_t len, loff_t off);
@@ -1084,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = {
1084 .get_dqblk = dquot_get_dqblk, 1081 .get_dqblk = dquot_get_dqblk,
1085 .set_dqblk = dquot_set_dqblk 1082 .set_dqblk = dquot_set_dqblk
1086}; 1083};
1087
1088static const struct quotactl_ops ext4_qctl_sysfile_operations = {
1089 .quota_on_meta = ext4_quota_on_sysfile,
1090 .quota_off = ext4_quota_off_sysfile,
1091 .quota_sync = dquot_quota_sync,
1092 .get_info = dquot_get_dqinfo,
1093 .set_info = dquot_set_dqinfo,
1094 .get_dqblk = dquot_get_dqblk,
1095 .set_dqblk = dquot_set_dqblk
1096};
1097#endif 1084#endif
1098 1085
1099static const struct super_operations ext4_sops = { 1086static const struct super_operations ext4_sops = {
@@ -1137,8 +1124,9 @@ enum {
1137 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1124 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1138 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1125 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1139 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, 1126 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1140 Opt_usrquota, Opt_grpquota, Opt_i_version, 1127 Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
1141 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1128 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1129 Opt_lazytime, Opt_nolazytime,
1142 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1130 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1143 Opt_inode_readahead_blks, Opt_journal_ioprio, 1131 Opt_inode_readahead_blks, Opt_journal_ioprio,
1144 Opt_dioread_nolock, Opt_dioread_lock, 1132 Opt_dioread_nolock, Opt_dioread_lock,
@@ -1200,8 +1188,11 @@ static const match_table_t tokens = {
1200 {Opt_barrier, "barrier"}, 1188 {Opt_barrier, "barrier"},
1201 {Opt_nobarrier, "nobarrier"}, 1189 {Opt_nobarrier, "nobarrier"},
1202 {Opt_i_version, "i_version"}, 1190 {Opt_i_version, "i_version"},
1191 {Opt_dax, "dax"},
1203 {Opt_stripe, "stripe=%u"}, 1192 {Opt_stripe, "stripe=%u"},
1204 {Opt_delalloc, "delalloc"}, 1193 {Opt_delalloc, "delalloc"},
1194 {Opt_lazytime, "lazytime"},
1195 {Opt_nolazytime, "nolazytime"},
1205 {Opt_nodelalloc, "nodelalloc"}, 1196 {Opt_nodelalloc, "nodelalloc"},
1206 {Opt_removed, "mblk_io_submit"}, 1197 {Opt_removed, "mblk_io_submit"},
1207 {Opt_removed, "nomblk_io_submit"}, 1198 {Opt_removed, "nomblk_io_submit"},
@@ -1384,6 +1375,7 @@ static const struct mount_opts {
1384 {Opt_min_batch_time, 0, MOPT_GTE0}, 1375 {Opt_min_batch_time, 0, MOPT_GTE0},
1385 {Opt_inode_readahead_blks, 0, MOPT_GTE0}, 1376 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1386 {Opt_init_itable, 0, MOPT_GTE0}, 1377 {Opt_init_itable, 0, MOPT_GTE0},
1378 {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
1387 {Opt_stripe, 0, MOPT_GTE0}, 1379 {Opt_stripe, 0, MOPT_GTE0},
1388 {Opt_resuid, 0, MOPT_GTE0}, 1380 {Opt_resuid, 0, MOPT_GTE0},
1389 {Opt_resgid, 0, MOPT_GTE0}, 1381 {Opt_resgid, 0, MOPT_GTE0},
@@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1459 case Opt_i_version: 1451 case Opt_i_version:
1460 sb->s_flags |= MS_I_VERSION; 1452 sb->s_flags |= MS_I_VERSION;
1461 return 1; 1453 return 1;
1454 case Opt_lazytime:
1455 sb->s_flags |= MS_LAZYTIME;
1456 return 1;
1457 case Opt_nolazytime:
1458 sb->s_flags &= ~MS_LAZYTIME;
1459 return 1;
1462 } 1460 }
1463 1461
1464 for (m = ext4_mount_opts; m->token != Opt_err; m++) 1462 for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1620 } 1618 }
1621 sbi->s_jquota_fmt = m->mount_opt; 1619 sbi->s_jquota_fmt = m->mount_opt;
1622#endif 1620#endif
1621#ifndef CONFIG_FS_DAX
1622 } else if (token == Opt_dax) {
1623 ext4_msg(sb, KERN_INFO, "dax option not supported");
1624 return -1;
1625#endif
1623 } else { 1626 } else {
1624 if (!args->from) 1627 if (!args->from)
1625 arg = 1; 1628 arg = 1;
@@ -3602,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3602 "both data=journal and dioread_nolock"); 3605 "both data=journal and dioread_nolock");
3603 goto failed_mount; 3606 goto failed_mount;
3604 } 3607 }
3608 if (test_opt(sb, DAX)) {
3609 ext4_msg(sb, KERN_ERR, "can't mount with "
3610 "both data=journal and dax");
3611 goto failed_mount;
3612 }
3605 if (test_opt(sb, DELALLOC)) 3613 if (test_opt(sb, DELALLOC))
3606 clear_opt(sb, DELALLOC); 3614 clear_opt(sb, DELALLOC);
3607 } 3615 }
@@ -3665,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3665 goto failed_mount; 3673 goto failed_mount;
3666 } 3674 }
3667 3675
3676 if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
3677 if (blocksize != PAGE_SIZE) {
3678 ext4_msg(sb, KERN_ERR,
3679 "error: unsupported blocksize for dax");
3680 goto failed_mount;
3681 }
3682 if (!sb->s_bdev->bd_disk->fops->direct_access) {
3683 ext4_msg(sb, KERN_ERR,
3684 "error: device does not support dax");
3685 goto failed_mount;
3686 }
3687 }
3688
3668 if (sb->s_blocksize != blocksize) { 3689 if (sb->s_blocksize != blocksize) {
3669 /* Validate the filesystem blocksize */ 3690 /* Validate the filesystem blocksize */
3670 if (!sb_set_blocksize(sb, blocksize)) { 3691 if (!sb_set_blocksize(sb, blocksize)) {
@@ -3935,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3935#ifdef CONFIG_QUOTA 3956#ifdef CONFIG_QUOTA
3936 sb->dq_op = &ext4_quota_operations; 3957 sb->dq_op = &ext4_quota_operations;
3937 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) 3958 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
3938 sb->s_qcop = &ext4_qctl_sysfile_operations; 3959 sb->s_qcop = &dquot_quotactl_sysfile_ops;
3939 else 3960 else
3940 sb->s_qcop = &ext4_qctl_operations; 3961 sb->s_qcop = &ext4_qctl_operations;
3941 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 3962 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -4882,6 +4903,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4882 err = -EINVAL; 4903 err = -EINVAL;
4883 goto restore_opts; 4904 goto restore_opts;
4884 } 4905 }
4906 if (test_opt(sb, DAX)) {
4907 ext4_msg(sb, KERN_ERR, "can't mount with "
4908 "both data=journal and dax");
4909 err = -EINVAL;
4910 goto restore_opts;
4911 }
4912 }
4913
4914 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
4915 ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
4916 "dax flag with busy inodes while remounting");
4917 sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
4885 } 4918 }
4886 4919
4887 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 4920 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
@@ -5020,6 +5053,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
5020 } 5053 }
5021#endif 5054#endif
5022 5055
5056 *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
5023 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); 5057 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
5024 kfree(orig_data); 5058 kfree(orig_data);
5025 return 0; 5059 return 0;
@@ -5288,21 +5322,6 @@ static int ext4_enable_quotas(struct super_block *sb)
5288 return 0; 5322 return 0;
5289} 5323}
5290 5324
5291/*
5292 * quota_on function that is used when QUOTA feature is set.
5293 */
5294static int ext4_quota_on_sysfile(struct super_block *sb, int type,
5295 int format_id)
5296{
5297 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5298 return -EINVAL;
5299
5300 /*
5301 * USAGE was enabled at mount time. Only need to enable LIMITS now.
5302 */
5303 return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
5304}
5305
5306static int ext4_quota_off(struct super_block *sb, int type) 5325static int ext4_quota_off(struct super_block *sb, int type)
5307{ 5326{
5308 struct inode *inode = sb_dqopt(sb)->files[type]; 5327 struct inode *inode = sb_dqopt(sb)->files[type];
@@ -5329,18 +5348,6 @@ out:
5329 return dquot_quota_off(sb, type); 5348 return dquot_quota_off(sb, type);
5330} 5349}
5331 5350
5332/*
5333 * quota_off function that is used when QUOTA feature is set.
5334 */
5335static int ext4_quota_off_sysfile(struct super_block *sb, int type)
5336{
5337 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5338 return -EINVAL;
5339
5340 /* Disable only the limits. */
5341 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
5342}
5343
5344/* Read data from quotafile - avoid pagecache and such because we cannot afford 5351/* Read data from quotafile - avoid pagecache and such because we cannot afford
5345 * acquiring the locks... As quota files are never truncated and quota code 5352 * acquiring the locks... As quota files are never truncated and quota code
5346 * itself serializes the operations (and no one else should touch the files) 5353 * itself serializes the operations (and no one else should touch the files)
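Summing up the super.c changes: dax and lazytime become ordinary mount options; dax is rejected when CONFIG_FS_DAX is off, is incompatible with data=journal, requires a PAGE_SIZE block size and a device with ->direct_access(), and a remount may not toggle it. A hypothetical session, assuming /dev/pmem0 meets those requirements:

	# mount -t ext4 -o dax,lazytime /dev/pmem0 /mnt/pmem
	# mount -o remount,nolazytime /mnt/pmem   # lazytime may be toggled
	# mount -o remount /mnt/pmem              # omitting dax only logs the
	                                          # "refusing change of dax flag"
	                                          # warning; the old state is kept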
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 736a348509f7..94e2d2ffabe1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -71,3 +71,13 @@ config F2FS_CHECK_FS
71 Enables BUG_ONs which check the filesystem consistency in runtime. 71 Enables BUG_ONs which check the filesystem consistency in runtime.
72 72
73 If you want to improve the performance, say N. 73 If you want to improve the performance, say N.
74
75config F2FS_IO_TRACE
76 bool "F2FS IO tracer"
77 depends on F2FS_FS
78 depends on FUNCTION_TRACER
79 help
80 F2FS IO trace is based on the function tracer, which gathers process
81 information and block IO patterns at the filesystem level.
82
83 If unsure, say N.
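Per the two depends lines, the tracer only builds when both the base filesystem and the function tracer are enabled; a .config fragment that satisfies this might look like:

	CONFIG_FUNCTION_TRACER=y
	CONFIG_F2FS_FS=y
	CONFIG_F2FS_IO_TRACE=y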
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2e35da12d292..d92397731db8 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -5,3 +5,4 @@ f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o 5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o 6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o 7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
8f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1ccb26bc2a0b..742202779bd5 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -62,7 +62,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
62 if (count == 0) 62 if (count == 0)
63 return NULL; 63 return NULL;
64 64
65 acl = posix_acl_alloc(count, GFP_KERNEL); 65 acl = posix_acl_alloc(count, GFP_NOFS);
66 if (!acl) 66 if (!acl)
67 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
68 68
@@ -116,7 +116,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
116 int i; 116 int i;
117 117
118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * 118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
119 sizeof(struct f2fs_acl_entry), GFP_KERNEL); 119 sizeof(struct f2fs_acl_entry), GFP_NOFS);
120 if (!f2fs_acl) 120 if (!f2fs_acl)
121 return ERR_PTR(-ENOMEM); 121 return ERR_PTR(-ENOMEM);
122 122
@@ -396,7 +396,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
396 posix_acl_release(default_acl); 396 posix_acl_release(default_acl);
397 } 397 }
398 if (acl) { 398 if (acl) {
399 if (error) 399 if (!error)
400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, 400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
401 ipage); 401 ipage);
402 posix_acl_release(acl); 402 posix_acl_release(acl);
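Two separate fixes sit in acl.c: the GFP_KERNEL to GFP_NOFS switches stop ACL allocations from re-entering the filesystem through memory reclaim, and the `if (error)` to `if (!error)` change makes f2fs_init_acl() apply the access ACL only when the preceding default-ACL step succeeded. A condensed sketch of the corrected flow (the default-ACL half is reconstructed from context, not shown in the hunk):

	if (default_acl) {
		error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT,
				       default_acl, ipage);
		posix_acl_release(default_acl);
	}
	if (acl) {
		if (!error)	/* was: if (error), an inverted test */
			error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS,
					       acl, ipage);
		posix_acl_release(acl);
	}
	return error;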
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index e6c271fefaca..7f794b72b3b7 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,10 +20,11 @@
20#include "f2fs.h" 20#include "f2fs.h"
21#include "node.h" 21#include "node.h"
22#include "segment.h" 22#include "segment.h"
23#include "trace.h"
23#include <trace/events/f2fs.h> 24#include <trace/events/f2fs.h>
24 25
25static struct kmem_cache *ino_entry_slab; 26static struct kmem_cache *ino_entry_slab;
26static struct kmem_cache *inode_entry_slab; 27struct kmem_cache *inode_entry_slab;
27 28
28/* 29/*
29 * We guarantee no failure on the returned page. 30 * We guarantee no failure on the returned page.
@@ -50,6 +51,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
50{ 51{
51 struct address_space *mapping = META_MAPPING(sbi); 52 struct address_space *mapping = META_MAPPING(sbi);
52 struct page *page; 53 struct page *page;
54 struct f2fs_io_info fio = {
55 .type = META,
56 .rw = READ_SYNC | REQ_META | REQ_PRIO,
57 .blk_addr = index,
58 };
53repeat: 59repeat:
54 page = grab_cache_page(mapping, index); 60 page = grab_cache_page(mapping, index);
55 if (!page) { 61 if (!page) {
@@ -59,8 +65,7 @@ repeat:
59 if (PageUptodate(page)) 65 if (PageUptodate(page))
60 goto out; 66 goto out;
61 67
62 if (f2fs_submit_page_bio(sbi, page, index, 68 if (f2fs_submit_page_bio(sbi, page, &fio))
63 READ_SYNC | REQ_META | REQ_PRIO))
64 goto repeat; 69 goto repeat;
65 70
66 lock_page(page); 71 lock_page(page);
@@ -112,14 +117,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
112 block_t prev_blk_addr = 0; 117 block_t prev_blk_addr = 0;
113 struct page *page; 118 struct page *page;
114 block_t blkno = start; 119 block_t blkno = start;
115
116 struct f2fs_io_info fio = { 120 struct f2fs_io_info fio = {
117 .type = META, 121 .type = META,
118 .rw = READ_SYNC | REQ_META | REQ_PRIO 122 .rw = READ_SYNC | REQ_META | REQ_PRIO
119 }; 123 };
120 124
121 for (; nrpages-- > 0; blkno++) { 125 for (; nrpages-- > 0; blkno++) {
122 block_t blk_addr;
123 126
124 if (!is_valid_blkaddr(sbi, blkno, type)) 127 if (!is_valid_blkaddr(sbi, blkno, type))
125 goto out; 128 goto out;
@@ -130,27 +133,27 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
130 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) 133 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
131 blkno = 0; 134 blkno = 0;
132 /* get nat block addr */ 135 /* get nat block addr */
133 blk_addr = current_nat_addr(sbi, 136 fio.blk_addr = current_nat_addr(sbi,
134 blkno * NAT_ENTRY_PER_BLOCK); 137 blkno * NAT_ENTRY_PER_BLOCK);
135 break; 138 break;
136 case META_SIT: 139 case META_SIT:
137 /* get sit block addr */ 140 /* get sit block addr */
138 blk_addr = current_sit_addr(sbi, 141 fio.blk_addr = current_sit_addr(sbi,
139 blkno * SIT_ENTRY_PER_BLOCK); 142 blkno * SIT_ENTRY_PER_BLOCK);
140 if (blkno != start && prev_blk_addr + 1 != blk_addr) 143 if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
141 goto out; 144 goto out;
142 prev_blk_addr = blk_addr; 145 prev_blk_addr = fio.blk_addr;
143 break; 146 break;
144 case META_SSA: 147 case META_SSA:
145 case META_CP: 148 case META_CP:
146 case META_POR: 149 case META_POR:
147 blk_addr = blkno; 150 fio.blk_addr = blkno;
148 break; 151 break;
149 default: 152 default:
150 BUG(); 153 BUG();
151 } 154 }
152 155
153 page = grab_cache_page(META_MAPPING(sbi), blk_addr); 156 page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
154 if (!page) 157 if (!page)
155 continue; 158 continue;
156 if (PageUptodate(page)) { 159 if (PageUptodate(page)) {
@@ -158,7 +161,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
158 continue; 161 continue;
159 } 162 }
160 163
161 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); 164 f2fs_submit_page_mbio(sbi, page, &fio);
162 f2fs_put_page(page, 0); 165 f2fs_put_page(page, 0);
163 } 166 }
164out: 167out:
@@ -187,7 +190,7 @@ static int f2fs_write_meta_page(struct page *page,
187 190
188 trace_f2fs_writepage(page, META); 191 trace_f2fs_writepage(page, META);
189 192
190 if (unlikely(sbi->por_doing)) 193 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
191 goto redirty_out; 194 goto redirty_out;
192 if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) 195 if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
193 goto redirty_out; 196 goto redirty_out;
@@ -299,6 +302,8 @@ static int f2fs_set_meta_page_dirty(struct page *page)
299 if (!PageDirty(page)) { 302 if (!PageDirty(page)) {
300 __set_page_dirty_nobuffers(page); 303 __set_page_dirty_nobuffers(page);
301 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); 304 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
305 SetPagePrivate(page);
306 f2fs_trace_pid(page);
302 return 1; 307 return 1;
303 } 308 }
304 return 0; 309 return 0;
@@ -308,6 +313,8 @@ const struct address_space_operations f2fs_meta_aops = {
308 .writepage = f2fs_write_meta_page, 313 .writepage = f2fs_write_meta_page,
309 .writepages = f2fs_write_meta_pages, 314 .writepages = f2fs_write_meta_pages,
310 .set_page_dirty = f2fs_set_meta_page_dirty, 315 .set_page_dirty = f2fs_set_meta_page_dirty,
316 .invalidatepage = f2fs_invalidate_page,
317 .releasepage = f2fs_release_page,
311}; 318};
312 319
313static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) 320static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -462,7 +469,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
462 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) 469 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
463 return; 470 return;
464 471
465 sbi->por_doing = true; 472 set_sbi_flag(sbi, SBI_POR_DOING);
466 473
467 start_blk = __start_cp_addr(sbi) + 1 + 474 start_blk = __start_cp_addr(sbi) + 1 +
468 le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); 475 le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
@@ -483,7 +490,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
483 } 490 }
484 /* clear Orphan Flag */ 491 /* clear Orphan Flag */
485 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); 492 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
486 sbi->por_doing = false; 493 clear_sbi_flag(sbi, SBI_POR_DOING);
487 return; 494 return;
488} 495}
489 496
@@ -567,7 +574,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
567 if (crc_offset >= blk_size) 574 if (crc_offset >= blk_size)
568 goto invalid_cp1; 575 goto invalid_cp1;
569 576
570 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); 577 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
571 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 578 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
572 goto invalid_cp1; 579 goto invalid_cp1;
573 580
@@ -582,7 +589,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
582 if (crc_offset >= blk_size) 589 if (crc_offset >= blk_size)
583 goto invalid_cp2; 590 goto invalid_cp2;
584 591
585 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); 592 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
586 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 593 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
587 goto invalid_cp2; 594 goto invalid_cp2;
588 595
@@ -669,7 +676,7 @@ fail_no_cp:
669 return -EINVAL; 676 return -EINVAL;
670} 677}
671 678
672static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) 679static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
673{ 680{
674 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 681 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
675 682
@@ -686,7 +693,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
686void update_dirty_page(struct inode *inode, struct page *page) 693void update_dirty_page(struct inode *inode, struct page *page)
687{ 694{
688 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 695 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
689 struct dir_inode_entry *new; 696 struct inode_entry *new;
690 int ret = 0; 697 int ret = 0;
691 698
692 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) 699 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
@@ -710,12 +717,13 @@ void update_dirty_page(struct inode *inode, struct page *page)
710 kmem_cache_free(inode_entry_slab, new); 717 kmem_cache_free(inode_entry_slab, new);
711out: 718out:
712 SetPagePrivate(page); 719 SetPagePrivate(page);
720 f2fs_trace_pid(page);
713} 721}
714 722
715void add_dirty_dir_inode(struct inode *inode) 723void add_dirty_dir_inode(struct inode *inode)
716{ 724{
717 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 725 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
718 struct dir_inode_entry *new = 726 struct inode_entry *new =
719 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 727 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
720 int ret = 0; 728 int ret = 0;
721 729
@@ -733,7 +741,7 @@ void add_dirty_dir_inode(struct inode *inode)
733void remove_dirty_dir_inode(struct inode *inode) 741void remove_dirty_dir_inode(struct inode *inode)
734{ 742{
735 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 743 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
736 struct dir_inode_entry *entry; 744 struct inode_entry *entry;
737 745
738 if (!S_ISDIR(inode->i_mode)) 746 if (!S_ISDIR(inode->i_mode))
739 return; 747 return;
@@ -763,7 +771,7 @@ void remove_dirty_dir_inode(struct inode *inode)
763void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) 771void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
764{ 772{
765 struct list_head *head; 773 struct list_head *head;
766 struct dir_inode_entry *entry; 774 struct inode_entry *entry;
767 struct inode *inode; 775 struct inode *inode;
768retry: 776retry:
769 if (unlikely(f2fs_cp_error(sbi))) 777 if (unlikely(f2fs_cp_error(sbi)))
@@ -776,7 +784,7 @@ retry:
776 spin_unlock(&sbi->dir_inode_lock); 784 spin_unlock(&sbi->dir_inode_lock);
777 return; 785 return;
778 } 786 }
779 entry = list_entry(head->next, struct dir_inode_entry, list); 787 entry = list_entry(head->next, struct inode_entry, list);
780 inode = igrab(entry->inode); 788 inode = igrab(entry->inode);
781 spin_unlock(&sbi->dir_inode_lock); 789 spin_unlock(&sbi->dir_inode_lock);
782 if (inode) { 790 if (inode) {
@@ -922,7 +930,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
922 ckpt->next_free_nid = cpu_to_le32(last_nid); 930 ckpt->next_free_nid = cpu_to_le32(last_nid);
923 931
924 /* 2 cp + n data seg summary + orphan inode blocks */ 932 /* 2 cp + n data seg summary + orphan inode blocks */
925 data_sum_blocks = npages_for_summary_flush(sbi); 933 data_sum_blocks = npages_for_summary_flush(sbi, false);
926 if (data_sum_blocks < NR_CURSEG_DATA_TYPE) 934 if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
927 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 935 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
928 else 936 else
@@ -932,24 +940,31 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
932 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + 940 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
933 orphan_blocks); 941 orphan_blocks);
934 942
935 if (cpc->reason == CP_UMOUNT) { 943 if (__remain_node_summaries(cpc->reason))
936 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
937 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ 944 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
938 cp_payload_blks + data_sum_blocks + 945 cp_payload_blks + data_sum_blocks +
939 orphan_blocks + NR_CURSEG_NODE_TYPE); 946 orphan_blocks + NR_CURSEG_NODE_TYPE);
940 } else { 947 else
941 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
942 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + 948 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
943 cp_payload_blks + data_sum_blocks + 949 cp_payload_blks + data_sum_blocks +
944 orphan_blocks); 950 orphan_blocks);
945 } 951
952 if (cpc->reason == CP_UMOUNT)
953 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
954 else
955 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
956
957 if (cpc->reason == CP_FASTBOOT)
958 set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
959 else
960 clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
946 961
947 if (orphan_num) 962 if (orphan_num)
948 set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); 963 set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
949 else 964 else
950 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); 965 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
951 966
952 if (sbi->need_fsck) 967 if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
953 set_ckpt_flags(ckpt, CP_FSCK_FLAG); 968 set_ckpt_flags(ckpt, CP_FSCK_FLAG);
954 969
955 /* update SIT/NAT bitmap */ 970 /* update SIT/NAT bitmap */
@@ -966,15 +981,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
966 /* write out checkpoint buffer at block 0 */ 981 /* write out checkpoint buffer at block 0 */
967 cp_page = grab_meta_page(sbi, start_blk++); 982 cp_page = grab_meta_page(sbi, start_blk++);
968 kaddr = page_address(cp_page); 983 kaddr = page_address(cp_page);
969 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); 984 memcpy(kaddr, ckpt, F2FS_BLKSIZE);
970 set_page_dirty(cp_page); 985 set_page_dirty(cp_page);
971 f2fs_put_page(cp_page, 1); 986 f2fs_put_page(cp_page, 1);
972 987
973 for (i = 1; i < 1 + cp_payload_blks; i++) { 988 for (i = 1; i < 1 + cp_payload_blks; i++) {
974 cp_page = grab_meta_page(sbi, start_blk++); 989 cp_page = grab_meta_page(sbi, start_blk++);
975 kaddr = page_address(cp_page); 990 kaddr = page_address(cp_page);
976 memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, 991 memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
977 (1 << sbi->log_blocksize));
978 set_page_dirty(cp_page); 992 set_page_dirty(cp_page);
979 f2fs_put_page(cp_page, 1); 993 f2fs_put_page(cp_page, 1);
980 } 994 }
@@ -986,7 +1000,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
986 1000
987 write_data_summaries(sbi, start_blk); 1001 write_data_summaries(sbi, start_blk);
988 start_blk += data_sum_blocks; 1002 start_blk += data_sum_blocks;
989 if (cpc->reason == CP_UMOUNT) { 1003 if (__remain_node_summaries(cpc->reason)) {
990 write_node_summaries(sbi, start_blk); 1004 write_node_summaries(sbi, start_blk);
991 start_blk += NR_CURSEG_NODE_TYPE; 1005 start_blk += NR_CURSEG_NODE_TYPE;
992 } 1006 }
@@ -994,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
994 /* writeout checkpoint block */ 1008 /* writeout checkpoint block */
995 cp_page = grab_meta_page(sbi, start_blk); 1009 cp_page = grab_meta_page(sbi, start_blk);
996 kaddr = page_address(cp_page); 1010 kaddr = page_address(cp_page);
997 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); 1011 memcpy(kaddr, ckpt, F2FS_BLKSIZE);
998 set_page_dirty(cp_page); 1012 set_page_dirty(cp_page);
999 f2fs_put_page(cp_page, 1); 1013 f2fs_put_page(cp_page, 1);
1000 1014
@@ -1023,7 +1037,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1023 return; 1037 return;
1024 1038
1025 clear_prefree_segments(sbi); 1039 clear_prefree_segments(sbi);
1026 F2FS_RESET_SB_DIRT(sbi); 1040 clear_sbi_flag(sbi, SBI_IS_DIRTY);
1027} 1041}
1028 1042
1029/* 1043/*
@@ -1038,10 +1052,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1038 1052
1039 mutex_lock(&sbi->cp_mutex); 1053 mutex_lock(&sbi->cp_mutex);
1040 1054
1041 if (!sbi->s_dirty && cpc->reason != CP_DISCARD) 1055 if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
1056 cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
1042 goto out; 1057 goto out;
1043 if (unlikely(f2fs_cp_error(sbi))) 1058 if (unlikely(f2fs_cp_error(sbi)))
1044 goto out; 1059 goto out;
1060 if (f2fs_readonly(sbi->sb))
1061 goto out;
1045 if (block_operations(sbi)) 1062 if (block_operations(sbi))
1046 goto out; 1063 goto out;
1047 1064
@@ -1102,8 +1119,8 @@ int __init create_checkpoint_caches(void)
1102 sizeof(struct ino_entry)); 1119 sizeof(struct ino_entry));
1103 if (!ino_entry_slab) 1120 if (!ino_entry_slab)
1104 return -ENOMEM; 1121 return -ENOMEM;
1105 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 1122 inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
1106 sizeof(struct dir_inode_entry)); 1123 sizeof(struct inode_entry));
1107 if (!inode_entry_slab) { 1124 if (!inode_entry_slab) {
1108 kmem_cache_destroy(ino_entry_slab); 1125 kmem_cache_destroy(ino_entry_slab);
1109 return -ENOMEM; 1126 return -ENOMEM;
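The recurring refactor in checkpoint.c (and in data.c below) moves the target block address into struct f2fs_io_info, so the submit paths and tracepoints receive one self-describing descriptor instead of a loose blk_addr argument. The new calling convention, exactly as get_meta_page() above uses it:

	struct f2fs_io_info fio = {
		.type = META,
		.rw = READ_SYNC | REQ_META | REQ_PRIO,
		.blk_addr = index,	/* previously a separate argument */
	};

	if (f2fs_submit_page_bio(sbi, page, &fio))
		goto repeat;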
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7ec697b37f19..985ed023a750 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -22,6 +22,7 @@
22#include "f2fs.h" 22#include "f2fs.h"
23#include "node.h" 23#include "node.h"
24#include "segment.h" 24#include "segment.h"
25#include "trace.h"
25#include <trace/events/f2fs.h> 26#include <trace/events/f2fs.h>
26 27
27static void f2fs_read_end_io(struct bio *bio, int err) 28static void f2fs_read_end_io(struct bio *bio, int err)
@@ -95,11 +96,9 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
95 return; 96 return;
96 97
97 if (is_read_io(fio->rw)) 98 if (is_read_io(fio->rw))
98 trace_f2fs_submit_read_bio(io->sbi->sb, fio->rw, 99 trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
99 fio->type, io->bio);
100 else 100 else
101 trace_f2fs_submit_write_bio(io->sbi->sb, fio->rw, 101 trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
102 fio->type, io->bio);
103 102
104 submit_bio(fio->rw, io->bio); 103 submit_bio(fio->rw, io->bio);
105 io->bio = NULL; 104 io->bio = NULL;
@@ -132,14 +131,15 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
132 * Return unlocked page. 131 * Return unlocked page.
133 */ 132 */
134int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, 133int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
135 block_t blk_addr, int rw) 134 struct f2fs_io_info *fio)
136{ 135{
137 struct bio *bio; 136 struct bio *bio;
138 137
139 trace_f2fs_submit_page_bio(page, blk_addr, rw); 138 trace_f2fs_submit_page_bio(page, fio);
139 f2fs_trace_ios(page, fio, 0);
140 140
141 /* Allocate a new bio */ 141 /* Allocate a new bio */
142 bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); 142 bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
143 143
144 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 144 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
145 bio_put(bio); 145 bio_put(bio);
@@ -147,12 +147,12 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
147 return -EFAULT; 147 return -EFAULT;
148 } 148 }
149 149
150 submit_bio(rw, bio); 150 submit_bio(fio->rw, bio);
151 return 0; 151 return 0;
152} 152}
153 153
154void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, 154void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
155 block_t blk_addr, struct f2fs_io_info *fio) 155 struct f2fs_io_info *fio)
156{ 156{
157 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); 157 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
158 struct f2fs_bio_info *io; 158 struct f2fs_bio_info *io;
@@ -160,21 +160,21 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
160 160
161 io = is_read ? &sbi->read_io : &sbi->write_io[btype]; 161 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
162 162
163 verify_block_addr(sbi, blk_addr); 163 verify_block_addr(sbi, fio->blk_addr);
164 164
165 down_write(&io->io_rwsem); 165 down_write(&io->io_rwsem);
166 166
167 if (!is_read) 167 if (!is_read)
168 inc_page_count(sbi, F2FS_WRITEBACK); 168 inc_page_count(sbi, F2FS_WRITEBACK);
169 169
170 if (io->bio && (io->last_block_in_bio != blk_addr - 1 || 170 if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
171 io->fio.rw != fio->rw)) 171 io->fio.rw != fio->rw))
172 __submit_merged_bio(io); 172 __submit_merged_bio(io);
173alloc_new: 173alloc_new:
174 if (io->bio == NULL) { 174 if (io->bio == NULL) {
175 int bio_blocks = MAX_BIO_BLOCKS(sbi); 175 int bio_blocks = MAX_BIO_BLOCKS(sbi);
176 176
177 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); 177 io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
178 io->fio = *fio; 178 io->fio = *fio;
179 } 179 }
180 180
@@ -184,10 +184,11 @@ alloc_new:
184 goto alloc_new; 184 goto alloc_new;
185 } 185 }
186 186
187 io->last_block_in_bio = blk_addr; 187 io->last_block_in_bio = fio->blk_addr;
188 f2fs_trace_ios(page, fio, 0);
188 189
189 up_write(&io->io_rwsem); 190 up_write(&io->io_rwsem);
190 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); 191 trace_f2fs_submit_page_mbio(page, fio);
191} 192}
192 193
193/* 194/*
@@ -196,7 +197,7 @@ alloc_new:
196 * ->node_page 197 * ->node_page
197 * update block addresses in the node page 198 * update block addresses in the node page
198 */ 199 */
199static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) 200static void __set_data_blkaddr(struct dnode_of_data *dn)
200{ 201{
201 struct f2fs_node *rn; 202 struct f2fs_node *rn;
202 __le32 *addr_array; 203 __le32 *addr_array;
@@ -209,7 +210,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
209 210
210 /* Get physical address of data block */ 211 /* Get physical address of data block */
211 addr_array = blkaddr_in_node(rn); 212 addr_array = blkaddr_in_node(rn);
212 addr_array[ofs_in_node] = cpu_to_le32(new_addr); 213 addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
213 set_page_dirty(node_page); 214 set_page_dirty(node_page);
214} 215}
215 216
@@ -224,8 +225,8 @@ int reserve_new_block(struct dnode_of_data *dn)
224 225
225 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); 226 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
226 227
227 __set_data_blkaddr(dn, NEW_ADDR);
228 dn->data_blkaddr = NEW_ADDR; 228 dn->data_blkaddr = NEW_ADDR;
229 __set_data_blkaddr(dn);
229 mark_inode_dirty(dn->inode); 230 mark_inode_dirty(dn->inode);
230 sync_inode_page(dn); 231 sync_inode_page(dn);
231 return 0; 232 return 0;
@@ -273,7 +274,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
273 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 274 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
274 size_t count; 275 size_t count;
275 276
276 clear_buffer_new(bh_result); 277 set_buffer_new(bh_result);
277 map_bh(bh_result, inode->i_sb, 278 map_bh(bh_result, inode->i_sb,
278 start_blkaddr + pgofs - start_fofs); 279 start_blkaddr + pgofs - start_fofs);
279 count = end_fofs - pgofs + 1; 280 count = end_fofs - pgofs + 1;
@@ -290,23 +291,24 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
290 return 0; 291 return 0;
291} 292}
292 293
293void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) 294void update_extent_cache(struct dnode_of_data *dn)
294{ 295{
295 struct f2fs_inode_info *fi = F2FS_I(dn->inode); 296 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
296 pgoff_t fofs, start_fofs, end_fofs; 297 pgoff_t fofs, start_fofs, end_fofs;
297 block_t start_blkaddr, end_blkaddr; 298 block_t start_blkaddr, end_blkaddr;
298 int need_update = true; 299 int need_update = true;
299 300
300 f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR); 301 f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
301 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
302 dn->ofs_in_node;
303 302
304 /* Update the page address in the parent node */ 303 /* Update the page address in the parent node */
305 __set_data_blkaddr(dn, blk_addr); 304 __set_data_blkaddr(dn);
306 305
307 if (is_inode_flag_set(fi, FI_NO_EXTENT)) 306 if (is_inode_flag_set(fi, FI_NO_EXTENT))
308 return; 307 return;
309 308
309 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
310 dn->ofs_in_node;
311
310 write_lock(&fi->ext.ext_lock); 312 write_lock(&fi->ext.ext_lock);
311 313
312 start_fofs = fi->ext.fofs; 314 start_fofs = fi->ext.fofs;
@@ -320,16 +322,16 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
320 322
321 /* Initial extent */ 323 /* Initial extent */
322 if (fi->ext.len == 0) { 324 if (fi->ext.len == 0) {
323 if (blk_addr != NULL_ADDR) { 325 if (dn->data_blkaddr != NULL_ADDR) {
324 fi->ext.fofs = fofs; 326 fi->ext.fofs = fofs;
325 fi->ext.blk_addr = blk_addr; 327 fi->ext.blk_addr = dn->data_blkaddr;
326 fi->ext.len = 1; 328 fi->ext.len = 1;
327 } 329 }
328 goto end_update; 330 goto end_update;
329 } 331 }
330 332
331 /* Front merge */ 333 /* Front merge */
332 if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { 334 if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
333 fi->ext.fofs--; 335 fi->ext.fofs--;
334 fi->ext.blk_addr--; 336 fi->ext.blk_addr--;
335 fi->ext.len++; 337 fi->ext.len++;
@@ -337,7 +339,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
337 } 339 }
338 340
339 /* Back merge */ 341 /* Back merge */
340 if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) { 342 if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
341 fi->ext.len++; 343 fi->ext.len++;
342 goto end_update; 344 goto end_update;
343 } 345 }
@@ -376,6 +378,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
376 struct dnode_of_data dn; 378 struct dnode_of_data dn;
377 struct page *page; 379 struct page *page;
378 int err; 380 int err;
381 struct f2fs_io_info fio = {
382 .type = DATA,
383 .rw = sync ? READ_SYNC : READA,
384 };
379 385
380 page = find_get_page(mapping, index); 386 page = find_get_page(mapping, index);
381 if (page && PageUptodate(page)) 387 if (page && PageUptodate(page))
@@ -404,8 +410,8 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
404 return page; 410 return page;
405 } 411 }
406 412
407 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr, 413 fio.blk_addr = dn.data_blkaddr;
408 sync ? READ_SYNC : READA); 414 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
409 if (err) 415 if (err)
410 return ERR_PTR(err); 416 return ERR_PTR(err);
411 417
@@ -430,7 +436,10 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
430 struct dnode_of_data dn; 436 struct dnode_of_data dn;
431 struct page *page; 437 struct page *page;
432 int err; 438 int err;
433 439 struct f2fs_io_info fio = {
440 .type = DATA,
441 .rw = READ_SYNC,
442 };
434repeat: 443repeat:
435 page = grab_cache_page(mapping, index); 444 page = grab_cache_page(mapping, index);
436 if (!page) 445 if (!page)
@@ -464,8 +473,8 @@ repeat:
464 return page; 473 return page;
465 } 474 }
466 475
467 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, 476 fio.blk_addr = dn.data_blkaddr;
468 dn.data_blkaddr, READ_SYNC); 477 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
469 if (err) 478 if (err)
470 return ERR_PTR(err); 479 return ERR_PTR(err);
471 480
@@ -515,8 +524,12 @@ repeat:
515 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 524 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
516 SetPageUptodate(page); 525 SetPageUptodate(page);
517 } else { 526 } else {
518 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, 527 struct f2fs_io_info fio = {
519 dn.data_blkaddr, READ_SYNC); 528 .type = DATA,
529 .rw = READ_SYNC,
530 .blk_addr = dn.data_blkaddr,
531 };
532 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
520 if (err) 533 if (err)
521 goto put_err; 534 goto put_err;
522 535
@@ -550,30 +563,25 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	struct f2fs_summary sum;
-	block_t new_blkaddr;
 	struct node_info ni;
+	int seg = CURSEG_WARM_DATA;
 	pgoff_t fofs;
-	int type;
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
 	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 		return -ENOSPC;
 
-	__set_data_blkaddr(dn, NEW_ADDR);
-	dn->data_blkaddr = NEW_ADDR;
-
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
-	type = CURSEG_WARM_DATA;
+	if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
+		seg = CURSEG_DIRECT_IO;
 
-	allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
+	allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
 
 	/* direct IO doesn't use extent cache to maximize the performance */
-	set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
-	update_extent_cache(new_blkaddr, dn);
-	clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+	__set_data_blkaddr(dn);
 
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -581,10 +589,59 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
 		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
 
-	dn->data_blkaddr = new_blkaddr;
 	return 0;
 }
 
+static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+							size_t count)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	u64 start = F2FS_BYTES_TO_BLK(offset);
+	u64 len = F2FS_BYTES_TO_BLK(count);
+	bool allocated;
+	u64 end_offset;
+
+	while (len) {
+		f2fs_balance_fs(sbi);
+		f2fs_lock_op(sbi);
+
+		/* When reading holes, we need its node page */
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+			goto out;
+
+		allocated = false;
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+		while (dn.ofs_in_node < end_offset && len) {
+			if (dn.data_blkaddr == NULL_ADDR) {
+				if (__allocate_data_block(&dn))
+					goto sync_out;
+				allocated = true;
+			}
+			len--;
+			start++;
+			dn.ofs_in_node++;
+		}
+
+		if (allocated)
+			sync_inode_page(&dn);
+
+		f2fs_put_dnode(&dn);
+		f2fs_unlock_op(sbi);
+	}
+	return;
+
+sync_out:
+	if (allocated)
+		sync_inode_page(&dn);
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_unlock_op(sbi);
+	return;
+}
+
 /*
  * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
  * If original data blocks are allocated, then give them to blockdev.
@@ -610,10 +667,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	if (check_extent_cache(inode, pgofs, bh_result))
 		goto out;
 
-	if (create) {
-		f2fs_balance_fs(F2FS_I_SB(inode));
+	if (create)
 		f2fs_lock_op(F2FS_I_SB(inode));
-	}
 
 	/* When reading holes, we need its node page */
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -627,12 +682,14 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else if (create) {
 		err = __allocate_data_block(&dn);
 		if (err)
 			goto put_out;
 		allocated = true;
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else {
 		goto put_out;
@@ -745,7 +802,6 @@ static int f2fs_read_data_pages(struct file *file,
 int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 {
 	struct inode *inode = page->mapping->host;
-	block_t old_blkaddr, new_blkaddr;
 	struct dnode_of_data dn;
 	int err = 0;
 
@@ -754,10 +810,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	if (err)
 		return err;
 
-	old_blkaddr = dn.data_blkaddr;
+	fio->blk_addr = dn.data_blkaddr;
 
 	/* This page is already truncated */
-	if (old_blkaddr == NULL_ADDR)
+	if (fio->blk_addr == NULL_ADDR)
 		goto out_writepage;
 
 	set_page_writeback(page);
@@ -766,14 +822,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
 	 */
-	if (unlikely(old_blkaddr != NEW_ADDR &&
+	if (unlikely(fio->blk_addr != NEW_ADDR &&
 			!is_cold_data(page) &&
 			need_inplace_update(inode))) {
-		rewrite_data_page(page, old_blkaddr, fio);
+		rewrite_data_page(page, fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
 	} else {
-		write_data_page(page, &dn, &new_blkaddr, fio);
-		update_extent_cache(new_blkaddr, &dn);
+		write_data_page(page, &dn, fio);
+		update_extent_cache(&dn);
 		set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
 	}
 out_writepage:
@@ -812,7 +868,12 @@ static int f2fs_write_data_page(struct page *page,
 
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 write:
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto redirty_out;
+	if (f2fs_is_drop_cache(inode))
+		goto out;
+	if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
+			available_free_memory(sbi, BASE_CHECK))
 		goto redirty_out;
 
 	/* Dentry blocks are controlled by checkpoint */
@@ -826,7 +887,6 @@ write:
 	/* we should bypass data pages to proceed the kworkder jobs */
 	if (unlikely(f2fs_cp_error(sbi))) {
 		SetPageError(page);
-		unlock_page(page);
 		goto out;
 	}
 
@@ -1002,8 +1062,12 @@ put_next:
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
-		err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
-						READ_SYNC);
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(sbi, page, &fio);
 		if (err)
 			goto fail;
 
@@ -1092,6 +1156,9 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
+	if (rw & WRITE)
+		__allocate_data_blocks(inode, offset, count);
+
 	err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
 	if (err < 0 && (rw & WRITE))
 		f2fs_write_failed(mapping, offset + count);
@@ -1101,24 +1168,33 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 	return err;
 }
 
-static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
-				      unsigned int length)
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+							unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-	if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
+		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
 		return;
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
-		invalidate_inmem_page(inode, page);
-
-	if (PageDirty(page))
-		inode_dec_dirty_pages(inode);
+	if (PageDirty(page)) {
+		if (inode->i_ino == F2FS_META_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_META);
+		else if (inode->i_ino == F2FS_NODE_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_NODES);
+		else
+			inode_dec_dirty_pages(inode);
+	}
 	ClearPagePrivate(page);
 }
 
-static int f2fs_release_data_page(struct page *page, gfp_t wait)
+int f2fs_release_page(struct page *page, gfp_t wait)
 {
+	/* If this is dirty page, keep PagePrivate */
+	if (PageDirty(page))
+		return 0;
+
 	ClearPagePrivate(page);
 	return 1;
 }
@@ -1132,7 +1208,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
 
 	SetPageUptodate(page);
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) {
+	if (f2fs_is_atomic_file(inode)) {
 		register_inmem_page(inode, page);
 		return 1;
 	}
@@ -1168,8 +1244,8 @@ const struct address_space_operations f2fs_dblock_aops = {
 	.write_begin	= f2fs_write_begin,
 	.write_end	= f2fs_write_end,
 	.set_page_dirty	= f2fs_set_data_page_dirty,
-	.invalidatepage	= f2fs_invalidate_data_page,
-	.releasepage	= f2fs_release_data_page,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 	.direct_IO	= f2fs_direct_IO,
 	.bmap		= f2fs_bmap,
 };
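Note on the data.c conversion above: every read and write path now packs its I/O parameters into one struct f2fs_io_info and hands that to the submission helpers, instead of passing blk_addr and rw flags as loose arguments. A minimal sketch of the resulting caller shape, assuming only the struct layout from the f2fs.h hunk further down (the wrapper function itself is illustrative, not part of the patch):

    static int read_one_data_block(struct f2fs_sb_info *sbi,
                                   struct page *page, block_t blkaddr)
    {
            struct f2fs_io_info fio = {
                    .type = DATA,           /* page type, used for bio merging */
                    .rw = READ_SYNC,        /* request flags */
                    .blk_addr = blkaddr,    /* address now rides inside fio */
            };

            /* one descriptor pointer instead of (page, blk_addr, rw) triples */
            return f2fs_submit_page_bio(sbi, page, &fio);
    }

Carrying the block address inside fio is also what lets do_write_data_page() above drop its old_blkaddr/new_blkaddr locals.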
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 91e8f699ab30..e671373cc8ab 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -40,6 +40,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->ndirty_dirs = sbi->n_dirty_dirs;
 	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
 	si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
+	si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
 	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
 	si->rsvd_segs = reserved_segments(sbi);
 	si->overp_segs = overprovision_segments(sbi);
@@ -57,7 +58,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->node_pages = NODE_MAPPING(sbi)->nrpages;
 	si->meta_pages = META_MAPPING(sbi)->nrpages;
 	si->nats = NM_I(sbi)->nat_cnt;
-	si->sits = SIT_I(sbi)->dirty_sentries;
+	si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+	si->sits = MAIN_SEGS(sbi);
+	si->dirty_sits = SIT_I(sbi)->dirty_sentries;
 	si->fnids = NM_I(sbi)->fcnt;
 	si->bg_gc = sbi->bg_gc;
 	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -79,6 +82,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 		si->segment_count[i] = sbi->segment_count[i];
 		si->block_count[i] = sbi->block_count[i];
 	}
+
+	si->inplace_count = atomic_read(&sbi->inplace_count);
 }
 
 /*
@@ -137,6 +142,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
 	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
 	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += SIT_VBLOCK_MAP_SIZE;
 	if (sbi->segs_per_sec > 1)
 		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -159,20 +165,32 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += sizeof(struct f2fs_nm_info);
 	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
 
+get_cache:
+	si->cache_mem = 0;
+
 	/* build gc */
-	si->base_mem += sizeof(struct f2fs_gc_kthread);
+	if (sbi->gc_thread)
+		si->cache_mem += sizeof(struct f2fs_gc_kthread);
+
+	/* build merge flush thread */
+	if (SM_I(sbi)->cmd_control_info)
+		si->cache_mem += sizeof(struct flush_cmd_control);
 
-get_cache:
 	/* free nids */
-	si->cache_mem = NM_I(sbi)->fcnt;
-	si->cache_mem += NM_I(sbi)->nat_cnt;
-	npages = NODE_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	npages = META_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+	si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+	si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+	si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+					sizeof(struct nat_entry_set);
+	si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
 	for (i = 0; i <= UPDATE_INO; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+
+	si->page_mem = 0;
+	npages = NODE_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
+	npages = META_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
 }
 
 static int stat_show(struct seq_file *s, void *v)
@@ -250,16 +268,16 @@ static int stat_show(struct seq_file *s, void *v)
 		seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
 			   si->hit_ext, si->total_ext);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
-		seq_printf(s, "  - inmem: %4d\n",
-			   si->inmem_pages);
+		seq_printf(s, "  - inmem: %4d, wb: %4d\n",
+			   si->inmem_pages, si->wb_pages);
 		seq_printf(s, "  - nodes: %4d in %4d\n",
 			   si->ndirty_node, si->node_pages);
 		seq_printf(s, "  - dents: %4d in dirs:%4d\n",
 			   si->ndirty_dent, si->ndirty_dirs);
 		seq_printf(s, "  - meta: %4d in %4d\n",
 			   si->ndirty_meta, si->meta_pages);
-		seq_printf(s, "  - NATs: %9d\n  - SITs: %9d\n",
-			   si->nats, si->sits);
+		seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
+			   si->dirty_nats, si->nats, si->dirty_sits, si->sits);
 		seq_printf(s, "  - free_nids: %9d\n",
 			   si->fnids);
 		seq_puts(s, "\nDistribution of User Blocks:");
@@ -277,6 +295,7 @@ static int stat_show(struct seq_file *s, void *v)
 		for (j = 0; j < si->util_free; j++)
 			seq_putc(s, '-');
 		seq_puts(s, "]\n\n");
+		seq_printf(s, "IPU: %u blocks\n", si->inplace_count);
 		seq_printf(s, "SSR: %u blocks in %u segments\n",
 			   si->block_count[SSR], si->segment_count[SSR]);
 		seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -289,9 +308,14 @@ static int stat_show(struct seq_file *s, void *v)
 
 		/* memory footprint */
 		update_mem_info(si->sbi);
-		seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
-			   (si->base_mem + si->cache_mem) >> 10,
-			   si->base_mem >> 10, si->cache_mem >> 10);
+		seq_printf(s, "\nMemory: %u KB\n",
+			(si->base_mem + si->cache_mem + si->page_mem) >> 10);
+		seq_printf(s, "  - static: %u KB\n",
+				si->base_mem >> 10);
+		seq_printf(s, "  - cached: %u KB\n",
+				si->cache_mem >> 10);
+		seq_printf(s, "  - paged : %u KB\n",
+				si->page_mem >> 10);
 	}
 	mutex_unlock(&f2fs_stat_mutex);
 	return 0;
@@ -331,6 +355,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
+	atomic_set(&sbi->inplace_count, 0);
 
 	mutex_lock(&f2fs_stat_mutex);
 	list_add_tail(&si->stat_list, &f2fs_stat_list);
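With the update_mem_info() and stat_show() changes above, the memory footprint in the f2fs status file splits into three buckets instead of one static/cached pair. Output should now look roughly like this (values invented purely for illustration):

    Memory: 1284 KB
      - static: 1024 KB
      - cached:   68 KB
      - paged :  192 KB

static counts fixed mount-time metadata, cached counts slab objects such as nat_entry and free_nid (now weighted by their actual sizes rather than raw counts), and paged counts node/meta page-cache pages, which previously inflated the cached figure.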
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b1a7d5737cd0..b74097a7f6d9 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -286,8 +286,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	f2fs_wait_on_page_writeback(page, type);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
-	if (!f2fs_has_inline_dentry(dir))
-		kunmap(page);
+	f2fs_dentry_kunmap(dir, page);
 	set_page_dirty(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(dir);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ec58bb2373fc..7fa3313ab0e2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -28,7 +28,7 @@
 	do {								\
 		if (unlikely(condition)) {				\
 			WARN_ON(1);					\
-			sbi->need_fsck = true;				\
+			set_sbi_flag(sbi, SBI_NEED_FSCK);		\
 		}							\
 	} while (0)
 #define f2fs_down_write(x, y)	down_write(x)
@@ -100,10 +100,15 @@ enum {
 
 enum {
 	CP_UMOUNT,
+	CP_FASTBOOT,
 	CP_SYNC,
 	CP_DISCARD,
 };
 
+#define DEF_BATCHED_TRIM_SECTIONS	32
+#define BATCHED_TRIM_SEGMENTS(sbi)	\
+		(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+
 struct cp_control {
 	int reason;
 	__u64 trim_start;
@@ -136,8 +141,14 @@ struct ino_entry {
 	nid_t ino;		/* inode number */
 };
 
-/* for the list of directory inodes */
-struct dir_inode_entry {
+/*
+ * for the list of directory inodes or gc inodes.
+ * NOTE: there are two slab users for this structure, if we add/modify/delete
+ * fields in structure for one of slab users, it may affect fields or size of
+ * other one, in this condition, it's better to split both of slab and related
+ * data structure.
+ */
+struct inode_entry {
 	struct list_head list;	/* list head */
 	struct inode *inode;	/* vfs inode pointer */
 };
@@ -196,11 +207,14 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
  */
 #define F2FS_IOC_GETFLAGS		FS_IOC_GETFLAGS
 #define F2FS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define F2FS_IOC_GETVERSION		FS_IOC_GETVERSION
 
 #define F2FS_IOCTL_MAGIC		0xf5
 #define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
 #define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
 #define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
+#define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
+#define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -295,7 +309,7 @@ struct f2fs_inode_info {
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
-	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
+	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
@@ -398,7 +412,8 @@ enum {
 	CURSEG_HOT_NODE,	/* direct node blocks of directory files */
 	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
 	CURSEG_COLD_NODE,	/* indirect node blocks */
-	NO_CHECK_TYPE
+	NO_CHECK_TYPE,
+	CURSEG_DIRECT_IO,	/* to use for the direct IO path */
 };
 
 struct flush_cmd {
@@ -437,6 +452,9 @@ struct f2fs_sm_info {
 	int nr_discards;			/* # of discards in the list */
 	int max_discards;			/* max. discards to be issued */
 
+	/* for batched trimming */
+	unsigned int trim_sections;		/* # of sections to trim */
+
 	struct list_head sit_entry_set;	/* sit entry set list */
 
 	unsigned int ipu_policy;	/* in-place-update policy */
@@ -489,6 +507,7 @@ enum page_type {
 struct f2fs_io_info {
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+	block_t blk_addr;	/* block address to be written */
 };
 
 #define is_read_io(rw)	(((rw) & 1) == READ)
@@ -508,13 +527,20 @@ struct inode_management {
 	unsigned long ino_num;			/* number of entries */
 };
 
+/* For s_flag in struct f2fs_sb_info */
+enum {
+	SBI_IS_DIRTY,			/* dirty flag for checkpoint */
+	SBI_IS_CLOSE,			/* specify unmounting */
+	SBI_NEED_FSCK,			/* need fsck.f2fs to fix */
+	SBI_POR_DOING,			/* recovery is doing or not */
+};
+
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
 	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
 	struct f2fs_super_block *raw_super;	/* raw super block pointer */
-	int s_dirty;				/* dirty flag for checkpoint */
-	bool need_fsck;				/* need fsck.f2fs to fix */
+	int s_flag;				/* flags for sbi */
 
 	/* for node-related operations */
 	struct f2fs_nm_info *nm_info;		/* node manager */
@@ -534,7 +560,6 @@ struct f2fs_sb_info {
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
 	struct rw_semaphore node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
-	bool por_doing;				/* recovery is doing or not */
 	wait_queue_head_t cp_wait;
 
 	struct inode_management im[MAX_INO_ENTRY];	/* manage inode cache */
@@ -589,6 +614,7 @@ struct f2fs_sb_info {
 	struct f2fs_stat_info *stat_info;	/* FS status information */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
+	atomic_t inplace_count;			/* # of inplace update */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
@@ -686,14 +712,19 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
 	return sbi->node_inode->i_mapping;
 }
 
-static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 1;
+	return sbi->s_flag & (0x01 << type);
 }
 
-static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 0;
+	sbi->s_flag |= (0x01 << type);
+}
+
+static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
+{
+	sbi->s_flag &= ~(0x01 << type);
 }
 
 static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -741,6 +772,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 	up_write(&sbi->cp_rwsem);
 }
 
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+	int reason = CP_SYNC;
+
+	if (test_opt(sbi, FASTBOOT))
+		reason = CP_FASTBOOT;
+	if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+		reason = CP_UMOUNT;
+	return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+	return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+	return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+}
+
 /*
  * Check whether the given nid is within node id range.
  */
@@ -805,7 +858,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 {
 	atomic_inc(&sbi->nr_pages[count_type]);
-	F2FS_SET_SB_DIRT(sbi);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
 static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -1113,6 +1166,7 @@ enum {
 	FI_NEED_IPU,		/* used for ipu per file */
 	FI_ATOMIC_FILE,		/* indicate atomic file */
 	FI_VOLATILE_FILE,	/* indicate volatile file */
+	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
 };
 
@@ -1220,6 +1274,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
 }
 
+static inline bool f2fs_is_drop_cache(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+}
+
 static inline void *inline_data_addr(struct page *page)
 {
 	struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1389,7 +1448,6 @@ void destroy_node_manager_caches(void);
  * segment.c
  */
 void register_inmem_page(struct inode *, struct page *);
-void invalidate_inmem_page(struct inode *, struct page *);
 void commit_inmem_pages(struct inode *, bool);
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
@@ -1401,16 +1459,16 @@ void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
 void release_discard_addrs(struct f2fs_sb_info *);
 void discard_next_dnode(struct f2fs_sb_info *, block_t);
-int npages_for_summary_flush(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *, bool);
 void allocate_new_segments(struct f2fs_sb_info *);
 int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
 void write_meta_page(struct f2fs_sb_info *, struct page *);
 void write_node_page(struct f2fs_sb_info *, struct page *,
-		struct f2fs_io_info *, unsigned int, block_t, block_t *);
-void write_data_page(struct page *, struct dnode_of_data *, block_t *,
+		unsigned int, struct f2fs_io_info *);
+void write_data_page(struct page *, struct dnode_of_data *,
 		struct f2fs_io_info *);
-void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
+void rewrite_data_page(struct page *, struct f2fs_io_info *);
 void recover_data_page(struct f2fs_sb_info *, struct page *,
 		struct f2fs_summary *, block_t, block_t);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
@@ -1457,17 +1515,20 @@ void destroy_checkpoint_caches(void);
  * data.c
  */
 void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
+int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
+						struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
 		struct f2fs_io_info *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(block_t, struct dnode_of_data *);
+void update_extent_cache(struct dnode_of_data *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct page *, struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
+int f2fs_release_page(struct page *, gfp_t);
 
 /*
  * gc.c
@@ -1477,8 +1538,6 @@ void stop_gc_thread(struct f2fs_sb_info *);
 block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
 int f2fs_gc(struct f2fs_sb_info *);
 void build_gc_manager(struct f2fs_sb_info *);
-int __init create_gc_caches(void);
-void destroy_gc_caches(void);
 
 /*
  * recovery.c
@@ -1497,9 +1556,9 @@ struct f2fs_stat_info {
 	int main_area_segs, main_area_sections, main_area_zones;
 	int hit_ext, total_ext;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
-	int nats, sits, fnids;
+	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
-	int bg_gc, inline_inode, inline_dir, inmem_pages;
+	int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
 	unsigned int valid_count, valid_node_count, valid_inode_count;
 	unsigned int bimodal, avg_vblocks;
 	int util_free, util_valid, util_invalid;
@@ -1514,7 +1573,8 @@ struct f2fs_stat_info {
 
 	unsigned int segment_count[2];
 	unsigned int block_count[2];
-	unsigned base_mem, cache_mem;
+	unsigned int inplace_count;
+	unsigned base_mem, cache_mem, page_mem;
 };
 
 static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1553,7 +1613,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 		((sbi)->segment_count[(curseg)->alloc_type]++)
 #define stat_inc_block_count(sbi, curseg)				\
 		((sbi)->block_count[(curseg)->alloc_type]++)
-
+#define stat_inc_inplace_blocks(sbi)					\
+		(atomic_inc(&(sbi)->inplace_count))
 #define stat_inc_seg_count(sbi, type)					\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
@@ -1599,6 +1660,7 @@ void f2fs_destroy_root_stats(void);
 #define stat_dec_inline_dir(inode)
 #define stat_inc_seg_type(sbi, curseg)
 #define stat_inc_block_count(sbi, curseg)
+#define stat_inc_inplace_blocks(sbi)
 #define stat_inc_seg_count(si, type)
 #define stat_inc_tot_blk_count(si, blks)
 #define stat_inc_data_blk_count(si, blks)
@@ -1619,6 +1681,7 @@ extern const struct address_space_operations f2fs_meta_aops;
 extern const struct inode_operations f2fs_dir_inode_operations;
 extern const struct inode_operations f2fs_symlink_inode_operations;
 extern const struct inode_operations f2fs_special_inode_operations;
+extern struct kmem_cache *inode_entry_slab;
 
 /*
  * inline.c
@@ -1629,7 +1692,6 @@ int f2fs_read_inline_data(struct inode *, struct page *);
 int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
 int f2fs_convert_inline_inode(struct inode *);
 int f2fs_write_inline_data(struct inode *, struct page *);
-void truncate_inline_data(struct page *, u64);
 bool recover_inline_data(struct inode *, struct page *);
 struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
 							struct page **);
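The f2fs.h hunks above replace the s_dirty and need_fsck booleans with a single s_flag bit field driven by the SBI_* enum. A standalone model of the three helpers, compilable outside the kernel (the sb_model struct is a stand-in for struct f2fs_sb_info; everything else mirrors the patch):

    #include <assert.h>

    enum { SBI_IS_DIRTY, SBI_IS_CLOSE, SBI_NEED_FSCK, SBI_POR_DOING };

    struct sb_model { int s_flag; };

    static int is_sbi_flag_set(struct sb_model *sbi, unsigned int type)
    {
            return sbi->s_flag & (0x01 << type);
    }

    static void set_sbi_flag(struct sb_model *sbi, unsigned int type)
    {
            sbi->s_flag |= (0x01 << type);
    }

    static void clear_sbi_flag(struct sb_model *sbi, unsigned int type)
    {
            sbi->s_flag &= ~(0x01 << type);
    }

    int main(void)
    {
            struct sb_model sbi = { 0 };

            set_sbi_flag(&sbi, SBI_NEED_FSCK);      /* replaces need_fsck = true */
            assert(is_sbi_flag_set(&sbi, SBI_NEED_FSCK));
            clear_sbi_flag(&sbi, SBI_NEED_FSCK);
            assert(!is_sbi_flag_set(&sbi, SBI_NEED_FSCK));
            return 0;
    }

Packing the states into one word also makes room for the new SBI_POR_DOING and SBI_IS_CLOSE bits without growing struct f2fs_sb_info per flag.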
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c27e0ecb3bc..98dac27bc3f7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -26,6 +26,7 @@
 #include "segment.h"
 #include "xattr.h"
 #include "acl.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
@@ -92,7 +93,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int get_parent_ino(struct inode *inode, nid_t *pino)
@@ -246,6 +246,10 @@ go_write:
 sync_nodes:
 	sync_node_pages(sbi, ino, &wbc);
 
+	/* if cp_error was enabled, we should avoid infinite loop */
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto out;
+
 	if (need_inode_block_update(sbi, ino)) {
 		mark_inode_dirty_sync(inode);
 		f2fs_write_inode(inode, NULL);
@@ -265,6 +269,7 @@ flush_out:
 		ret = f2fs_issue_flush(sbi);
 out:
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+	f2fs_trace_ios(NULL, NULL, 1);
 	return ret;
 }
 
@@ -351,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 		/* find data/hole in dnode block */
 		for (; dn.ofs_in_node < end_offset;
 				dn.ofs_in_node++, pgofs++,
-				data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+				data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
 			block_t blkaddr;
 			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 
@@ -427,7 +432,8 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 		if (blkaddr == NULL_ADDR)
 			continue;
 
-		update_extent_cache(NULL_ADDR, dn);
+		dn->data_blkaddr = NULL_ADDR;
+		update_extent_cache(dn);
 		invalidate_blocks(sbi, blkaddr);
 		nr_free++;
 	}
@@ -484,8 +490,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 
 	trace_f2fs_truncate_blocks_enter(inode, from);
 
-	free_from = (pgoff_t)
-		((from + blocksize - 1) >> (sbi->log_blocksize));
+	free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
 
 	if (lock)
 		f2fs_lock_op(sbi);
@@ -836,6 +841,19 @@ static long f2fs_fallocate(struct file *file, int mode,
 	return ret;
 }
 
+static int f2fs_release_file(struct inode *inode, struct file *filp)
+{
+	/* some remained atomic pages should discarded */
+	if (f2fs_is_atomic_file(inode))
+		commit_inmem_pages(inode, true);
+	if (f2fs_is_volatile_file(inode)) {
+		set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+		filemap_fdatawrite(inode->i_mapping);
+		clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+	}
+	return 0;
+}
+
 #define F2FS_REG_FLMASK		(~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
 #define F2FS_OTHER_FLMASK	(FS_NODUMP_FL | FS_NOATIME_FL)
 
@@ -906,29 +924,30 @@ out:
 	return ret;
 }
 
+static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+
+	return put_user(inode->i_generation, (int __user *)arg);
+}
+
 static int f2fs_ioc_start_atomic_write(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	f2fs_balance_fs(sbi);
+	f2fs_balance_fs(F2FS_I_SB(inode));
+
+	if (f2fs_is_atomic_file(inode))
+		return 0;
 
 	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 
 	return f2fs_convert_inline_inode(inode);
 }
 
-static int f2fs_release_file(struct inode *inode, struct file *filp)
-{
-	/* some remained atomic pages should discarded */
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
-		commit_inmem_pages(inode, true);
-	return 0;
-}
-
 static int f2fs_ioc_commit_atomic_write(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
@@ -949,6 +968,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 
 	ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
 	mnt_drop_write_file(filp);
+	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 	return ret;
 }
 
@@ -959,11 +979,56 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
+	if (f2fs_is_volatile_file(inode))
+		return 0;
+
 	set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
 
 	return f2fs_convert_inline_inode(inode);
 }
 
+static int f2fs_ioc_release_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (!f2fs_is_volatile_file(inode))
+		return 0;
+
+	punch_hole(inode, 0, F2FS_BLKSIZE);
+	return 0;
+}
+
+static int f2fs_ioc_abort_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int ret;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	f2fs_balance_fs(F2FS_I_SB(inode));
+
+	if (f2fs_is_atomic_file(inode)) {
+		commit_inmem_pages(inode, false);
+		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	}
+
+	if (f2fs_is_volatile_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+		filemap_fdatawrite(inode->i_mapping);
+		set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	}
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
 static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -1001,12 +1066,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_getflags(filp, arg);
 	case F2FS_IOC_SETFLAGS:
 		return f2fs_ioc_setflags(filp, arg);
+	case F2FS_IOC_GETVERSION:
+		return f2fs_ioc_getversion(filp, arg);
 	case F2FS_IOC_START_ATOMIC_WRITE:
 		return f2fs_ioc_start_atomic_write(filp);
 	case F2FS_IOC_COMMIT_ATOMIC_WRITE:
 		return f2fs_ioc_commit_atomic_write(filp);
 	case F2FS_IOC_START_VOLATILE_WRITE:
 		return f2fs_ioc_start_volatile_write(filp);
+	case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+		return f2fs_ioc_release_volatile_write(filp);
+	case F2FS_IOC_ABORT_VOLATILE_WRITE:
+		return f2fs_ioc_abort_volatile_write(filp);
 	case FITRIM:
 		return f2fs_ioc_fitrim(filp, arg);
 	default:
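The file.c hunks above wire up three new ioctls. A hypothetical userspace sequence for a volatile write region (the ioctl numbers are copied from the f2fs.h hunk earlier in this diff; the function, its semantics comments, and the error handling are illustrative, not part of the patch):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #define F2FS_IOCTL_MAGIC                0xf5
    #define F2FS_IOC_START_VOLATILE_WRITE   _IO(F2FS_IOCTL_MAGIC, 3)
    #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
    #define F2FS_IOC_ABORT_VOLATILE_WRITE   _IO(F2FS_IOCTL_MAGIC, 5)

    int volatile_write(const char *path, const void *buf, size_t len)
    {
            int fd = open(path, O_RDWR);

            if (fd < 0)
                    return -1;
            /* writes after this point may stay in page cache only */
            ioctl(fd, F2FS_IOC_START_VOLATILE_WRITE);
            write(fd, buf, len);
            /* discard the volatile region; ABORT instead flushes it out */
            ioctl(fd, F2FS_IOC_RELEASE_VOLATILE_WRITE);
            return close(fd);
    }

Per the kernel side above, RELEASE punches a hole over the volatile data, while ABORT commits any in-memory atomic pages and forces writeback of the volatile pages.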
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index eec0933a4819..76adbc3641f1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -24,8 +24,6 @@
 #include "gc.h"
 #include <trace/events/f2fs.h>
 
-static struct kmem_cache *winode_slab;
-
 static int gc_thread_func(void *data)
 {
 	struct f2fs_sb_info *sbi = data;
@@ -46,7 +44,7 @@ static int gc_thread_func(void *data)
 			break;
 
 		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
-			wait_ms = increase_sleep_time(gc_th, wait_ms);
+			increase_sleep_time(gc_th, &wait_ms);
 			continue;
 		}
 
@@ -67,15 +65,15 @@ static int gc_thread_func(void *data)
 			continue;
 
 		if (!is_idle(sbi)) {
-			wait_ms = increase_sleep_time(gc_th, wait_ms);
+			increase_sleep_time(gc_th, &wait_ms);
 			mutex_unlock(&sbi->gc_mutex);
 			continue;
 		}
 
 		if (has_enough_invalid_blocks(sbi))
-			wait_ms = decrease_sleep_time(gc_th, wait_ms);
+			decrease_sleep_time(gc_th, &wait_ms);
 		else
-			wait_ms = increase_sleep_time(gc_th, wait_ms);
+			increase_sleep_time(gc_th, &wait_ms);
 
 		stat_inc_bggc_count(sbi);
 
@@ -356,13 +354,10 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
 		iput(inode);
 		return;
 	}
-	new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
+	new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	new_ie->inode = inode;
-retry:
-	if (radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie)) {
-		cond_resched();
-		goto retry;
-	}
+
+	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
 	list_add_tail(&new_ie->list, &gc_list->ilist);
 }
 
@@ -373,7 +368,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list)
 		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
 		iput(ie->inode);
 		list_del(&ie->list);
-		kmem_cache_free(winode_slab, ie);
+		kmem_cache_free(inode_entry_slab, ie);
 	}
 }
 
@@ -703,8 +698,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 		.iroot = RADIX_TREE_INIT(GFP_NOFS),
 	};
 
-	cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
-
+	cpc.reason = __get_cp_reason(sbi);
 gc_more:
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
@@ -750,17 +744,3 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
 {
 	DIRTY_I(sbi)->v_ops = &default_v_ops;
 }
-
-int __init create_gc_caches(void)
-{
-	winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
-				sizeof(struct inode_entry));
-	if (!winode_slab)
-		return -ENOMEM;
-	return 0;
-}
-
-void destroy_gc_caches(void)
-{
-	kmem_cache_destroy(winode_slab);
-}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 6ff7ad38463e..b4a65be9f7d3 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -35,11 +35,6 @@ struct f2fs_gc_kthread {
 	unsigned int gc_idle;
 };
 
-struct inode_entry {
-	struct list_head list;
-	struct inode *inode;
-};
-
 struct gc_inode_list {
 	struct list_head ilist;
 	struct radix_tree_root iroot;
@@ -69,26 +64,26 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
 	return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
 }
 
-static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
+							long *wait)
 {
-	if (wait == gc_th->no_gc_sleep_time)
-		return wait;
+	if (*wait == gc_th->no_gc_sleep_time)
+		return;
 
-	wait += gc_th->min_sleep_time;
-	if (wait > gc_th->max_sleep_time)
-		wait = gc_th->max_sleep_time;
-	return wait;
+	*wait += gc_th->min_sleep_time;
+	if (*wait > gc_th->max_sleep_time)
+		*wait = gc_th->max_sleep_time;
 }
 
-static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
+							long *wait)
 {
-	if (wait == gc_th->no_gc_sleep_time)
-		wait = gc_th->max_sleep_time;
+	if (*wait == gc_th->no_gc_sleep_time)
+		*wait = gc_th->max_sleep_time;
 
-	wait -= gc_th->min_sleep_time;
-	if (wait <= gc_th->min_sleep_time)
-		wait = gc_th->min_sleep_time;
-	return wait;
+	*wait -= gc_th->min_sleep_time;
+	if (*wait <= gc_th->min_sleep_time)
+		*wait = gc_th->min_sleep_time;
 }
 
 static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
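The gc.h refactor above switches the sleep-time helpers from returning the new wait value to updating it through a pointer, so a call site can no longer silently drop the result (the old wait_ms = increase_sleep_time(...) pattern in gc.c). A compilable toy model of the refactored shape, using made-up timing constants that mirror the millisecond defaults:

    #include <stdio.h>

    struct gc_times { long min, max, no_gc; };  /* stand-in for f2fs_gc_kthread */

    static void increase_sleep_time(const struct gc_times *t, long *wait)
    {
            if (*wait == t->no_gc)
                    return;                     /* thread is parked, leave it */
            *wait += t->min;
            if (*wait > t->max)
                    *wait = t->max;
    }

    int main(void)
    {
            struct gc_times t = { .min = 30000, .max = 60000, .no_gc = 300000 };
            long wait_ms = 30000;

            increase_sleep_time(&t, &wait_ms);  /* updates in place */
            printf("wait_ms = %ld\n", wait_ms); /* prints 60000 */
            return 0;
    }

Making the helpers void also lets the no_gc_sleep_time case bail out early without inventing a sentinel return value.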
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index f2d3c581e776..1484c00133cd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -50,6 +50,12 @@ void read_inline_data(struct page *page, struct page *ipage)
 	SetPageUptodate(page);
 }
 
+static void truncate_inline_data(struct page *ipage)
+{
+	f2fs_wait_on_page_writeback(ipage, NODE);
+	memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
+}
+
 int f2fs_read_inline_data(struct inode *inode, struct page *page)
 {
 	struct page *ipage;
@@ -79,7 +85,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
 {
 	void *src_addr, *dst_addr;
-	block_t new_blk_addr;
 	struct f2fs_io_info fio = {
 		.type = DATA,
 		.rw = WRITE_SYNC | REQ_PRIO,
@@ -115,9 +120,9 @@ no_update:
 
 	/* write data page to try to make data consistent */
 	set_page_writeback(page);
-
-	write_data_page(page, dn, &new_blk_addr, &fio);
-	update_extent_cache(new_blk_addr, dn);
+	fio.blk_addr = dn->data_blkaddr;
+	write_data_page(page, dn, &fio);
+	update_extent_cache(dn);
 	f2fs_wait_on_page_writeback(page, DATA);
 	if (dirty)
 		inode_dec_dirty_pages(dn->inode);
@@ -126,7 +131,7 @@ no_update:
 	set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
 
 	/* clear inline data and flag after data writeback */
-	truncate_inline_data(dn->inode_page, 0);
+	truncate_inline_data(dn->inode_page);
 clear_out:
 	stat_dec_inline_inode(dn->inode);
 	f2fs_clear_inline_inode(dn->inode);
@@ -199,19 +204,6 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
199 return 0; 204 return 0;
200} 205}
201 206
202void truncate_inline_data(struct page *ipage, u64 from)
203{
204 void *addr;
205
206 if (from >= MAX_INLINE_DATA)
207 return;
208
209 f2fs_wait_on_page_writeback(ipage, NODE);
210
211 addr = inline_data_addr(ipage);
212 memset(addr + from, 0, MAX_INLINE_DATA - from);
213}
214
215bool recover_inline_data(struct inode *inode, struct page *npage) 207bool recover_inline_data(struct inode *inode, struct page *npage)
216{ 208{
217 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 209 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -253,7 +245,7 @@ process_inline:
 	if (f2fs_has_inline_data(inode)) {
 		ipage = get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(sbi, IS_ERR(ipage));
-		truncate_inline_data(ipage, 0);
+		truncate_inline_data(ipage);
 		f2fs_clear_inline_inode(inode);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
@@ -371,7 +363,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 	set_page_dirty(page);
 
 	/* clear inline dir and flag after data writeback */
-	truncate_inline_data(ipage, 0);
+	truncate_inline_data(ipage);
 
 	stat_dec_inline_dir(dir);
 	clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
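Every surviving caller passed from == 0, so the u64 argument was dead weight; the helper becomes static to inline.c and always clears the whole inline area. An equivalence sketch (the size constant is illustrative, not f2fs's real MAX_INLINE_DATA):

	#include <string.h>

	#define INLINE_AREA_SKETCH 3488	/* illustrative size */

	/* old shape: zero from an offset to the end of the inline area */
	static void truncate_from(char *addr, unsigned long from)
	{
		if (from >= INLINE_AREA_SKETCH)
			return;
		memset(addr + from, 0, INLINE_AREA_SKETCH - from);
	}

	/* new shape: the only call pattern left, from == 0 */
	static void truncate_all(char *addr)
	{
		truncate_from(addr, 0);
	}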
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 196cc7843aaf..2d002e3738a7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -67,29 +67,23 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
 	}
 }
 
-static int __recover_inline_status(struct inode *inode, struct page *ipage)
+static void __recover_inline_status(struct inode *inode, struct page *ipage)
 {
 	void *inline_data = inline_data_addr(ipage);
-	struct f2fs_inode *ri;
-	void *zbuf;
-
-	zbuf = kzalloc(MAX_INLINE_DATA, GFP_NOFS);
-	if (!zbuf)
-		return -ENOMEM;
-
-	if (!memcmp(zbuf, inline_data, MAX_INLINE_DATA)) {
-		kfree(zbuf);
-		return 0;
-	}
-	kfree(zbuf);
-
-	f2fs_wait_on_page_writeback(ipage, NODE);
-	set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
-
-	ri = F2FS_INODE(ipage);
-	set_raw_inline(F2FS_I(inode), ri);
-	set_page_dirty(ipage);
-	return 0;
+	__le32 *start = inline_data;
+	__le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
+
+	while (start < end) {
+		if (*start++) {
+			f2fs_wait_on_page_writeback(ipage, NODE);
+
+			set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+			set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
+			set_page_dirty(ipage);
+			return;
+		}
+	}
+	return;
 }
 
 static int do_read_inode(struct inode *inode)
@@ -98,7 +92,6 @@ static int do_read_inode(struct inode *inode)
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct page *node_page;
 	struct f2fs_inode *ri;
-	int err = 0;
 
 	/* Check if ino is within scope */
 	if (check_nid_range(sbi, inode->i_ino)) {
@@ -142,7 +135,7 @@ static int do_read_inode(struct inode *inode)
 
 	/* check data exist */
 	if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
-		err = __recover_inline_status(inode, node_page);
+		__recover_inline_status(inode, node_page);
 
 	/* get rdev by using inline_info */
 	__get_inode_rdev(inode, ri);
@@ -152,7 +145,7 @@ static int do_read_inode(struct inode *inode)
 	stat_inc_inline_inode(inode);
 	stat_inc_inline_dir(inode);
 
-	return err;
+	return 0;
 }
 
 struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
@@ -304,7 +297,7 @@ void f2fs_evict_inode(struct inode *inode)
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
 
 	/* some remained atomic pages should discarded */
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+	if (f2fs_is_atomic_file(inode))
 		commit_inmem_pages(inode, true);
 
 	trace_f2fs_evict_inode(inode);
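The rewritten __recover_inline_status() drops the kzalloc()+memcmp() round trip and scans the inline area one 32-bit word at a time, which also removes the function's only failure path. The same test, restated as a standalone sketch in plain C:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	/* Detect whether a buffer contains any nonzero 32-bit word --
	 * the check the patch performs on the inline data area without
	 * allocating a zeroed comparison buffer. */
	static bool buffer_has_data(const void *buf, size_t len)
	{
		const uint32_t *start = buf;
		const uint32_t *end = start + len / sizeof(uint32_t);

		while (start < end)
			if (*start++)
				return true;
		return false;
	}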
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 547a2deeb1ac..e79639a9787a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -299,7 +299,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode->i_op = &f2fs_dir_inode_operations;
 	inode->i_fop = &f2fs_dir_operations;
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
-	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
 
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	f2fs_lock_op(sbi);
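The new mask presumably extends the zeroing mask with __GFP_HIGHMEM so that directory page-cache pages may be allocated from highmem. The definitions below are restated from fs/f2fs/f2fs.h as an assumption; they are not part of this hunk:

	/* Assumed definitions (not shown in this diff):
	 *   #define GFP_F2FS_ZERO      (GFP_NOFS | __GFP_ZERO)
	 *   #define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM)
	 * Directory pages are only touched through kmapped addresses,
	 * so allowing highmem for them is safe. */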
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f83326ca32ef..97bd9d3db882 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 #define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
@@ -57,12 +58,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 	} else if (type == INO_ENTRIES) {
 		int i;
 
-		if (sbi->sb->s_bdi->dirty_exceeded)
-			return false;
 		for (i = 0; i <= UPDATE_INO; i++)
 			mem_size += (sbi->im[i].ino_num *
 				sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else {
+		if (sbi->sb->s_bdi->dirty_exceeded)
+			return false;
 	}
 	return res;
 }
@@ -268,7 +270,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 	e = __lookup_nat_cache(nm_i, ni->nid);
 	if (!e) {
 		e = grab_nat_entry(nm_i, ni->nid);
-		e->ni = *ni;
+		copy_node_info(&e->ni, ni);
 		f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
 	} else if (new_blkaddr == NEW_ADDR) {
 		/*
@@ -276,7 +278,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 		 * previous nat entry can be remained in nat cache.
 		 * So, reinitialize it with new information.
 		 */
-		e->ni = *ni;
+		copy_node_info(&e->ni, ni);
 		f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
 	}
 
@@ -346,7 +348,6 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 	struct nat_entry *e;
 	int i;
 
-	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
 	ni->nid = nid;
 
 	/* Check nat cache */
@@ -361,6 +362,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 	if (e)
 		return;
 
+	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+
 	/* Check current segment summary */
 	mutex_lock(&curseg->curseg_mutex);
 	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -471,7 +474,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct page *npage[4];
-	struct page *parent;
+	struct page *parent = NULL;
 	int offset[4];
 	unsigned int noffset[4];
 	nid_t nids[4];
@@ -488,6 +491,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 		if (IS_ERR(npage[0]))
 			return PTR_ERR(npage[0]);
 	}
+
+	/* if inline_data is set, should not report any block indices */
+	if (f2fs_has_inline_data(dn->inode) && index) {
+		err = -EINVAL;
+		f2fs_put_page(npage[0], 1);
+		goto release_out;
+	}
+
 	parent = npage[0];
 	if (level != 0)
 		nids[1] = get_nid(parent, offset[0], true);
@@ -585,7 +596,7 @@ static void truncate_node(struct dnode_of_data *dn)
 	}
invalidate:
 	clear_node_page_dirty(dn->node_page);
-	F2FS_SET_SB_DIRT(sbi);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
 
 	f2fs_put_page(dn->node_page, 1);
 
@@ -976,6 +987,10 @@ static int read_node_page(struct page *page, int rw)
 {
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	struct node_info ni;
+	struct f2fs_io_info fio = {
+		.type = NODE,
+		.rw = rw,
+	};
 
 	get_node_info(sbi, page->index, &ni);
 
@@ -987,7 +1002,8 @@ static int read_node_page(struct page *page, int rw)
 	if (PageUptodate(page))
 		return LOCKED_PAGE;
 
-	return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
+	fio.blk_addr = ni.blk_addr;
+	return f2fs_submit_page_bio(sbi, page, &fio);
 }
 
 /*
@@ -1028,11 +1044,11 @@ repeat:
 	err = read_node_page(page, READ_SYNC);
 	if (err < 0)
 		return ERR_PTR(err);
-	else if (err == LOCKED_PAGE)
-		goto got_it;
+	else if (err != LOCKED_PAGE)
+		lock_page(page);
 
-	lock_page(page);
 	if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
+		ClearPageUptodate(page);
 		f2fs_put_page(page, 1);
 		return ERR_PTR(-EIO);
 	}
@@ -1040,7 +1056,6 @@ repeat:
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
-got_it:
 	return page;
 }
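The got_it label disappears: locking happens only when the read path did not already return the page locked, and both paths fall through one validation block. A compilable control-flow sketch (the stub stands in for lock_page(); not kernel code):

	#include <stdio.h>

	enum { LOCKED_PAGE = 1 };	/* as in fs/f2fs/node.h */

	static void lock_page_stub(void *page) { (void)page; }

	/* Lock only when the read did not return the page locked, then
	 * fall into a single validation path; validation failures now
	 * also clear the uptodate state before dropping the page. */
	static int fetch_and_validate(int err, void *page)
	{
		if (err < 0)
			return err;
		if (err != LOCKED_PAGE)
			lock_page_stub(page);
		/* ... validate page here ... */
		return 0;
	}

	int main(void)
	{
		printf("%d\n", fetch_and_validate(LOCKED_PAGE, NULL));
		return 0;
	}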
 
@@ -1268,7 +1283,6 @@ static int f2fs_write_node_page(struct page *page,
 {
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	nid_t nid;
-	block_t new_addr;
 	struct node_info ni;
 	struct f2fs_io_info fio = {
 		.type = NODE,
@@ -1277,7 +1291,7 @@ static int f2fs_write_node_page(struct page *page,
 
 	trace_f2fs_writepage(page, NODE);
 
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto redirty_out;
@@ -1303,9 +1317,11 @@ static int f2fs_write_node_page(struct page *page,
 	} else {
 		down_read(&sbi->node_write);
 	}
+
 	set_page_writeback(page);
-	write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
-	set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
+	fio.blk_addr = ni.blk_addr;
+	write_node_page(sbi, page, nid, &fio);
+	set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
 	up_read(&sbi->node_write);
 	unlock_page(page);
@@ -1355,26 +1371,12 @@ static int f2fs_set_node_page_dirty(struct page *page)
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
 		SetPagePrivate(page);
+		f2fs_trace_pid(page);
 		return 1;
 	}
 	return 0;
 }
 
-static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
-					unsigned int length)
-{
-	struct inode *inode = page->mapping->host;
-	if (PageDirty(page))
-		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
-	ClearPagePrivate(page);
-}
-
-static int f2fs_release_node_page(struct page *page, gfp_t wait)
-{
-	ClearPagePrivate(page);
-	return 1;
-}
-
 /*
  * Structure of the f2fs node operations
  */
@@ -1382,8 +1384,8 @@ const struct address_space_operations f2fs_node_aops = {
 	.writepage	= f2fs_write_node_page,
 	.writepages	= f2fs_write_node_pages,
 	.set_page_dirty	= f2fs_set_node_page_dirty,
-	.invalidatepage	= f2fs_invalidate_node_page,
-	.releasepage	= f2fs_release_node_page,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 };
 
 static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1726,80 +1728,41 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
 	return 0;
 }
 
-/*
- * ra_sum_pages() merge contiguous pages into one bio and submit.
- * these pre-read pages are allocated in bd_inode's mapping tree.
- */
-static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
-				int start, int nrpages)
-{
-	struct inode *inode = sbi->sb->s_bdev->bd_inode;
-	struct address_space *mapping = inode->i_mapping;
-	int i, page_idx = start;
-	struct f2fs_io_info fio = {
-		.type = META,
-		.rw = READ_SYNC | REQ_META | REQ_PRIO
-	};
-
-	for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
-		/* alloc page in bd_inode for reading node summary info */
-		pages[i] = grab_cache_page(mapping, page_idx);
-		if (!pages[i])
-			break;
-		f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
-	}
-
-	f2fs_submit_merged_bio(sbi, META, READ);
-	return i;
-}
-
 int restore_node_summary(struct f2fs_sb_info *sbi,
 			unsigned int segno, struct f2fs_summary_block *sum)
 {
 	struct f2fs_node *rn;
 	struct f2fs_summary *sum_entry;
-	struct inode *inode = sbi->sb->s_bdev->bd_inode;
 	block_t addr;
 	int bio_blocks = MAX_BIO_BLOCKS(sbi);
-	struct page *pages[bio_blocks];
-	int i, idx, last_offset, nrpages, err = 0;
+	int i, idx, last_offset, nrpages;
 
 	/* scan the node segment */
 	last_offset = sbi->blocks_per_seg;
 	addr = START_BLOCK(sbi, segno);
 	sum_entry = &sum->entries[0];
 
-	for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
+	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
 		nrpages = min(last_offset - i, bio_blocks);
 
 		/* readahead node pages */
-		nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
-		if (!nrpages)
-			return -ENOMEM;
+		ra_meta_pages(sbi, addr, nrpages, META_POR);
 
-		for (idx = 0; idx < nrpages; idx++) {
-			if (err)
-				goto skip;
+		for (idx = addr; idx < addr + nrpages; idx++) {
+			struct page *page = get_meta_page(sbi, idx);
 
-			lock_page(pages[idx]);
-			if (unlikely(!PageUptodate(pages[idx]))) {
-				err = -EIO;
-			} else {
-				rn = F2FS_NODE(pages[idx]);
-				sum_entry->nid = rn->footer.nid;
-				sum_entry->version = 0;
-				sum_entry->ofs_in_node = 0;
-				sum_entry++;
-			}
-			unlock_page(pages[idx]);
-skip:
-			page_cache_release(pages[idx]);
+			rn = F2FS_NODE(page);
+			sum_entry->nid = rn->footer.nid;
+			sum_entry->version = 0;
+			sum_entry->ofs_in_node = 0;
+			sum_entry++;
+			f2fs_put_page(page, 1);
 		}
 
-		invalidate_mapping_pages(inode->i_mapping, addr,
+		invalidate_mapping_pages(META_MAPPING(sbi), addr,
 						addr + nrpages);
 	}
-	return err;
+	return 0;
 }
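The open-coded ra_sum_pages() gives way to the generic meta readahead: issue one merged readahead for the whole batch, then fetch each page from the meta mapping. A runnable model of that pattern, with hypothetical stand-ins prefetch_range()/read_one() for ra_meta_pages()/get_meta_page():

	#include <stdio.h>

	static void prefetch_range(unsigned long start, int n)
	{
		printf("readahead [%lu, %lu)\n", start, start + n);
	}

	static void read_one(unsigned long idx)
	{
		(void)idx;	/* would copy one summary entry here */
	}

	/* Batch loop: one merged readahead per batch, then per-page reads
	 * that hit the already-populated cache. */
	static void scan_segment(unsigned long first, int total, int batch)
	{
		for (int i = 0; i < total; i += batch) {
			int n = (total - i < batch) ? total - i : batch;

			prefetch_range(first + i, n);
			for (int idx = 0; idx < n; idx++)
				read_one(first + i + idx);
		}
	}

	int main(void)
	{
		scan_segment(4096, 512, 64);	/* illustrative geometry */
		return 0;
	}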
 
 static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
@@ -1923,7 +1886,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 	struct f2fs_summary_block *sum = curseg->sum_blk;
-	struct nat_entry_set *setvec[NATVEC_SIZE];
+	struct nat_entry_set *setvec[SETVEC_SIZE];
 	struct nat_entry_set *set, *tmp;
 	unsigned int found;
 	nid_t set_idx = 0;
@@ -1940,7 +1903,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 	remove_nats_in_journal(sbi);
 
 	while ((found = __gang_lookup_nat_set(nm_i,
-			set_idx, NATVEC_SIZE, setvec))) {
+			set_idx, SETVEC_SIZE, setvec))) {
 		unsigned idx;
 		set_idx = setvec[found - 1]->set + 1;
 		for (idx = 0; idx < found; idx++)
@@ -2020,6 +1983,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct free_nid *i, *next_i;
 	struct nat_entry *natvec[NATVEC_SIZE];
+	struct nat_entry_set *setvec[SETVEC_SIZE];
 	nid_t nid = 0;
 	unsigned int found;
 
@@ -2044,11 +2008,27 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	while ((found = __gang_lookup_nat_cache(nm_i,
 			nid, NATVEC_SIZE, natvec))) {
 		unsigned idx;
+
 		nid = nat_get_nid(natvec[found - 1]) + 1;
 		for (idx = 0; idx < found; idx++)
 			__del_from_nat_cache(nm_i, natvec[idx]);
 	}
 	f2fs_bug_on(sbi, nm_i->nat_cnt);
+
+	/* destroy nat set cache */
+	nid = 0;
+	while ((found = __gang_lookup_nat_set(nm_i,
+			nid, SETVEC_SIZE, setvec))) {
+		unsigned idx;
+
+		nid = setvec[found - 1]->set + 1;
+		for (idx = 0; idx < found; idx++) {
+			/* entry_cnt is not zero, when cp_error was occurred */
+			f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
+			radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
+			kmem_cache_free(nat_entry_set_slab, setvec[idx]);
+		}
+	}
 	up_write(&nm_i->nat_tree_lock);
 
 	kfree(nm_i->nat_bitmap);
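destroy_node_manager() now drains the nat set cache with the same batched gang-lookup sweep it already used for plain nat entries. A minimal runnable model of the teardown pattern, with a flat array standing in for the radix tree:

	#include <stdlib.h>

	#define VEC_SKETCH 32
	#define KEYS_SKETCH 256

	struct set_entry { unsigned int set; };	/* stand-in for nat_entry_set */

	static struct set_entry *table[KEYS_SKETCH];	/* stand-in for nat_set_root */

	/* gang lookup: collect up to max live entries with key >= start */
	static unsigned int gang_lookup(unsigned int start, unsigned int max,
					struct set_entry **vec)
	{
		unsigned int found = 0;

		for (unsigned int k = start; k < KEYS_SKETCH && found < max; k++)
			if (table[k])
				vec[found++] = table[k];
		return found;
	}

	/* Sweep in batches, advance the cursor past the last hit, then
	 * delete and free each batch member -- the same shape as the
	 * "destroy nat set cache" loop above. */
	static void destroy_all(void)
	{
		struct set_entry *vec[VEC_SKETCH];
		unsigned int nid = 0, found;

		while ((found = gang_lookup(nid, VEC_SKETCH, vec))) {
			nid = vec[found - 1]->set + 1;
			for (unsigned int i = 0; i < found; i++) {
				table[vec[i]->set] = NULL;	/* radix_tree_delete() */
				free(vec[i]);			/* kmem_cache_free() */
			}
		}
	}

	int main(void)
	{
		for (unsigned int k = 0; k < 100; k += 3) {
			table[k] = malloc(sizeof(*table[k]));
			table[k]->set = k;
		}
		destroy_all();
		return 0;
	}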
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d10b6448a671..f405bbf2435a 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,10 +25,19 @@
 
 /* vector size for gang look-up from nat cache that consists of radix tree */
 #define NATVEC_SIZE	64
+#define SETVEC_SIZE	32
 
 /* return value for read_node_page */
 #define LOCKED_PAGE	1
 
+/* For flag in struct node_info */
+enum {
+	IS_CHECKPOINTED,	/* is it checkpointed before? */
+	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
+	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
+	IS_DIRTY,		/* this nat entry is dirty? */
+};
+
 /*
  * For node information
  */
@@ -37,18 +46,11 @@ struct node_info {
 	nid_t ino;		/* inode number of the node's owner */
 	block_t blk_addr;	/* block address of the node */
 	unsigned char version;	/* version of the node */
-};
-
-enum {
-	IS_CHECKPOINTED,	/* is it checkpointed before? */
-	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
-	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
-	IS_DIRTY,		/* this nat entry is dirty? */
+	unsigned char flag;	/* for node information bits */
 };
 
 struct nat_entry {
 	struct list_head list;	/* for clean or dirty nat list */
-	unsigned char flag;	/* for node information bits */
 	struct node_info ni;	/* in-memory node information */
 };
 
@@ -63,20 +65,30 @@ struct nat_entry {
 
 #define inc_node_version(version)	(++version)
 
+static inline void copy_node_info(struct node_info *dst,
+						struct node_info *src)
+{
+	dst->nid = src->nid;
+	dst->ino = src->ino;
+	dst->blk_addr = src->blk_addr;
+	dst->version = src->version;
+	/* should not copy flag here */
+}
+
 static inline void set_nat_flag(struct nat_entry *ne,
 				unsigned int type, bool set)
 {
 	unsigned char mask = 0x01 << type;
 	if (set)
-		ne->flag |= mask;
+		ne->ni.flag |= mask;
 	else
-		ne->flag &= ~mask;
+		ne->ni.flag &= ~mask;
 }
 
 static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
 {
 	unsigned char mask = 0x01 << type;
-	return ne->flag & mask;
+	return ne->ni.flag & mask;
 }
 
 static inline void nat_reset_flag(struct nat_entry *ne)
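With the flag byte relocated from nat_entry into struct node_info (note copy_node_info() deliberately skips it), the helpers still index single bits by enum value. A standalone model of the bit manipulation:

	#include <assert.h>
	#include <stdbool.h>

	/* One unsigned char holds single-bit flags indexed by the enum:
	 * IS_CHECKPOINTED is bit 0, HAS_FSYNCED_INODE bit 1, and so on. */
	enum { IS_CHECKPOINTED, HAS_FSYNCED_INODE, HAS_LAST_FSYNC, IS_DIRTY };

	static void set_flag(unsigned char *flag, unsigned int type, bool set)
	{
		unsigned char mask = 0x01 << type;

		if (set)
			*flag |= mask;
		else
			*flag &= ~mask;
	}

	static bool get_flag(unsigned char flag, unsigned int type)
	{
		return flag & (0x01 << type);
	}

	int main(void)
	{
		unsigned char flag = 0;

		set_flag(&flag, IS_DIRTY, true);
		assert(get_flag(flag, IS_DIRTY));
		assert(!get_flag(flag, IS_CHECKPOINTED));
		set_flag(&flag, IS_DIRTY, false);
		assert(flag == 0);
		return 0;
	}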
@@ -108,6 +120,7 @@ enum mem_type {
 	NAT_ENTRIES,	/* indicates the cached nat entry */
 	DIRTY_DENTS,	/* indicates dirty dentry pages */
 	INO_ENTRIES,	/* indicates inode entries */
+	BASE_CHECK,	/* check kernel status */
 };
 
 struct nat_entry_set {
@@ -200,11 +213,19 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
 				nid_t ino, unsigned int ofs, bool reset)
 {
 	struct f2fs_node *rn = F2FS_NODE(page);
+	unsigned int old_flag = 0;
+
 	if (reset)
 		memset(rn, 0, sizeof(*rn));
+	else
+		old_flag = le32_to_cpu(rn->footer.flag);
+
 	rn->footer.nid = cpu_to_le32(nid);
 	rn->footer.ino = cpu_to_le32(ino);
-	rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
+
+	/* should remain old flag bits such as COLD_BIT_SHIFT */
+	rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
+					(old_flag & OFFSET_BIT_MASK));
 }
 
 static inline void copy_node_footer(struct page *dst, struct page *src)
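A worked example of the footer-flag arithmetic: the node offset occupies the bits at and above OFFSET_BIT_SHIFT, while OFFSET_BIT_MASK covers the low per-node flag bits (cold/fsync/dentry) that must now survive a refill. The shift and mask values below mirror f2fs's definitions but are restated here as assumptions:

	#include <assert.h>
	#include <stdint.h>

	#define OFFSET_BIT_SHIFT	3
	#define OFFSET_BIT_MASK		0x07	/* (1 << OFFSET_BIT_SHIFT) - 1 */

	int main(void)
	{
		/* old footer: node offset 5, cold bit (bit 0) set */
		uint32_t old_flag = (5u << OFFSET_BIT_SHIFT) | 0x01;
		/* refill with node offset 9, keeping the low flag bits */
		uint32_t new_flag = (9u << OFFSET_BIT_SHIFT) |
					(old_flag & OFFSET_BIT_MASK);

		assert(new_flag >> OFFSET_BIT_SHIFT == 9);	/* offset replaced */
		assert(new_flag & 0x01);			/* cold bit preserved */
		return 0;
	}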
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9160a37e1c7a..41afb9534bbd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -346,6 +346,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	if (IS_INODE(page)) {
 		recover_inline_xattr(inode, page);
 	} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+		/*
+		 * Deprecated; xattr blocks should be found from cold log.
+		 * But, we should remain this for backward compatibility.
+		 */
 		recover_xattr_data(inode, page, blkaddr);
 		goto out;
 	}
@@ -396,7 +400,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
 			/* write dummy data page */
 			recover_data_page(sbi, NULL, &sum, src, dest);
-			update_extent_cache(dest, &dn);
+			dn.data_blkaddr = dest;
+			update_extent_cache(&dn);
 			recovered++;
 		}
 		dn.ofs_in_node++;
@@ -503,7 +508,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	INIT_LIST_HEAD(&inode_list);
 
 	/* step #1: find fsynced inode numbers */
-	sbi->por_doing = true;
+	set_sbi_flag(sbi, SBI_POR_DOING);
 
 	/* prevent checkpoint */
 	mutex_lock(&sbi->cp_mutex);
@@ -536,7 +541,7 @@ out:
 		truncate_inode_pages_final(META_MAPPING(sbi));
 	}
 
-	sbi->por_doing = false;
+	clear_sbi_flag(sbi, SBI_POR_DOING);
 	if (err) {
 		discard_next_dnode(sbi, blkaddr);
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 42607a679923..daee4ab913da 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -20,6 +20,7 @@
 #include "f2fs.h"
 #include "segment.h"
 #include "node.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 #define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -181,6 +182,7 @@ void register_inmem_page(struct inode *inode, struct page *page)
 	int err;
 
 	SetPagePrivate(page);
+	f2fs_trace_pid(page);
 
 	new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
 
@@ -205,23 +207,6 @@ retry:
 	mutex_unlock(&fi->inmem_lock);
 }
 
-void invalidate_inmem_page(struct inode *inode, struct page *page)
-{
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	struct inmem_pages *cur;
-
-	mutex_lock(&fi->inmem_lock);
-	cur = radix_tree_lookup(&fi->inmem_root, page->index);
-	if (cur) {
-		radix_tree_delete(&fi->inmem_root, cur->page->index);
-		f2fs_put_page(cur->page, 0);
-		list_del(&cur->list);
-		kmem_cache_free(inmem_entry_slab, cur);
-		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
-	}
-	mutex_unlock(&fi->inmem_lock);
-}
-
 void commit_inmem_pages(struct inode *inode, bool abort)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
227 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -230,7 +215,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
230 bool submit_bio = false; 215 bool submit_bio = false;
231 struct f2fs_io_info fio = { 216 struct f2fs_io_info fio = {
232 .type = DATA, 217 .type = DATA,
233 .rw = WRITE_SYNC, 218 .rw = WRITE_SYNC | REQ_PRIO,
234 }; 219 };
235 220
236 /* 221 /*
@@ -240,33 +225,38 @@ void commit_inmem_pages(struct inode *inode, bool abort)
 	 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
 	 * inode becomes free by iget_locked in f2fs_iget.
 	 */
-	if (!abort)
+	if (!abort) {
 		f2fs_balance_fs(sbi);
-
-	f2fs_lock_op(sbi);
+		f2fs_lock_op(sbi);
+	}
 
 	mutex_lock(&fi->inmem_lock);
 	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
-		lock_page(cur->page);
-		if (!abort && cur->page->mapping == inode->i_mapping) {
-			f2fs_wait_on_page_writeback(cur->page, DATA);
-			if (clear_page_dirty_for_io(cur->page))
-				inode_dec_dirty_pages(inode);
-			do_write_data_page(cur->page, &fio);
-			submit_bio = true;
+		if (!abort) {
+			lock_page(cur->page);
+			if (cur->page->mapping == inode->i_mapping) {
+				f2fs_wait_on_page_writeback(cur->page, DATA);
+				if (clear_page_dirty_for_io(cur->page))
+					inode_dec_dirty_pages(inode);
+				do_write_data_page(cur->page, &fio);
+				submit_bio = true;
+			}
+			f2fs_put_page(cur->page, 1);
+		} else {
+			put_page(cur->page);
 		}
 		radix_tree_delete(&fi->inmem_root, cur->page->index);
-		f2fs_put_page(cur->page, 1);
 		list_del(&cur->list);
 		kmem_cache_free(inmem_entry_slab, cur);
 		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
 	}
-	if (submit_bio)
-		f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	mutex_unlock(&fi->inmem_lock);
 
-	filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
-	f2fs_unlock_op(sbi);
+	if (!abort) {
+		f2fs_unlock_op(sbi);
+		if (submit_bio)
+			f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	}
 }
 
 /*
@@ -290,7 +280,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	/* check the # of cached NAT entries and prefree segments */
 	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
 			excess_prefree_segs(sbi) ||
-			available_free_memory(sbi, INO_ENTRIES))
+			!available_free_memory(sbi, INO_ENTRIES))
 		f2fs_sync_fs(sbi->sb, true);
 }
 
@@ -515,12 +505,13 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
 	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
 	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
-	unsigned long dmap[entries];
+	unsigned long *dmap = SIT_I(sbi)->tmp_map;
 	unsigned int start = 0, end = -1;
 	bool force = (cpc->reason == CP_DISCARD);
 	int i;
 
-	if (!force && !test_opt(sbi, DISCARD))
+	if (!force && (!test_opt(sbi, DISCARD) ||
+			SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
 		return;
 
 	if (force && !se->valid_blocks) {
@@ -548,7 +539,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
 	for (i = 0; i < entries; i++)
-		dmap[i] = ~(cur_map[i] | ckpt_map[i]);
+		dmap[i] = force ? ~ckpt_map[i] :
+				(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
 
 	while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
 		start = __find_rev_next_bit(dmap, max_blocks, end + 1);
@@ -735,7 +727,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
 /*
  * Calculate the number of current summary pages for writing
  */
-int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
 {
 	int valid_sum_count = 0;
 	int i, sum_in_page;
@@ -743,8 +735,13 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
 		if (sbi->ckpt->alloc_type[i] == SSR)
 			valid_sum_count += sbi->blocks_per_seg;
-		else
-			valid_sum_count += curseg_blkoff(sbi, i);
+		else {
+			if (for_ra)
+				valid_sum_count += le16_to_cpu(
+					F2FS_CKPT(sbi)->cur_data_blkoff[i]);
+			else
+				valid_sum_count += curseg_blkoff(sbi, i);
+		}
 	}
 
 	sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
@@ -803,7 +800,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
 	int go_left = 0;
 	int i;
 
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 
 	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
 		segno = find_next_zero_bit(free_i->free_segmap,
@@ -876,7 +873,7 @@ got_it:
 	f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
 	__set_inuse(sbi, segno);
 	*newseg = segno;
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -927,7 +924,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
 {
 	struct seg_entry *se = get_seg_entry(sbi, seg->segno);
 	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
-	unsigned long target_map[entries];
+	unsigned long *target_map = SIT_I(sbi)->tmp_map;
 	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
 	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
 	int i, pos;
@@ -1027,18 +1024,22 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
 	stat_inc_seg_type(sbi, curseg);
 }
 
+static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int old_segno;
+
+	old_segno = curseg->segno;
+	SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+	locate_dirty_segment(sbi, old_segno);
+}
+
 void allocate_new_segments(struct f2fs_sb_info *sbi)
 {
-	struct curseg_info *curseg;
-	unsigned int old_curseg;
 	int i;
 
-	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
-		curseg = CURSEG_I(sbi, i);
-		old_curseg = curseg->segno;
-		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
-		locate_dirty_segment(sbi, old_curseg);
-	}
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
+		__allocate_new_segments(sbi, i);
 }
 
 static const struct segment_allocation default_salloc_ops = {
@@ -1047,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = {
 
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 {
-	__u64 start = range->start >> sbi->log_blocksize;
-	__u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+	__u64 start = F2FS_BYTES_TO_BLK(range->start);
+	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
 	unsigned int start_segno, end_segno;
 	struct cp_control cpc;
 
@@ -1065,16 +1066,21 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
 						GET_SEGNO(sbi, end);
 	cpc.reason = CP_DISCARD;
-	cpc.trim_start = start_segno;
-	cpc.trim_end = end_segno;
-	cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+	cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
 
 	/* do checkpoint to issue discard commands safely */
-	mutex_lock(&sbi->gc_mutex);
-	write_checkpoint(sbi, &cpc);
-	mutex_unlock(&sbi->gc_mutex);
+	for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
+		cpc.trim_start = start_segno;
+		cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
+				BATCHED_TRIM_SEGMENTS(sbi),
+				sbi->segs_per_sec) - 1, end_segno);
+
+		mutex_lock(&sbi->gc_mutex);
+		write_checkpoint(sbi, &cpc);
+		mutex_unlock(&sbi->gc_mutex);
+	}
out:
-	range->len = cpc.trimmed << sbi->log_blocksize;
+	range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
 	return 0;
 }
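FITRIM now runs one checkpoint per batch instead of one for the whole device, each batch end rounded down to a section boundary. A worked example of that arithmetic; the numbers are illustrative (BATCHED_TRIM_SEGMENTS presumably expands to trim_sections * segs_per_sec), not f2fs defaults:

	#include <stdio.h>

	static unsigned int rounddown_to(unsigned int x, unsigned int align)
	{
		return x - (x % align);
	}

	int main(void)
	{
		unsigned int start = 0, end = 199;		/* segment range */
		unsigned int batch = 64, segs_per_sec = 4;	/* illustrative */
		unsigned int trim_start, trim_end;

		for (trim_start = start; trim_start <= end;
						trim_start = trim_end + 1) {
			trim_end = rounddown_to(trim_start + batch,
						segs_per_sec) - 1;
			if (trim_end > end)
				trim_end = end;
			/* prints [0,63] [64,127] [128,191] [192,199] */
			printf("checkpoint over segments [%u, %u]\n",
						trim_start, trim_end);
		}
		return 0;
	}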
 
@@ -1151,11 +1157,18 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg;
+	bool direct_io = (type == CURSEG_DIRECT_IO);
+
+	type = direct_io ? CURSEG_WARM_DATA : type;
 
 	curseg = CURSEG_I(sbi, type);
 
 	mutex_lock(&curseg->curseg_mutex);
 
+	/* direct_io'ed data is aligned to the segment for better performance */
+	if (direct_io && curseg->next_blkoff)
+		__allocate_new_segments(sbi, type);
+
 	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
 	/*
@@ -1187,39 +1200,39 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 }
 
 static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
-			block_t old_blkaddr, block_t *new_blkaddr,
-			struct f2fs_summary *sum, struct f2fs_io_info *fio)
+			struct f2fs_summary *sum,
+			struct f2fs_io_info *fio)
 {
 	int type = __get_segment_type(page, fio->type);
 
-	allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
+	allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
 
 	/* writeout dirty page into bdev */
-	f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
+	f2fs_submit_page_mbio(sbi, page, fio);
 }
 
 void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
 {
 	struct f2fs_io_info fio = {
 		.type = META,
-		.rw = WRITE_SYNC | REQ_META | REQ_PRIO
+		.rw = WRITE_SYNC | REQ_META | REQ_PRIO,
+		.blk_addr = page->index,
 	};
 
 	set_page_writeback(page);
-	f2fs_submit_page_mbio(sbi, page, page->index, &fio);
+	f2fs_submit_page_mbio(sbi, page, &fio);
 }
 
 void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
-			struct f2fs_io_info *fio,
-			unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
+			unsigned int nid, struct f2fs_io_info *fio)
 {
 	struct f2fs_summary sum;
 	set_summary(&sum, nid, 0, 0);
-	do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
+	do_write_page(sbi, page, &sum, fio);
 }
 
 void write_data_page(struct page *page, struct dnode_of_data *dn,
-		block_t *new_blkaddr, struct f2fs_io_info *fio)
+		struct f2fs_io_info *fio)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_summary sum;
@@ -1228,14 +1241,14 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
 	f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
-
-	do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
+	do_write_page(sbi, page, &sum, fio);
+	dn->data_blkaddr = fio->blk_addr;
 }
 
-void rewrite_data_page(struct page *page, block_t old_blkaddr,
-		struct f2fs_io_info *fio)
+void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
 {
-	f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
+	stat_inc_inplace_blocks(F2FS_P_SB(page));
+	f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
 }
 
 void recover_data_page(struct f2fs_sb_info *sbi,
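The recurring theme of these hunks: the target block address moves into the I/O descriptor instead of riding along as extra in/out parameters. A sketch of that design with a hypothetical io_info (f2fs's real struct f2fs_io_info carries type, rw flags and blk_addr):

	#include <stdio.h>
	#include <stdint.h>

	struct io_info {
		int type;		/* DATA, NODE, META ... */
		int rw;			/* request flags */
		uint64_t blk_addr;	/* in: old address, out: new address */
	};

	static void submit(struct io_info *io)
	{
		/* callee reads everything from one struct, no extra args */
		printf("submit type=%d rw=%d blk=%llu\n", io->type, io->rw,
			(unsigned long long)io->blk_addr);
	}

	int main(void)
	{
		struct io_info io = { .type = 0, .rw = 1, .blk_addr = 42 };

		io.blk_addr = 100;	/* the allocator updates it in place */
		submit(&io);
		return 0;
	}

The upside is visible in write_data_page() above: after do_write_page() returns, the new address is simply read back from fio->blk_addr rather than threaded through a block_t pointer.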
@@ -1393,7 +1406,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 		segno = le32_to_cpu(ckpt->cur_data_segno[type]);
 		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
 							CURSEG_HOT_DATA]);
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+		if (__exist_node_summaries(sbi))
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
 		else
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
@@ -1402,7 +1415,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 							CURSEG_HOT_NODE]);
 		blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
 							CURSEG_HOT_NODE]);
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+		if (__exist_node_summaries(sbi))
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
 						type - CURSEG_HOT_NODE);
 		else
@@ -1413,7 +1426,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 	sum = (struct f2fs_summary_block *)page_address(new);
 
 	if (IS_NODESEG(type)) {
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+		if (__exist_node_summaries(sbi)) {
 			struct f2fs_summary *ns = &sum->entries[0];
 			int i;
 			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
@@ -1450,12 +1463,22 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
 	int err;
 
 	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+		int npages = npages_for_summary_flush(sbi, true);
+
+		if (npages >= 2)
+			ra_meta_pages(sbi, start_sum_block(sbi), npages,
+								META_CP);
+
 		/* restore for compacted data summary */
 		if (read_compacted_summaries(sbi))
 			return -EINVAL;
 		type = CURSEG_HOT_NODE;
 	}
 
+	if (__exist_node_summaries(sbi))
+		ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
+					NR_CURSEG_TYPE - type, META_CP);
+
 	for (; type <= CURSEG_COLD_NODE; type++) {
 		err = read_normal_summaries(sbi, type);
 		if (err)
@@ -1549,8 +1572,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
 
 void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
 {
-	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
-		write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+	write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
 }
 
 int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
@@ -1754,7 +1776,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		se = get_seg_entry(sbi, segno);
 
 		/* add discard candidates */
-		if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
+		if (cpc->reason != CP_DISCARD) {
 			cpc->trim_start = segno;
 			add_discard_addrs(sbi, cpc);
 		}
@@ -1833,6 +1855,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 		return -ENOMEM;
 	}
 
+	sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+	if (!sit_i->tmp_map)
+		return -ENOMEM;
+
 	if (sbi->segs_per_sec > 1) {
 		sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
 					sizeof(struct sec_entry));
@@ -1897,7 +1923,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
 	free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
 	free_i->free_segments = 0;
 	free_i->free_sections = 0;
-	rwlock_init(&free_i->segmap_lock);
+	spin_lock_init(&free_i->segmap_lock);
 	return 0;
 }
 
@@ -2110,6 +2136,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
 	sm_info->nr_discards = 0;
 	sm_info->max_discards = 0;
 
+	sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
+
 	INIT_LIST_HEAD(&sm_info->sit_entry_set);
 
 	if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
@@ -2212,6 +2240,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
 			kfree(sit_i->sentries[start].ckpt_valid_map);
 		}
 	}
+	kfree(sit_i->tmp_map);
+
 	vfree(sit_i->sentries);
 	vfree(sit_i->sec_entries);
 	kfree(sit_i->dirty_sentries_bitmap);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7f327c0ba4e3..7fd35111cf62 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -189,6 +189,7 @@ struct sit_info {
 	char *sit_bitmap;		/* SIT bitmap pointer */
 	unsigned int bitmap_size;	/* SIT bitmap size */
 
+	unsigned long *tmp_map;			/* bitmap for temporal use */
 	unsigned long *dirty_sentries_bitmap;	/* bitmap for dirty sentries */
 	unsigned int dirty_sentries;		/* # of dirty sentries */
 	unsigned int sents_per_block;		/* # of SIT entries per block */
@@ -207,7 +208,7 @@ struct free_segmap_info {
 	unsigned int start_segno;	/* start segment number logically */
 	unsigned int free_segments;	/* # of free segments */
 	unsigned int free_sections;	/* # of free sections */
-	rwlock_t segmap_lock;		/* free segmap lock */
+	spinlock_t segmap_lock;		/* free segmap lock */
 	unsigned long *free_segmap;	/* free segment bitmap */
 	unsigned long *free_secmap;	/* free section bitmap */
 };
@@ -318,9 +319,9 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
 		unsigned int max, unsigned int segno)
 {
 	unsigned int ret;
-	read_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	ret = find_next_bit(free_i->free_segmap, max, segno);
-	read_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 	return ret;
 }
 
@@ -331,7 +332,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
 	unsigned int start_segno = secno * sbi->segs_per_sec;
 	unsigned int next;
 
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	clear_bit(segno, free_i->free_segmap);
 	free_i->free_segments++;
 
@@ -340,7 +341,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
 		clear_bit(secno, free_i->free_secmap);
 		free_i->free_sections++;
 	}
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static inline void __set_inuse(struct f2fs_sb_info *sbi,
@@ -362,7 +363,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
 	unsigned int start_segno = secno * sbi->segs_per_sec;
 	unsigned int next;
 
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	if (test_and_clear_bit(segno, free_i->free_segmap)) {
 		free_i->free_segments++;
 
@@ -373,7 +374,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
 			free_i->free_sections++;
 		}
 	}
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
@@ -381,13 +382,13 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
 {
 	struct free_segmap_info *free_i = FREE_I(sbi);
 	unsigned int secno = segno / sbi->segs_per_sec;
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	if (!test_and_set_bit(segno, free_i->free_segmap)) {
 		free_i->free_segments--;
 		if (!test_and_set_bit(secno, free_i->free_secmap))
 			free_i->free_sections--;
 	}
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
@@ -460,7 +461,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
 	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
 	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
 
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		return false;
 
 	return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
@@ -599,13 +600,13 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
 {
 	if (segno > TOTAL_SEGS(sbi) - 1)
-		sbi->need_fsck = true;
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
 }
 
 static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 {
 	if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
-		sbi->need_fsck = true;
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
 }
 
 /*
@@ -616,11 +617,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
616{ 617{
617 /* check segment usage */ 618 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) 619 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true; 620 set_sbi_flag(sbi, SBI_NEED_FSCK);
620 621
621 /* check boundary of a given segment number */ 622 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1) 623 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true; 624 set_sbi_flag(sbi, SBI_NEED_FSCK);
624} 625}
625#endif 626#endif
626 627
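Two patterns in the segment.h hunks above recur throughout this series: free_i->segmap_lock becomes a plain spinlock (every holder does test_and_set/test_and_clear work, so the rwlock's read/write split bought nothing), and one-off booleans such as por_doing and need_fsck are folded into a single flag word behind set_sbi_flag()/is_sbi_flag_set(). A minimal sketch of how such helpers can look; the real definitions live in fs/f2fs/f2fs.h, which this diff does not show, so the names and layout below are assumptions:

enum sbi_flag_sketch {
	SBI_IS_DIRTY,		/* need a checkpoint before unmount */
	SBI_IS_CLOSE,		/* unmount in progress */
	SBI_NEED_FSCK,		/* inconsistency found, fsck needed */
	SBI_POR_DOING,		/* power-off recovery in progress */
};

struct sbi_sketch {
	unsigned long s_flag;	/* one bit per sbi_flag_sketch value */
};

static inline int is_sbi_flag_set(struct sbi_sketch *sbi, int type)
{
	return sbi->s_flag & (1UL << type);
}

static inline void set_sbi_flag(struct sbi_sketch *sbi, int type)
{
	sbi->s_flag |= (1UL << type);	/* non-atomic: callers serialize */
}

static inline void clear_sbi_flag(struct sbi_sketch *sbi, int type)
{
	sbi->s_flag &= ~(1UL << type);
}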
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f71421d70475..f2fe666a6ea9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -30,6 +30,7 @@
30#include "segment.h" 30#include "segment.h"
31#include "xattr.h" 31#include "xattr.h"
32#include "gc.h" 32#include "gc.h"
33#include "trace.h"
33 34
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/f2fs.h> 36#include <trace/events/f2fs.h>
@@ -41,6 +42,7 @@ static struct kset *f2fs_kset;
41enum { 42enum {
42 Opt_gc_background, 43 Opt_gc_background,
43 Opt_disable_roll_forward, 44 Opt_disable_roll_forward,
45 Opt_norecovery,
44 Opt_discard, 46 Opt_discard,
45 Opt_noheap, 47 Opt_noheap,
46 Opt_user_xattr, 48 Opt_user_xattr,
@@ -61,6 +63,7 @@ enum {
61static match_table_t f2fs_tokens = { 63static match_table_t f2fs_tokens = {
62 {Opt_gc_background, "background_gc=%s"}, 64 {Opt_gc_background, "background_gc=%s"},
63 {Opt_disable_roll_forward, "disable_roll_forward"}, 65 {Opt_disable_roll_forward, "disable_roll_forward"},
66 {Opt_norecovery, "norecovery"},
64 {Opt_discard, "discard"}, 67 {Opt_discard, "discard"},
65 {Opt_noheap, "no_heap"}, 68 {Opt_noheap, "no_heap"},
66 {Opt_user_xattr, "user_xattr"}, 69 {Opt_user_xattr, "user_xattr"},
@@ -192,6 +195,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
192F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 195F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
194F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
198F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
195F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 199F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 200F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); 201F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -207,6 +211,7 @@ static struct attribute *f2fs_attrs[] = {
207 ATTR_LIST(gc_idle), 211 ATTR_LIST(gc_idle),
208 ATTR_LIST(reclaim_segments), 212 ATTR_LIST(reclaim_segments),
209 ATTR_LIST(max_small_discards), 213 ATTR_LIST(max_small_discards),
214 ATTR_LIST(batched_trim_sections),
210 ATTR_LIST(ipu_policy), 215 ATTR_LIST(ipu_policy),
211 ATTR_LIST(min_ipu_util), 216 ATTR_LIST(min_ipu_util),
212 ATTR_LIST(min_fsync_blocks), 217 ATTR_LIST(min_fsync_blocks),
@@ -286,6 +291,12 @@ static int parse_options(struct super_block *sb, char *options)
286 case Opt_disable_roll_forward: 291 case Opt_disable_roll_forward:
287 set_opt(sbi, DISABLE_ROLL_FORWARD); 292 set_opt(sbi, DISABLE_ROLL_FORWARD);
288 break; 293 break;
294 case Opt_norecovery:
295 /* this option requires a read-only (ro) mount */
296 set_opt(sbi, DISABLE_ROLL_FORWARD);
297 if (!f2fs_readonly(sb))
298 return -EINVAL;
299 break;
289 case Opt_discard: 300 case Opt_discard:
290 set_opt(sbi, DISCARD); 301 set_opt(sbi, DISCARD);
291 break; 302 break;
@@ -446,8 +457,13 @@ static void f2fs_put_super(struct super_block *sb)
446 f2fs_destroy_stats(sbi); 457 f2fs_destroy_stats(sbi);
447 stop_gc_thread(sbi); 458 stop_gc_thread(sbi);
448 459
449 /* We don't need to do checkpoint when it's clean */ 460 /*
450 if (sbi->s_dirty) { 461 * We don't need to do a checkpoint when the superblock is clean.
462 * But if the previous checkpoint was not done by umount, we need to
463 * do a clean checkpoint again.
464 */
465 if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
466 !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
451 struct cp_control cpc = { 467 struct cp_control cpc = {
452 .reason = CP_UMOUNT, 468 .reason = CP_UMOUNT,
453 }; 469 };
@@ -486,13 +502,15 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
486 if (sync) { 502 if (sync) {
487 struct cp_control cpc; 503 struct cp_control cpc;
488 504
489 cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC; 505 cpc.reason = __get_cp_reason(sbi);
506
490 mutex_lock(&sbi->gc_mutex); 507 mutex_lock(&sbi->gc_mutex);
491 write_checkpoint(sbi, &cpc); 508 write_checkpoint(sbi, &cpc);
492 mutex_unlock(&sbi->gc_mutex); 509 mutex_unlock(&sbi->gc_mutex);
493 } else { 510 } else {
494 f2fs_balance_fs(sbi); 511 f2fs_balance_fs(sbi);
495 } 512 }
513 f2fs_trace_ios(NULL, NULL, 1);
496 514
497 return 0; 515 return 0;
498} 516}
@@ -887,7 +905,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
887 atomic_set(&sbi->nr_pages[i], 0); 905 atomic_set(&sbi->nr_pages[i], 0);
888 906
889 sbi->dir_level = DEF_DIR_LEVEL; 907 sbi->dir_level = DEF_DIR_LEVEL;
890 sbi->need_fsck = false; 908 clear_sbi_flag(sbi, SBI_NEED_FSCK);
891} 909}
892 910
893/* 911/*
@@ -942,6 +960,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
942 struct inode *root; 960 struct inode *root;
943 long err = -EINVAL; 961 long err = -EINVAL;
944 bool retry = true; 962 bool retry = true;
963 char *options = NULL;
945 int i; 964 int i;
946 965
947try_onemore: 966try_onemore:
@@ -973,9 +992,15 @@ try_onemore:
973 set_opt(sbi, POSIX_ACL); 992 set_opt(sbi, POSIX_ACL);
974#endif 993#endif
975 /* parse mount options */ 994 /* parse mount options */
976 err = parse_options(sb, (char *)data); 995 options = kstrdup((const char *)data, GFP_KERNEL);
977 if (err) 996 if (data && !options) {
997 err = -ENOMEM;
978 goto free_sb_buf; 998 goto free_sb_buf;
999 }
1000
1001 err = parse_options(sb, options);
1002 if (err)
1003 goto free_options;
979 1004
980 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); 1005 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
981 sb->s_max_links = F2FS_LINK_MAX; 1006 sb->s_max_links = F2FS_LINK_MAX;
@@ -998,7 +1023,7 @@ try_onemore:
998 mutex_init(&sbi->writepages); 1023 mutex_init(&sbi->writepages);
999 mutex_init(&sbi->cp_mutex); 1024 mutex_init(&sbi->cp_mutex);
1000 init_rwsem(&sbi->node_write); 1025 init_rwsem(&sbi->node_write);
1001 sbi->por_doing = false; 1026 clear_sbi_flag(sbi, SBI_POR_DOING);
1002 spin_lock_init(&sbi->stat_lock); 1027 spin_lock_init(&sbi->stat_lock);
1003 1028
1004 init_rwsem(&sbi->read_io.io_rwsem); 1029 init_rwsem(&sbi->read_io.io_rwsem);
@@ -1019,7 +1044,7 @@ try_onemore:
1019 if (IS_ERR(sbi->meta_inode)) { 1044 if (IS_ERR(sbi->meta_inode)) {
1020 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); 1045 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
1021 err = PTR_ERR(sbi->meta_inode); 1046 err = PTR_ERR(sbi->meta_inode);
1022 goto free_sb_buf; 1047 goto free_options;
1023 } 1048 }
1024 1049
1025 err = get_valid_checkpoint(sbi); 1050 err = get_valid_checkpoint(sbi);
@@ -1122,10 +1147,19 @@ try_onemore:
1122 goto free_proc; 1147 goto free_proc;
1123 1148
1124 if (!retry) 1149 if (!retry)
1125 sbi->need_fsck = true; 1150 set_sbi_flag(sbi, SBI_NEED_FSCK);
1126 1151
1127 /* recover fsynced data */ 1152 /* recover fsynced data */
1128 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1153 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1154 /*
1155 * the mount should fail when the device is read-only and the
1156 * previous checkpoint was not done by a clean system shutdown.
1157 */
1158 if (bdev_read_only(sb->s_bdev) &&
1159 !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
1160 err = -EROFS;
1161 goto free_kobj;
1162 }
1129 err = recover_fsync_data(sbi); 1163 err = recover_fsync_data(sbi);
1130 if (err) { 1164 if (err) {
1131 f2fs_msg(sb, KERN_ERR, 1165 f2fs_msg(sb, KERN_ERR,
@@ -1144,6 +1178,7 @@ try_onemore:
1144 if (err) 1178 if (err)
1145 goto free_kobj; 1179 goto free_kobj;
1146 } 1180 }
1181 kfree(options);
1147 return 0; 1182 return 0;
1148 1183
1149free_kobj: 1184free_kobj:
@@ -1168,6 +1203,8 @@ free_cp:
1168free_meta_inode: 1203free_meta_inode:
1169 make_bad_inode(sbi->meta_inode); 1204 make_bad_inode(sbi->meta_inode);
1170 iput(sbi->meta_inode); 1205 iput(sbi->meta_inode);
1206free_options:
1207 kfree(options);
1171free_sb_buf: 1208free_sb_buf:
1172 brelse(raw_super_buf); 1209 brelse(raw_super_buf);
1173free_sbi: 1210free_sbi:
@@ -1188,11 +1225,18 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
1188 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); 1225 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
1189} 1226}
1190 1227
1228static void kill_f2fs_super(struct super_block *sb)
1229{
1230 if (sb->s_root)
1231 set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
1232 kill_block_super(sb);
1233}
1234
1191static struct file_system_type f2fs_fs_type = { 1235static struct file_system_type f2fs_fs_type = {
1192 .owner = THIS_MODULE, 1236 .owner = THIS_MODULE,
1193 .name = "f2fs", 1237 .name = "f2fs",
1194 .mount = f2fs_mount, 1238 .mount = f2fs_mount,
1195 .kill_sb = kill_block_super, 1239 .kill_sb = kill_f2fs_super,
1196 .fs_flags = FS_REQUIRES_DEV, 1240 .fs_flags = FS_REQUIRES_DEV,
1197}; 1241};
1198MODULE_ALIAS_FS("f2fs"); 1242MODULE_ALIAS_FS("f2fs");
@@ -1220,6 +1264,8 @@ static int __init init_f2fs_fs(void)
1220{ 1264{
1221 int err; 1265 int err;
1222 1266
1267 f2fs_build_trace_ios();
1268
1223 err = init_inodecache(); 1269 err = init_inodecache();
1224 if (err) 1270 if (err)
1225 goto fail; 1271 goto fail;
@@ -1229,12 +1275,9 @@ static int __init init_f2fs_fs(void)
1229 err = create_segment_manager_caches(); 1275 err = create_segment_manager_caches();
1230 if (err) 1276 if (err)
1231 goto free_node_manager_caches; 1277 goto free_node_manager_caches;
1232 err = create_gc_caches();
1233 if (err)
1234 goto free_segment_manager_caches;
1235 err = create_checkpoint_caches(); 1278 err = create_checkpoint_caches();
1236 if (err) 1279 if (err)
1237 goto free_gc_caches; 1280 goto free_segment_manager_caches;
1238 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); 1281 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
1239 if (!f2fs_kset) { 1282 if (!f2fs_kset) {
1240 err = -ENOMEM; 1283 err = -ENOMEM;
@@ -1251,8 +1294,6 @@ free_kset:
1251 kset_unregister(f2fs_kset); 1294 kset_unregister(f2fs_kset);
1252free_checkpoint_caches: 1295free_checkpoint_caches:
1253 destroy_checkpoint_caches(); 1296 destroy_checkpoint_caches();
1254free_gc_caches:
1255 destroy_gc_caches();
1256free_segment_manager_caches: 1297free_segment_manager_caches:
1257 destroy_segment_manager_caches(); 1298 destroy_segment_manager_caches();
1258free_node_manager_caches: 1299free_node_manager_caches:
@@ -1269,11 +1310,11 @@ static void __exit exit_f2fs_fs(void)
1269 f2fs_destroy_root_stats(); 1310 f2fs_destroy_root_stats();
1270 unregister_filesystem(&f2fs_fs_type); 1311 unregister_filesystem(&f2fs_fs_type);
1271 destroy_checkpoint_caches(); 1312 destroy_checkpoint_caches();
1272 destroy_gc_caches();
1273 destroy_segment_manager_caches(); 1313 destroy_segment_manager_caches();
1274 destroy_node_manager_caches(); 1314 destroy_node_manager_caches();
1275 destroy_inodecache(); 1315 destroy_inodecache();
1276 kset_unregister(f2fs_kset); 1316 kset_unregister(f2fs_kset);
1317 f2fs_destroy_trace_ios();
1277} 1318}
1278 1319
1279module_init(init_f2fs_fs) 1320module_init(init_f2fs_fs)
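Two details in the super.c hunks are easy to miss: norecovery is accepted only on read-only mounts (and implies disable_roll_forward), and f2fs_fill_super() now parses a kstrdup()'d copy of the mount options instead of the caller's data buffer. match_token()-style parsing walks the string with strsep(), which mangles it in place, and fill_super can loop back to try_onemore, so the original buffer has to survive the first pass. A standalone user-space sketch of that hazard, assuming nothing beyond libc:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[] = "background_gc=on,discard,norecovery";
	char copy[sizeof(data)];
	char *cur, *p;

	memcpy(copy, data, sizeof(data));	/* the kstrdup() stand-in */
	cur = copy;
	while ((p = strsep(&cur, ",")) != NULL)
		printf("token: %s\n", p);	/* copy is chopped up now */

	printf("original intact for a retry: %s\n", data);
	return 0;
}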
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
new file mode 100644
index 000000000000..875aa8179bc1
--- /dev/null
+++ b/fs/f2fs/trace.c
@@ -0,0 +1,159 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/sched.h>
14#include <linux/radix-tree.h>
15
16#include "f2fs.h"
17#include "trace.h"
18
19static RADIX_TREE(pids, GFP_ATOMIC);
20static spinlock_t pids_lock;
21static struct last_io_info last_io;
22
23static inline void __print_last_io(void)
24{
25 if (!last_io.len)
26 return;
27
28 trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n",
29 last_io.major, last_io.minor,
30 last_io.pid, "----------------",
31 last_io.type,
32 last_io.fio.rw, last_io.fio.blk_addr,
33 last_io.len);
34 memset(&last_io, 0, sizeof(last_io));
35}
36
37static int __file_type(struct inode *inode, pid_t pid)
38{
39 if (f2fs_is_atomic_file(inode))
40 return __ATOMIC_FILE;
41 else if (f2fs_is_volatile_file(inode))
42 return __VOLATILE_FILE;
43 else if (S_ISDIR(inode->i_mode))
44 return __DIR_FILE;
45 else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
46 return __NODE_FILE;
47 else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
48 return __META_FILE;
49 else if (pid)
50 return __NORMAL_FILE;
51 else
52 return __MISC_FILE;
53}
54
55void f2fs_trace_pid(struct page *page)
56{
57 struct inode *inode = page->mapping->host;
58 pid_t pid = task_pid_nr(current);
59 void *p;
60
61 page->private = pid;
62
63 if (radix_tree_preload(GFP_NOFS))
64 return;
65
66 spin_lock(&pids_lock);
67 p = radix_tree_lookup(&pids, pid);
68 if (p == current)
69 goto out;
70 if (p)
71 radix_tree_delete(&pids, pid);
72
73 f2fs_radix_tree_insert(&pids, pid, current);
74
75 trace_printk("%3x:%3x %4x %-16s\n",
76 MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
77 pid, current->comm);
78out:
79 spin_unlock(&pids_lock);
80 radix_tree_preload_end();
81}
82
83void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
84{
85 struct inode *inode;
86 pid_t pid;
87 int major, minor;
88
89 if (flush) {
90 __print_last_io();
91 return;
92 }
93
94 inode = page->mapping->host;
95 pid = page_private(page);
96
97 major = MAJOR(inode->i_sb->s_dev);
98 minor = MINOR(inode->i_sb->s_dev);
99
100 if (last_io.major == major && last_io.minor == minor &&
101 last_io.pid == pid &&
102 last_io.type == __file_type(inode, pid) &&
103 last_io.fio.rw == fio->rw &&
104 last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
105 last_io.len++;
106 return;
107 }
108
109 __print_last_io();
110
111 last_io.major = major;
112 last_io.minor = minor;
113 last_io.pid = pid;
114 last_io.type = __file_type(inode, pid);
115 last_io.fio = *fio;
116 last_io.len = 1;
117 return;
118}
119
120void f2fs_build_trace_ios(void)
121{
122 spin_lock_init(&pids_lock);
123}
124
125#define PIDVEC_SIZE 128
126static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
127 unsigned int max_items)
128{
129 struct radix_tree_iter iter;
130 void **slot;
131 unsigned int ret = 0;
132
133 if (unlikely(!max_items))
134 return 0;
135
136 radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
137 results[ret] = iter.index;
138 if (++ret == PIDVEC_SIZE)
139 break;
140 }
141 return ret;
142}
143
144void f2fs_destroy_trace_ios(void)
145{
146 pid_t pid[PIDVEC_SIZE];
147 pid_t next_pid = 0;
148 unsigned int found;
149
150 spin_lock(&pids_lock);
151 while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
152 unsigned idx;
153
154 next_pid = pid[found - 1] + 1;
155 for (idx = 0; idx < found; idx++)
156 radix_tree_delete(&pids, pid[idx]);
157 }
158 spin_unlock(&pids_lock);
159}
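The tracer above keeps one pending record (last_io) and emits it only when the next IO cannot extend it: device, pid, file type and rw flags must match and the block address must continue the run. A standalone user-space sketch of that run-length merge, with the field set trimmed down and all names illustrative:

#include <stdio.h>

struct rec { int pid, rw; unsigned int blk, len; };

static struct rec last;

static void flush_rec(void)
{
	if (last.len)
		printf("pid %d rw %d blk %u len %u\n",
		       last.pid, last.rw, last.blk, last.len);
	last.len = 0;
}

static void trace_io(int pid, int rw, unsigned int blk)
{
	if (last.len && last.pid == pid && last.rw == rw &&
	    last.blk + last.len == blk) {
		last.len++;		/* contiguous: extend the record */
		return;
	}
	flush_rec();			/* discontinuity: emit and restart */
	last = (struct rec){ .pid = pid, .rw = rw, .blk = blk, .len = 1 };
}

int main(void)
{
	trace_io(1, 0, 100);
	trace_io(1, 0, 101);
	trace_io(1, 0, 102);	/* merged into one len-3 record */
	trace_io(2, 1, 500);
	flush_rec();		/* mirrors f2fs_trace_ios(NULL, NULL, 1) */
	return 0;
}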
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
new file mode 100644
index 000000000000..1041dbeb52ae
--- /dev/null
+++ b/fs/f2fs/trace.h
@@ -0,0 +1,46 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef __F2FS_TRACE_H__
12#define __F2FS_TRACE_H__
13
14#ifdef CONFIG_F2FS_IO_TRACE
15#include <trace/events/f2fs.h>
16
17enum file_type {
18 __NORMAL_FILE,
19 __DIR_FILE,
20 __NODE_FILE,
21 __META_FILE,
22 __ATOMIC_FILE,
23 __VOLATILE_FILE,
24 __MISC_FILE,
25};
26
27struct last_io_info {
28 int major, minor;
29 pid_t pid;
30 enum file_type type;
31 struct f2fs_io_info fio;
32 block_t len;
33};
34
35extern void f2fs_trace_pid(struct page *);
36extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
37extern void f2fs_build_trace_ios(void);
38extern void f2fs_destroy_trace_ios(void);
39#else
40#define f2fs_trace_pid(p)
41#define f2fs_trace_ios(p, i, n)
42#define f2fs_build_trace_ios()
43#define f2fs_destroy_trace_ios()
44
45#endif
46#endif /* __F2FS_TRACE_H__ */
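The #else branch above is the usual compiled-out-hook pattern: with CONFIG_F2FS_IO_TRACE disabled, each call site collapses to an empty statement and needs no #ifdef of its own. A tiny standalone demonstration (the macro name is invented for the demo):

#include <stdio.h>

#ifdef DEMO_TRACE
#define trace_pid(p)	printf("pid hook: %d\n", (p))
#else
#define trace_pid(p)	/* expands to nothing */
#endif

int main(void)
{
	trace_pid(42);	/* a bare ';' unless built with -DDEMO_TRACE */
	return 0;
}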
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7b41a2dcdd76..497c7c5263c7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -580,7 +580,7 @@ static void fat_set_state(struct super_block *sb,
580{ 580{
581 struct buffer_head *bh; 581 struct buffer_head *bh;
582 struct fat_boot_sector *b; 582 struct fat_boot_sector *b;
583 struct msdos_sb_info *sbi = sb->s_fs_info; 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
584 584
585 /* do not change anything if mounted read-only */ 585
586 if ((sb->s_flags & MS_RDONLY) && !force) 586 if ((sb->s_flags & MS_RDONLY) && !force)
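The fat change above swaps a raw sb->s_fs_info read for the MSDOS_SB() accessor, the usual idiom for the per-filesystem private pointer: one typed helper instead of scattered casts. A sketch of the accessor's shape, with placeholder struct names rather than the real fat headers:

struct super_block_sketch {
	void *s_fs_info;		/* filesystem-private data */
};

struct msdos_sb_info_sketch {
	unsigned short fat_bits;	/* ... plus the rest of the fat state */
};

static inline struct msdos_sb_info_sketch *
MSDOS_SB_sketch(struct super_block_sketch *sb)
{
	return sb->s_fs_info;		/* void * converts implicitly in C */
}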
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..073657f755d4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
66} 66}
67EXPORT_SYMBOL(writeback_in_progress); 67EXPORT_SYMBOL(writeback_in_progress);
68 68
69static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 69struct backing_dev_info *inode_to_bdi(struct inode *inode)
70{ 70{
71 struct super_block *sb = inode->i_sb; 71 struct super_block *sb;
72 72
73 if (sb_is_blkdev_sb(sb)) 73 if (!inode)
74 return inode->i_mapping->backing_dev_info; 74 return &noop_backing_dev_info;
75 75
76 sb = inode->i_sb;
77#ifdef CONFIG_BLOCK
78 if (sb_is_blkdev_sb(sb))
79 return blk_get_backing_dev_info(I_BDEV(inode));
80#endif
76 return sb->s_bdi; 81 return sb->s_bdi;
77} 82}
83EXPORT_SYMBOL_GPL(inode_to_bdi);
78 84
79static inline struct inode *wb_inode(struct list_head *head) 85static inline struct inode *wb_inode(struct list_head *head)
80{ 86{
@@ -247,14 +253,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
247 return ret; 253 return ret;
248} 254}
249 255
256#define EXPIRE_DIRTY_ATIME 0x0001
257
250/* 258/*
251 * Move expired (dirtied before work->older_than_this) dirty inodes from 259 * Move expired (dirtied before work->older_than_this) dirty inodes from
252 * @delaying_queue to @dispatch_queue. 260 * @delaying_queue to @dispatch_queue.
253 */ 261 */
254static int move_expired_inodes(struct list_head *delaying_queue, 262static int move_expired_inodes(struct list_head *delaying_queue,
255 struct list_head *dispatch_queue, 263 struct list_head *dispatch_queue,
264 int flags,
256 struct wb_writeback_work *work) 265 struct wb_writeback_work *work)
257{ 266{
267 unsigned long *older_than_this = NULL;
268 unsigned long expire_time;
258 LIST_HEAD(tmp); 269 LIST_HEAD(tmp);
259 struct list_head *pos, *node; 270 struct list_head *pos, *node;
260 struct super_block *sb = NULL; 271 struct super_block *sb = NULL;
@@ -262,13 +273,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 int do_sb_sort = 0; 273 int do_sb_sort = 0;
263 int moved = 0; 274 int moved = 0;
264 275
276 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
277 older_than_this = work->older_than_this;
278 else if ((work->reason == WB_REASON_SYNC) == 0) {
279 expire_time = jiffies - (HZ * 86400);
280 older_than_this = &expire_time;
281 }
265 while (!list_empty(delaying_queue)) { 282 while (!list_empty(delaying_queue)) {
266 inode = wb_inode(delaying_queue->prev); 283 inode = wb_inode(delaying_queue->prev);
267 if (work->older_than_this && 284 if (older_than_this &&
268 inode_dirtied_after(inode, *work->older_than_this)) 285 inode_dirtied_after(inode, *older_than_this))
269 break; 286 break;
270 list_move(&inode->i_wb_list, &tmp); 287 list_move(&inode->i_wb_list, &tmp);
271 moved++; 288 moved++;
289 if (flags & EXPIRE_DIRTY_ATIME)
290 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
272 if (sb_is_blkdev_sb(inode->i_sb)) 291 if (sb_is_blkdev_sb(inode->i_sb))
273 continue; 292 continue;
274 if (sb && sb != inode->i_sb) 293 if (sb && sb != inode->i_sb)
@@ -309,9 +328,12 @@ out:
309static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) 328static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
310{ 329{
311 int moved; 330 int moved;
331
312 assert_spin_locked(&wb->list_lock); 332 assert_spin_locked(&wb->list_lock);
313 list_splice_init(&wb->b_more_io, &wb->b_io); 333 list_splice_init(&wb->b_more_io, &wb->b_io);
314 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); 334 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
335 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
336 EXPIRE_DIRTY_ATIME, work);
315 trace_writeback_queue_io(wb, work, moved); 337 trace_writeback_queue_io(wb, work, moved);
316} 338}
317 339
@@ -435,6 +457,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
435 * updates after data IO completion. 457 * updates after data IO completion.
436 */ 458 */
437 redirty_tail(inode, wb); 459 redirty_tail(inode, wb);
460 } else if (inode->i_state & I_DIRTY_TIME) {
461 list_move(&inode->i_wb_list, &wb->b_dirty_time);
438 } else { 462 } else {
439 /* The inode is clean. Remove from writeback lists. */ 463 /* The inode is clean. Remove from writeback lists. */
440 list_del_init(&inode->i_wb_list); 464 list_del_init(&inode->i_wb_list);
@@ -481,7 +505,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 spin_lock(&inode->i_lock); 505 spin_lock(&inode->i_lock);
482 506
483 dirty = inode->i_state & I_DIRTY; 507 dirty = inode->i_state & I_DIRTY;
484 inode->i_state &= ~I_DIRTY; 508 if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
509 (inode->i_state & I_DIRTY_TIME)) ||
510 (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
511 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
512 trace_writeback_lazytime(inode);
513 }
514 inode->i_state &= ~dirty;
485 515
486 /* 516 /*
487 * Paired with smp_mb() in __mark_inode_dirty(). This allows 517 * Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +531,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
501 531
502 spin_unlock(&inode->i_lock); 532 spin_unlock(&inode->i_lock);
503 533
534 if (dirty & I_DIRTY_TIME)
535 mark_inode_dirty_sync(inode);
504 /* Don't write the inode if only I_DIRTY_PAGES was set */ 536 /* Don't write the inode if only I_DIRTY_PAGES was set */
505 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 537 if (dirty & ~I_DIRTY_PAGES) {
506 int err = write_inode(inode, wbc); 538 int err = write_inode(inode, wbc);
507 if (ret == 0) 539 if (ret == 0)
508 ret = err; 540 ret = err;
@@ -550,7 +582,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
550 * make sure inode is on some writeback list and leave it there unless 582 * make sure inode is on some writeback list and leave it there unless
551 * we have completely cleaned the inode. 583 * we have completely cleaned the inode.
552 */ 584 */
553 if (!(inode->i_state & I_DIRTY) && 585 if (!(inode->i_state & I_DIRTY_ALL) &&
554 (wbc->sync_mode != WB_SYNC_ALL || 586 (wbc->sync_mode != WB_SYNC_ALL ||
555 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 587 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
556 goto out; 588 goto out;
@@ -565,7 +597,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
565 * If inode is clean, remove it from writeback lists. Otherwise don't 597 * If inode is clean, remove it from writeback lists. Otherwise don't
566 * touch it. See comment above for explanation. 598 * touch it. See comment above for explanation.
567 */ 599 */
568 if (!(inode->i_state & I_DIRTY)) 600 if (!(inode->i_state & I_DIRTY_ALL))
569 list_del_init(&inode->i_wb_list); 601 list_del_init(&inode->i_wb_list);
570 spin_unlock(&wb->list_lock); 602 spin_unlock(&wb->list_lock);
571 inode_sync_complete(inode); 603 inode_sync_complete(inode);
@@ -707,7 +739,7 @@ static long writeback_sb_inodes(struct super_block *sb,
707 wrote += write_chunk - wbc.nr_to_write; 739 wrote += write_chunk - wbc.nr_to_write;
708 spin_lock(&wb->list_lock); 740 spin_lock(&wb->list_lock);
709 spin_lock(&inode->i_lock); 741 spin_lock(&inode->i_lock);
710 if (!(inode->i_state & I_DIRTY)) 742 if (!(inode->i_state & I_DIRTY_ALL))
711 wrote++; 743 wrote++;
712 requeue_inode(inode, wb, &wbc); 744 requeue_inode(inode, wb, &wbc);
713 inode_sync_complete(inode); 745 inode_sync_complete(inode);
@@ -1145,16 +1177,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1145 * page->mapping->host, so the page-dirtying time is recorded in the internal 1177 * page->mapping->host, so the page-dirtying time is recorded in the internal
1146 * blockdev inode. 1178 * blockdev inode.
1147 */ 1179 */
1180#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
1148void __mark_inode_dirty(struct inode *inode, int flags) 1181void __mark_inode_dirty(struct inode *inode, int flags)
1149{ 1182{
1150 struct super_block *sb = inode->i_sb; 1183 struct super_block *sb = inode->i_sb;
1151 struct backing_dev_info *bdi = NULL; 1184 struct backing_dev_info *bdi = NULL;
1185 int dirtytime;
1186
1187 trace_writeback_mark_inode_dirty(inode, flags);
1152 1188
1153 /* 1189 /*
1154 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1190 * Don't do this for I_DIRTY_PAGES - that doesn't actually
1155 * dirty the inode itself 1191 * dirty the inode itself
1156 */ 1192 */
1157 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1193 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
1158 trace_writeback_dirty_inode_start(inode, flags); 1194 trace_writeback_dirty_inode_start(inode, flags);
1159 1195
1160 if (sb->s_op->dirty_inode) 1196 if (sb->s_op->dirty_inode)
@@ -1162,6 +1198,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1162 1198
1163 trace_writeback_dirty_inode(inode, flags); 1199 trace_writeback_dirty_inode(inode, flags);
1164 } 1200 }
1201 if (flags & I_DIRTY_INODE)
1202 flags &= ~I_DIRTY_TIME;
1203 dirtytime = flags & I_DIRTY_TIME;
1165 1204
1166 /* 1205 /*
1167 * Paired with smp_mb() in __writeback_single_inode() for the 1206 * Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1208,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1169 */ 1208 */
1170 smp_mb(); 1209 smp_mb();
1171 1210
1172 if ((inode->i_state & flags) == flags) 1211 if (((inode->i_state & flags) == flags) ||
1212 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
1173 return; 1213 return;
1174 1214
1175 if (unlikely(block_dump)) 1215 if (unlikely(block_dump))
1176 block_dump___mark_inode_dirty(inode); 1216 block_dump___mark_inode_dirty(inode);
1177 1217
1178 spin_lock(&inode->i_lock); 1218 spin_lock(&inode->i_lock);
1219 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
1220 goto out_unlock_inode;
1179 if ((inode->i_state & flags) != flags) { 1221 if ((inode->i_state & flags) != flags) {
1180 const int was_dirty = inode->i_state & I_DIRTY; 1222 const int was_dirty = inode->i_state & I_DIRTY;
1181 1223
1224 if (flags & I_DIRTY_INODE)
1225 inode->i_state &= ~I_DIRTY_TIME;
1182 inode->i_state |= flags; 1226 inode->i_state |= flags;
1183 1227
1184 /* 1228 /*
@@ -1225,8 +1269,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1225 } 1269 }
1226 1270
1227 inode->dirtied_when = jiffies; 1271 inode->dirtied_when = jiffies;
1228 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1272 list_move(&inode->i_wb_list, dirtytime ?
1273 &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
1229 spin_unlock(&bdi->wb.list_lock); 1274 spin_unlock(&bdi->wb.list_lock);
1275 trace_writeback_dirty_inode_enqueue(inode);
1230 1276
1231 if (wakeup_bdi) 1277 if (wakeup_bdi)
1232 bdi_wakeup_thread_delayed(bdi); 1278 bdi_wakeup_thread_delayed(bdi);
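The fs-writeback hunks above are the lazytime plumbing: timestamp-only dirtiness (I_DIRTY_TIME) parks the inode on the new b_dirty_time list and is written back only once expired (a day, per the HZ * 86400 above) or once the inode goes properly dirty, while the fsync and eviction paths switch from I_DIRTY to I_DIRTY_ALL so they still notice it. A standalone sketch of the masking, using illustrative bit values rather than the kernel's:

#include <stdio.h>

#define I_DIRTY_SYNC		0x001
#define I_DIRTY_DATASYNC	0x002
#define I_DIRTY_PAGES		0x004
#define I_DIRTY_TIME		0x008

#define I_DIRTY		(I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
#define I_DIRTY_ALL	(I_DIRTY | I_DIRTY_TIME)

int main(void)
{
	int i_state = I_DIRTY_TIME;	/* only a timestamp changed */

	/* the old test: the inode looks clean and would be dropped */
	printf("I_DIRTY sees it:     %s\n", (i_state & I_DIRTY) ? "yes" : "no");
	/* the new test used by writeback_sb_inodes()/gfs2_fsync() above */
	printf("I_DIRTY_ALL sees it: %s\n", (i_state & I_DIRTY_ALL) ? "yes" : "no");
	return 0;
}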
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 9368236ca100..b06c98796afb 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,78 +1,102 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/sched.h>
2#include <linux/slab.h> 3#include <linux/slab.h>
3#include <linux/fs_pin.h>
4#include "internal.h" 4#include "internal.h"
5#include "mount.h" 5#include "mount.h"
6 6
7static void pin_free_rcu(struct rcu_head *head)
8{
9 kfree(container_of(head, struct fs_pin, rcu));
10}
11
12static DEFINE_SPINLOCK(pin_lock); 7static DEFINE_SPINLOCK(pin_lock);
13 8
14void pin_put(struct fs_pin *p)
15{
16 if (atomic_long_dec_and_test(&p->count))
17 call_rcu(&p->rcu, pin_free_rcu);
18}
19
20void pin_remove(struct fs_pin *pin) 9void pin_remove(struct fs_pin *pin)
21{ 10{
22 spin_lock(&pin_lock); 11 spin_lock(&pin_lock);
23 hlist_del(&pin->m_list); 12 hlist_del(&pin->m_list);
24 hlist_del(&pin->s_list); 13 hlist_del(&pin->s_list);
25 spin_unlock(&pin_lock); 14 spin_unlock(&pin_lock);
15 spin_lock_irq(&pin->wait.lock);
16 pin->done = 1;
17 wake_up_locked(&pin->wait);
18 spin_unlock_irq(&pin->wait.lock);
26} 19}
27 20
28void pin_insert(struct fs_pin *pin, struct vfsmount *m) 21void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
29{ 22{
30 spin_lock(&pin_lock); 23 spin_lock(&pin_lock);
31 hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); 24 if (p)
25 hlist_add_head(&pin->s_list, p);
32 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); 26 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
33 spin_unlock(&pin_lock); 27 spin_unlock(&pin_lock);
34} 28}
35 29
30void pin_insert(struct fs_pin *pin, struct vfsmount *m)
31{
32 pin_insert_group(pin, m, &m->mnt_sb->s_pins);
33}
34
35void pin_kill(struct fs_pin *p)
36{
37 wait_queue_t wait;
38
39 if (!p) {
40 rcu_read_unlock();
41 return;
42 }
43 init_wait(&wait);
44 spin_lock_irq(&p->wait.lock);
45 if (likely(!p->done)) {
46 p->done = -1;
47 spin_unlock_irq(&p->wait.lock);
48 rcu_read_unlock();
49 p->kill(p);
50 return;
51 }
52 if (p->done > 0) {
53 spin_unlock_irq(&p->wait.lock);
54 rcu_read_unlock();
55 return;
56 }
57 __add_wait_queue(&p->wait, &wait);
58 while (1) {
59 set_current_state(TASK_UNINTERRUPTIBLE);
60 spin_unlock_irq(&p->wait.lock);
61 rcu_read_unlock();
62 schedule();
63 rcu_read_lock();
64 if (likely(list_empty(&wait.task_list)))
65 break;
66 /* OK, we know p couldn't have been freed yet */
67 spin_lock_irq(&p->wait.lock);
68 if (p->done > 0) {
69 spin_unlock_irq(&p->wait.lock);
70 break;
71 }
72 }
73 rcu_read_unlock();
74}
75
36void mnt_pin_kill(struct mount *m) 76void mnt_pin_kill(struct mount *m)
37{ 77{
38 while (1) { 78 while (1) {
39 struct hlist_node *p; 79 struct hlist_node *p;
40 struct fs_pin *pin;
41 rcu_read_lock(); 80 rcu_read_lock();
42 p = ACCESS_ONCE(m->mnt_pins.first); 81 p = ACCESS_ONCE(m->mnt_pins.first);
43 if (!p) { 82 if (!p) {
44 rcu_read_unlock(); 83 rcu_read_unlock();
45 break; 84 break;
46 } 85 }
47 pin = hlist_entry(p, struct fs_pin, m_list); 86 pin_kill(hlist_entry(p, struct fs_pin, m_list));
48 if (!atomic_long_inc_not_zero(&pin->count)) {
49 rcu_read_unlock();
50 cpu_relax();
51 continue;
52 }
53 rcu_read_unlock();
54 pin->kill(pin);
55 } 87 }
56} 88}
57 89
58void sb_pin_kill(struct super_block *sb) 90void group_pin_kill(struct hlist_head *p)
59{ 91{
60 while (1) { 92 while (1) {
61 struct hlist_node *p; 93 struct hlist_node *q;
62 struct fs_pin *pin;
63 rcu_read_lock(); 94 rcu_read_lock();
64 p = ACCESS_ONCE(sb->s_pins.first); 95 q = ACCESS_ONCE(p->first);
65 if (!p) { 96 if (!q) {
66 rcu_read_unlock(); 97 rcu_read_unlock();
67 break; 98 break;
68 } 99 }
69 pin = hlist_entry(p, struct fs_pin, s_list); 100 pin_kill(hlist_entry(q, struct fs_pin, s_list));
70 if (!atomic_long_inc_not_zero(&pin->count)) {
71 rcu_read_unlock();
72 cpu_relax();
73 continue;
74 }
75 rcu_read_unlock();
76 pin->kill(pin);
77 } 101 }
78} 102}
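The fs_pin rework above trades the refcount-plus-RCU-free scheme for a waitqueue handshake: the first caller of pin_kill() marks done = -1 and runs ->kill(), which must end in pin_remove() setting done = 1, while any concurrent killer sleeps on pin->wait until that happens. A standalone user-space analogue of that state machine, substituting a mutex/condvar for the wait-queue lock and omitting the RCU lifetime handling entirely:

#include <pthread.h>
#include <stdio.h>

struct pin {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	int done;	/* 0 = live, -1 = being killed, 1 = removed */
};

static void pin_remove(struct pin *p)
{
	pthread_mutex_lock(&p->lock);
	p->done = 1;
	pthread_cond_broadcast(&p->wait);	/* wake any waiting killers */
	pthread_mutex_unlock(&p->lock);
}

static void pin_kill(struct pin *p, void (*kill)(struct pin *))
{
	pthread_mutex_lock(&p->lock);
	if (p->done == 0) {
		p->done = -1;			/* we are the killer */
		pthread_mutex_unlock(&p->lock);
		kill(p);			/* must end in pin_remove() */
		return;
	}
	while (p->done <= 0)			/* someone else is killing it */
		pthread_cond_wait(&p->wait, &p->lock);
	pthread_mutex_unlock(&p->lock);
}

static void do_kill(struct pin *p)
{
	pin_remove(p);
}

int main(void)
{
	struct pin p = { PTHREAD_MUTEX_INITIALIZER,
			 PTHREAD_COND_INITIALIZER, 0 };

	pin_kill(&p, do_kill);
	printf("done = %d\n", p.done);		/* prints 1 */
	return 0;
}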
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c552197..c01ec3bdcfd8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1159 mutex_lock(&inode->i_mutex); 1159 mutex_lock(&inode->i_mutex);
1160 1160
1161 /* We can write back this queue in page reclaim */ 1161 /* We can write back this queue in page reclaim */
1162 current->backing_dev_info = mapping->backing_dev_info; 1162 current->backing_dev_info = inode_to_bdi(inode);
1163 1163
1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1165 if (err) 1165 if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1464{ 1464{
1465 struct inode *inode = req->inode; 1465 struct inode *inode = req->inode;
1466 struct fuse_inode *fi = get_fuse_inode(inode); 1466 struct fuse_inode *fi = get_fuse_inode(inode);
1467 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; 1467 struct backing_dev_info *bdi = inode_to_bdi(inode);
1468 int i; 1468 int i;
1469 1469
1470 list_del(&req->writepages_entry); 1470 list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
1658 req->end = fuse_writepage_end; 1658 req->end = fuse_writepage_end;
1659 req->inode = inode; 1659 req->inode = inode;
1660 1660
1661 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); 1661 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1663 1663
1664 spin_lock(&fc->lock); 1664 spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
1768 1768
1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || 1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
1770 old_req->state == FUSE_REQ_PENDING)) { 1770 old_req->state == FUSE_REQ_PENDING)) {
1771 struct backing_dev_info *bdi = page->mapping->backing_dev_info; 1771 struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
1772 1772
1773 copy_highpage(old_req->pages[0], page); 1773 copy_highpage(old_req->pages[0], page);
1774 spin_unlock(&fc->lock); 1774 spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
1872 req->page_descs[req->num_pages].offset = 0; 1872 req->page_descs[req->num_pages].offset = 0;
1873 req->page_descs[req->num_pages].length = PAGE_SIZE; 1873 req->page_descs[req->num_pages].length = PAGE_SIZE;
1874 1874
1875 inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); 1875 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1877 1877
1878 err = 0; 1878 err = 0;
@@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
2062 .fault = filemap_fault, 2062 .fault = filemap_fault,
2063 .map_pages = filemap_map_pages, 2063 .map_pages = filemap_map_pages,
2064 .page_mkwrite = fuse_page_mkwrite, 2064 .page_mkwrite = fuse_page_mkwrite,
2065 .remap_pages = generic_file_remap_pages,
2066}; 2065};
2067 2066
2068static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2067static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
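Every fuse hunk above is the same mechanical conversion: stop reading a bdi pointer cached in the address_space and derive it on demand with the inode_to_bdi() helper exported near the top of the fs-writeback diff. A standalone sketch of that helper's shape, with stand-in types (the real one also special-cases block-device inodes):

struct bdi_sketch { const char *name; };
struct sb_sketch { struct bdi_sketch *s_bdi; };
struct inode_sketch { struct sb_sketch *i_sb; };

static struct bdi_sketch noop_bdi = { "noop" };

static struct bdi_sketch *inode_to_bdi_sketch(struct inode_sketch *inode)
{
	if (!inode)
		return &noop_bdi;	/* mirrors the NULL check above */
	return inode->i_sb->s_bdi;
}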
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f38256e4476e..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
308 if (!fc->writeback_cache || !S_ISREG(attr->mode)) 308 if (!fc->writeback_cache || !S_ISREG(attr->mode))
309 inode->i_flags |= S_NOCMTIME; 309 inode->i_flags |= S_NOCMTIME;
310 inode->i_generation = generation; 310 inode->i_generation = generation;
311 inode->i_data.backing_dev_info = &fc->bdi;
312 fuse_init_inode(inode, attr); 311 fuse_init_inode(inode, attr);
313 unlock_new_inode(inode); 312 unlock_new_inode(inode);
314 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { 313 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3088e2a38e30..7b3143064af1 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
73 73
74 BUG_ON(name == NULL); 74 BUG_ON(name == NULL);
75 75
76 if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) 76 if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
77 return -E2BIG; 77 return -E2BIG;
78 78
79 if (type == ACL_TYPE_ACCESS) { 79 if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
289 if (!clear_page_dirty_for_io(page)) 289 if (!clear_page_dirty_for_io(page))
290 goto continue_unlock; 290 goto continue_unlock;
291 291
292 trace_wbc_writepage(wbc, mapping->backing_dev_info); 292 trace_wbc_writepage(wbc, inode_to_bdi(inode));
293 293
294 ret = __gfs2_jdata_writepage(page, wbc); 294 ret = __gfs2_jdata_writepage(page, wbc);
295 if (unlikely(ret)) { 295 if (unlikely(ret)) {
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f09e228..6371192961e2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1896 1896
1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); 1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
1898 if (ht == NULL) 1898 if (ht == NULL)
1899 ht = vzalloc(size); 1899 ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
1900 PAGE_KERNEL);
1900 if (!ht) 1901 if (!ht)
1901 return -ENOMEM; 1902 return -ENOMEM;
1902 1903
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..3e32bb8e2d7e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = {
498 .fault = filemap_fault, 498 .fault = filemap_fault,
499 .map_pages = filemap_map_pages, 499 .map_pages = filemap_map_pages,
500 .page_mkwrite = gfs2_page_mkwrite, 500 .page_mkwrite = gfs2_page_mkwrite,
501 .remap_pages = generic_file_remap_pages,
502}; 501};
503 502
504/** 503/**
@@ -655,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
655{ 654{
656 struct address_space *mapping = file->f_mapping; 655 struct address_space *mapping = file->f_mapping;
657 struct inode *inode = mapping->host; 656 struct inode *inode = mapping->host;
658 int sync_state = inode->i_state & I_DIRTY; 657 int sync_state = inode->i_state & I_DIRTY_ALL;
659 struct gfs2_inode *ip = GFS2_I(inode); 658 struct gfs2_inode *ip = GFS2_I(inode);
660 int ret = 0, ret1 = 0; 659 int ret = 0, ret1 = 0;
661 660
@@ -668,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
668 if (!gfs2_is_jdata(ip)) 667 if (!gfs2_is_jdata(ip))
669 sync_state &= ~I_DIRTY_PAGES; 668 sync_state &= ~I_DIRTY_PAGES;
670 if (datasync) 669 if (datasync)
671 sync_state &= ~I_DIRTY_SYNC; 670 sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
672 671
673 if (sync_state) { 672 if (sync_state) {
674 ret = sync_inode_metadata(inode, 1); 673 ret = sync_inode_metadata(inode, 1);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524aa3eac..f42dffba056a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
173 spin_unlock(&lru_lock); 173 spin_unlock(&lru_lock);
174} 174}
175 175
176static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) 176static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
177{ 177{
178 spin_lock(&lru_lock);
178 if (!list_empty(&gl->gl_lru)) { 179 if (!list_empty(&gl->gl_lru)) {
179 list_del_init(&gl->gl_lru); 180 list_del_init(&gl->gl_lru);
180 atomic_dec(&lru_count); 181 atomic_dec(&lru_count);
181 clear_bit(GLF_LRU, &gl->gl_flags); 182 clear_bit(GLF_LRU, &gl->gl_flags);
182 } 183 }
183}
184
185static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
186{
187 spin_lock(&lru_lock);
188 __gfs2_glock_remove_from_lru(gl);
189 spin_unlock(&lru_lock); 184 spin_unlock(&lru_lock);
190} 185}
191 186
@@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
205 200
206 lockref_mark_dead(&gl->gl_lockref); 201 lockref_mark_dead(&gl->gl_lockref);
207 202
208 spin_lock(&lru_lock); 203 gfs2_glock_remove_from_lru(gl);
209 __gfs2_glock_remove_from_lru(gl);
210 spin_unlock(&lru_lock);
211 spin_unlock(&gl->gl_lockref.lock); 204 spin_unlock(&gl->gl_lockref.lock);
212 spin_lock_bucket(gl->gl_hash); 205 spin_lock_bucket(gl->gl_hash);
213 hlist_bl_del_rcu(&gl->gl_list); 206 hlist_bl_del_rcu(&gl->gl_list);
@@ -775,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
775 mapping->flags = 0; 768 mapping->flags = 0;
776 mapping_set_gfp_mask(mapping, GFP_NOFS); 769 mapping_set_gfp_mask(mapping, GFP_NOFS);
777 mapping->private_data = NULL; 770 mapping->private_data = NULL;
778 mapping->backing_dev_info = s->s_bdi;
779 mapping->writeback_index = 0; 771 mapping->writeback_index = 0;
780 } 772 }
781 773
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002ebe70..73c72253faac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
543 } 543 }
544 544
545 error = gfs2_dir_add(&dip->i_inode, name, ip, da); 545 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
546 if (error)
547 goto fail_end_trans;
548 546
549fail_end_trans:
550 gfs2_trans_end(sdp); 547 gfs2_trans_end(sdp);
551fail_ipreserv: 548fail_ipreserv:
552 gfs2_inplace_release(dip); 549 gfs2_inplace_release(dip);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
112 mapping->flags = 0; 112 mapping->flags = 0;
113 mapping_set_gfp_mask(mapping, GFP_NOFS); 113 mapping_set_gfp_mask(mapping, GFP_NOFS);
114 mapping->private_data = NULL; 114 mapping->private_data = NULL;
115 mapping->backing_dev_info = sb->s_bdi;
116 mapping->writeback_index = 0; 115 mapping->writeback_index = 0;
117 116
118 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e193cb36996..3aa17d4d1cfc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list)
145} 145}
146 146
147 147
148static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) 148static enum lru_status gfs2_qd_isolate(struct list_head *item,
149 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
149{ 150{
150 struct list_head *dispose = arg; 151 struct list_head *dispose = arg;
151 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); 152 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
155 156
156 if (qd->qd_lockref.count == 0) { 157 if (qd->qd_lockref.count == 0) {
157 lockref_mark_dead(&qd->qd_lockref); 158 lockref_mark_dead(&qd->qd_lockref);
158 list_move(&qd->qd_lru, dispose); 159 list_lru_isolate_move(lru, &qd->qd_lru, dispose);
159 } 160 }
160 161
161 spin_unlock(&qd->qd_lockref.lock); 162 spin_unlock(&qd->qd_lockref.lock);
@@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
171 if (!(sc->gfp_mask & __GFP_FS)) 172 if (!(sc->gfp_mask & __GFP_FS))
172 return SHRINK_STOP; 173 return SHRINK_STOP;
173 174
174 freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, 175 freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
175 &dispose, &sc->nr_to_scan); 176 gfs2_qd_isolate, &dispose);
176 177
177 gfs2_qd_dispose(&dispose); 178 gfs2_qd_dispose(&dispose);
178 179
@@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
182static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, 183static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
183 struct shrink_control *sc) 184 struct shrink_control *sc)
184{ 185{
185 return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); 186 return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
186} 187}
187 188
188struct shrinker gfs2_qd_shrinker = { 189struct shrinker gfs2_qd_shrinker = {
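Both the gfs2/quota.c hunks above and the fs/inode.c hunks below move to the new list_lru walker API: the isolate callback now receives the internal per-node list (struct list_lru_one) and must detach items through list_lru_isolate()/list_lru_isolate_move() so the lru's element counts stay consistent. A kernel-style sketch of a callback under the new signature; this is an illustrative fragment against kernel headers, not standalone code:

static enum lru_status demo_isolate(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *dispose = arg;

	/* hand removal to the lru code instead of a bare list_move() */
	list_lru_isolate_move(lru, item, dispose);
	return LRU_REMOVED;
}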
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b758fa..1b645773c98e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
439 439
440 ls->ls_recover_jid_done = jid; 440 ls->ls_recover_jid_done = jid;
441 ls->ls_recover_jid_status = message; 441 ls->ls_recover_jid_status = message;
442 sprintf(env_jid, "JID=%d", jid); 442 sprintf(env_jid, "JID=%u", jid);
443 sprintf(env_status, "RECOVERY=%s", 443 sprintf(env_status, "RECOVERY=%s",
444 message == LM_RD_SUCCESS ? "Done" : "Failed"); 444 message == LM_RD_SUCCESS ? "Done" : "Failed");
445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
743 struct gfs2_inode *ip = GFS2_I(inode); 743 struct gfs2_inode *ip = GFS2_I(inode);
744 struct gfs2_sbd *sdp = GFS2_SB(inode); 744 struct gfs2_sbd *sdp = GFS2_SB(inode);
745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); 745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
746 struct backing_dev_info *bdi = metamapping->backing_dev_info; 746 struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
747 int ret = 0; 747 int ret = 0;
748 748
749 if (wbc->sync_mode == WB_SYNC_ALL) 749 if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566ba5696..ae8e8811f0e8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
96 struct super_block *sb = sdp->sd_vfs; 96 struct super_block *sb = sdp->sd_vfs;
97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; 97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
98 98
99 return snprintf(buf, PAGE_SIZE, "%u\n", frozen); 99 return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
100} 100}
101 101
102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f593f8..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); 62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
63} 63}
64 64
65static struct backing_dev_info hugetlbfs_backing_dev_info = {
66 .name = "hugetlbfs",
67 .ra_pages = 0, /* No readahead */
68 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
69};
70
71int sysctl_hugetlb_shm_group; 65int sysctl_hugetlb_shm_group;
72 66
73enum { 67enum {
@@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
498 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, 492 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
499 &hugetlbfs_i_mmap_rwsem_key); 493 &hugetlbfs_i_mmap_rwsem_key);
500 inode->i_mapping->a_ops = &hugetlbfs_aops; 494 inode->i_mapping->a_ops = &hugetlbfs_aops;
501 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 495 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
503 inode->i_mapping->private_data = resv_map; 496 inode->i_mapping->private_data = resv_map;
504 info = HUGETLBFS_I(inode); 497 info = HUGETLBFS_I(inode);
@@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void)
1032 return -ENOTSUPP; 1025 return -ENOTSUPP;
1033 } 1026 }
1034 1027
1035 error = bdi_init(&hugetlbfs_backing_dev_info);
1036 if (error)
1037 return error;
1038
1039 error = -ENOMEM; 1028 error = -ENOMEM;
1040 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1029 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1041 sizeof(struct hugetlbfs_inode_info), 1030 sizeof(struct hugetlbfs_inode_info),
@@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void)
1071 out: 1060 out:
1072 kmem_cache_destroy(hugetlbfs_inode_cachep); 1061 kmem_cache_destroy(hugetlbfs_inode_cachep);
1073 out2: 1062 out2:
1074 bdi_destroy(&hugetlbfs_backing_dev_info);
1075 return error; 1063 return error;
1076} 1064}
1077 1065
@@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void)
1091 for_each_hstate(h) 1079 for_each_hstate(h)
1092 kern_unmount(hugetlbfs_vfsmount[i++]); 1080 kern_unmount(hugetlbfs_vfsmount[i++]);
1093 unregister_filesystem(&hugetlbfs_fs_type); 1081 unregister_filesystem(&hugetlbfs_fs_type);
1094 bdi_destroy(&hugetlbfs_backing_dev_info);
1095} 1082}
1096 1083
1097module_init(init_hugetlbfs_fs) 1084module_init(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..f00b16f45507 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/buffer_head.h> /* for inode_has_buffers */ 18#include <linux/buffer_head.h> /* for inode_has_buffers */
19#include <linux/ratelimit.h> 19#include <linux/ratelimit.h>
20#include <linux/list_lru.h> 20#include <linux/list_lru.h>
21#include <trace/events/writeback.h>
21#include "internal.h" 22#include "internal.h"
22 23
23/* 24/*
@@ -30,7 +31,7 @@
30 * inode_sb_list_lock protects: 31 * inode_sb_list_lock protects:
31 * sb->s_inodes, inode->i_sb_list 32 * sb->s_inodes, inode->i_sb_list
32 * bdi->wb.list_lock protects: 33 * bdi->wb.list_lock protects:
33 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 34 * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
34 * inode_hash_lock protects: 35 * inode_hash_lock protects:
35 * inode_hashtable, inode->i_hash 36 * inode_hashtable, inode->i_hash
36 * 37 *
@@ -170,20 +171,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
170 atomic_set(&mapping->i_mmap_writable, 0); 171 atomic_set(&mapping->i_mmap_writable, 0);
171 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 172 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
172 mapping->private_data = NULL; 173 mapping->private_data = NULL;
173 mapping->backing_dev_info = &default_backing_dev_info;
174 mapping->writeback_index = 0; 174 mapping->writeback_index = 0;
175
176 /*
177 * If the block_device provides a backing_dev_info for client
178 * inodes then use that. Otherwise the inode share the bdev's
179 * backing_dev_info.
180 */
181 if (sb->s_bdev) {
182 struct backing_dev_info *bdi;
183
184 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
185 mapping->backing_dev_info = bdi;
186 }
187 inode->i_private = NULL; 175 inode->i_private = NULL;
188 inode->i_mapping = mapping; 176 inode->i_mapping = mapping;
189 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ 177 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -194,7 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
194#ifdef CONFIG_FSNOTIFY 182#ifdef CONFIG_FSNOTIFY
195 inode->i_fsnotify_mask = 0; 183 inode->i_fsnotify_mask = 0;
196#endif 184#endif
197 185 inode->i_flctx = NULL;
198 this_cpu_inc(nr_inodes); 186 this_cpu_inc(nr_inodes);
199 187
200 return 0; 188 return 0;
@@ -237,6 +225,7 @@ void __destroy_inode(struct inode *inode)
237 BUG_ON(inode_has_buffers(inode)); 225 BUG_ON(inode_has_buffers(inode));
238 security_inode_free(inode); 226 security_inode_free(inode);
239 fsnotify_inode_delete(inode); 227 fsnotify_inode_delete(inode);
228 locks_free_lock_context(inode->i_flctx);
240 if (!inode->i_nlink) { 229 if (!inode->i_nlink) {
241 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); 230 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
242 atomic_long_dec(&inode->i_sb->s_remove_count); 231 atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -355,7 +344,6 @@ void address_space_init_once(struct address_space *mapping)
355 INIT_LIST_HEAD(&mapping->private_list); 344 INIT_LIST_HEAD(&mapping->private_list);
356 spin_lock_init(&mapping->private_lock); 345 spin_lock_init(&mapping->private_lock);
357 mapping->i_mmap = RB_ROOT; 346 mapping->i_mmap = RB_ROOT;
358 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
359} 347}
360EXPORT_SYMBOL(address_space_init_once); 348EXPORT_SYMBOL(address_space_init_once);
361 349
@@ -416,7 +404,8 @@ static void inode_lru_list_add(struct inode *inode)
416 */ 404 */
417void inode_add_lru(struct inode *inode) 405void inode_add_lru(struct inode *inode)
418{ 406{
419 if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && 407 if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
408 I_FREEING | I_WILL_FREE)) &&
420 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) 409 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
421 inode_lru_list_add(inode); 410 inode_lru_list_add(inode);
422} 411}
@@ -647,7 +636,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
647 spin_unlock(&inode->i_lock); 636 spin_unlock(&inode->i_lock);
648 continue; 637 continue;
649 } 638 }
650 if (inode->i_state & I_DIRTY && !kill_dirty) { 639 if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
651 spin_unlock(&inode->i_lock); 640 spin_unlock(&inode->i_lock);
652 busy = 1; 641 busy = 1;
653 continue; 642 continue;
@@ -685,8 +674,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
685 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 674 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
686 * with this flag set because they are the inodes that are out of order. 675 * with this flag set because they are the inodes that are out of order.
687 */ 676 */
688static enum lru_status 677static enum lru_status inode_lru_isolate(struct list_head *item,
689inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 678 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
690{ 679{
691 struct list_head *freeable = arg; 680 struct list_head *freeable = arg;
692 struct inode *inode = container_of(item, struct inode, i_lru); 681 struct inode *inode = container_of(item, struct inode, i_lru);
@@ -704,7 +693,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
704 */ 693 */
705 if (atomic_read(&inode->i_count) || 694 if (atomic_read(&inode->i_count) ||
706 (inode->i_state & ~I_REFERENCED)) { 695 (inode->i_state & ~I_REFERENCED)) {
707 list_del_init(&inode->i_lru); 696 list_lru_isolate(lru, &inode->i_lru);
708 spin_unlock(&inode->i_lock); 697 spin_unlock(&inode->i_lock);
709 this_cpu_dec(nr_unused); 698 this_cpu_dec(nr_unused);
710 return LRU_REMOVED; 699 return LRU_REMOVED;
@@ -738,7 +727,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
738 727
739 WARN_ON(inode->i_state & I_NEW); 728 WARN_ON(inode->i_state & I_NEW);
740 inode->i_state |= I_FREEING; 729 inode->i_state |= I_FREEING;
741 list_move(&inode->i_lru, freeable); 730 list_lru_isolate_move(lru, &inode->i_lru, freeable);
742 spin_unlock(&inode->i_lock); 731 spin_unlock(&inode->i_lock);
743 732
744 this_cpu_dec(nr_unused); 733 this_cpu_dec(nr_unused);
@@ -751,14 +740,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
751 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 740 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
752 * then are freed outside inode_lock by dispose_list(). 741 * then are freed outside inode_lock by dispose_list().
753 */ 742 */
754long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 743long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
755 int nid)
756{ 744{
757 LIST_HEAD(freeable); 745 LIST_HEAD(freeable);
758 long freed; 746 long freed;
759 747
760 freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, 748 freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
761 &freeable, &nr_to_scan); 749 inode_lru_isolate, &freeable);
762 dispose_list(&freeable); 750 dispose_list(&freeable);
763 return freed; 751 return freed;
764} 752}
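[Editor's note] The dropped nid parameter is not lost information: struct shrink_control carries nr_to_scan and the NUMA node id together, so the shrinker core can hand the whole control structure through. A hypothetical caller shape (mirroring what super_cache_scan() in fs/super.c does elsewhere in this series; the names here are illustrative, not from this hunk):

	sc->nr_to_scan = inodes;	/* this sb's share of the scan budget */
	freed += prune_icache_sb(sb, sc);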
@@ -1282,6 +1270,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
1282} 1270}
1283EXPORT_SYMBOL(ilookup); 1271EXPORT_SYMBOL(ilookup);
1284 1272
1273/**
1274 * find_inode_nowait - find an inode in the inode cache
1275 * @sb: super block of file system to search
1276 * @hashval: hash value (usually inode number) to search for
1277 * @match: callback used for comparisons between inodes
1278 * @data: opaque data pointer to pass to @match
1279 *
1280 * Search for the inode specified by @hashval and @data in the inode
1281 * cache, where the helper function @match will return 0 if the inode
1282 * does not match, 1 if the inode does match, and -1 if the search
1283 * should be stopped. The @match function must be responsible for
1284 * taking the i_lock spin_lock and checking i_state for an inode being
1285 * freed or being initialized, and incrementing the reference count
1286 * before returning 1. It also must not sleep, since it is called with
1287 * the inode_hash_lock spinlock held.
1288 *
 1289 * This is an even more generalized version of ilookup5() when the
 1290 * function must never block --- find_inode() can block in
 1291 * __wait_on_freeing_inode() --- or when the caller cannot increment
 1292 * the reference count because the resulting iput() might cause an
 1293 * inode eviction. The tradeoff is that the @match function must be
 1294 * very carefully implemented.
1295 */
1296struct inode *find_inode_nowait(struct super_block *sb,
1297 unsigned long hashval,
1298 int (*match)(struct inode *, unsigned long,
1299 void *),
1300 void *data)
1301{
1302 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1303 struct inode *inode, *ret_inode = NULL;
1304 int mval;
1305
1306 spin_lock(&inode_hash_lock);
1307 hlist_for_each_entry(inode, head, i_hash) {
1308 if (inode->i_sb != sb)
1309 continue;
1310 mval = match(inode, hashval, data);
1311 if (mval == 0)
1312 continue;
1313 if (mval == 1)
1314 ret_inode = inode;
1315 goto out;
1316 }
1317out:
1318 spin_unlock(&inode_hash_lock);
1319 return ret_inode;
1320}
1321EXPORT_SYMBOL(find_inode_nowait);
1322
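[Editor's note] A minimal sketch of a @match callback honoring the contract documented above; my_match is hypothetical and not part of this patch. It takes i_lock itself, skips inodes that are being freed or still being initialized, and bumps the reference count before returning 1, all without sleeping, since inode_hash_lock is held across the call:

static int my_match(struct inode *inode, unsigned long hashval, void *data)
{
	int ret = 0;	/* 0 = no match, keep scanning */

	spin_lock(&inode->i_lock);
	if (inode->i_ino == hashval &&	/* a real user would also consult @data */
	    !(inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW))) {
		atomic_inc(&inode->i_count);	/* reference taken under i_lock */
		ret = 1;
	}
	spin_unlock(&inode->i_lock);
	return ret;
}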
1285int insert_inode_locked(struct inode *inode) 1323int insert_inode_locked(struct inode *inode)
1286{ 1324{
1287 struct super_block *sb = inode->i_sb; 1325 struct super_block *sb = inode->i_sb;
@@ -1432,11 +1470,20 @@ static void iput_final(struct inode *inode)
1432 */ 1470 */
1433void iput(struct inode *inode) 1471void iput(struct inode *inode)
1434{ 1472{
1435 if (inode) { 1473 if (!inode)
1436 BUG_ON(inode->i_state & I_CLEAR); 1474 return;
1437 1475 BUG_ON(inode->i_state & I_CLEAR);
1438 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1476retry:
1439 iput_final(inode); 1477 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
1478 if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
1479 atomic_inc(&inode->i_count);
1480 inode->i_state &= ~I_DIRTY_TIME;
1481 spin_unlock(&inode->i_lock);
1482 trace_writeback_lazytime_iput(inode);
1483 mark_inode_dirty_sync(inode);
1484 goto retry;
1485 }
1486 iput_final(inode);
1440 } 1487 }
1441} 1488}
1442EXPORT_SYMBOL(iput); 1489EXPORT_SYMBOL(iput);
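[Editor's note] The retry loop above is the lazytime half of this series: if the inode still has a link count and carries only deferred timestamp updates (I_DIRTY_TIME), iput() briefly re-takes a reference, promotes the deferred state to a real dirty via mark_inode_dirty_sync(), and retries the final put, so cached timestamps reach disk before the inode can be evicted.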
@@ -1495,14 +1542,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1495 return 0; 1542 return 0;
1496} 1543}
1497 1544
1498/* 1545int generic_update_time(struct inode *inode, struct timespec *time, int flags)
1499 * This does the actual work of updating an inodes time or version. Must have
1500 * had called mnt_want_write() before calling this.
1501 */
1502static int update_time(struct inode *inode, struct timespec *time, int flags)
1503{ 1546{
1504 if (inode->i_op->update_time) 1547 int iflags = I_DIRTY_TIME;
1505 return inode->i_op->update_time(inode, time, flags);
1506 1548
1507 if (flags & S_ATIME) 1549 if (flags & S_ATIME)
1508 inode->i_atime = *time; 1550 inode->i_atime = *time;
@@ -1512,9 +1554,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
1512 inode->i_ctime = *time; 1554 inode->i_ctime = *time;
1513 if (flags & S_MTIME) 1555 if (flags & S_MTIME)
1514 inode->i_mtime = *time; 1556 inode->i_mtime = *time;
1515 mark_inode_dirty_sync(inode); 1557
1558 if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
1559 iflags |= I_DIRTY_SYNC;
1560 __mark_inode_dirty(inode, iflags);
1516 return 0; 1561 return 0;
1517} 1562}
1563EXPORT_SYMBOL(generic_update_time);
1564
1565/*
 1566 * This does the actual work of updating an inode's time or version. The
 1567 * caller must have called mnt_want_write() before calling this.
1568 */
1569static int update_time(struct inode *inode, struct timespec *time, int flags)
1570{
1571 int (*update_time)(struct inode *, struct timespec *, int);
1572
1573 update_time = inode->i_op->update_time ? inode->i_op->update_time :
1574 generic_update_time;
1575
1576 return update_time(inode, time, flags);
1577}
1518 1578
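[Editor's note] A hypothetical consumer, to show why generic_update_time() is now exported: a filesystem's own ->update_time method can do its private bookkeeping and then defer the dirty-flag decision (including the new MS_LAZYTIME case) to the generic helper. myfs_update_time is an invented name; ext4's handler elsewhere in this series has roughly this shape:

static int myfs_update_time(struct inode *inode, struct timespec *time,
			    int flags)
{
	/* filesystem-specific work (e.g. starting a journal handle) here */
	return generic_update_time(inode, time, flags);
}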
1519/** 1579/**
1520 * touch_atime - update the access time 1580 * touch_atime - update the access time
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..30459dab409d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
14struct linux_binprm; 14struct linux_binprm;
15struct path; 15struct path;
16struct mount; 16struct mount;
17struct shrink_control;
17 18
18/* 19/*
19 * block_dev.c 20 * block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
111 * inode.c 112 * inode.c
112 */ 113 */
113extern spinlock_t inode_sb_list_lock; 114extern spinlock_t inode_sb_list_lock;
114extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 115extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
115 int nid);
116extern void inode_add_lru(struct inode *inode); 116extern void inode_add_lru(struct inode *inode);
117 117
118/* 118/*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
129 */ 129 */
130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); 130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
131extern int d_set_mounted(struct dentry *dentry); 131extern int d_set_mounted(struct dentry *dentry);
132extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 132extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
133 int nid);
134 133
135/* 134/*
136 * read_write.c 135 * read_write.c
@@ -145,7 +144,7 @@ extern const struct file_operations pipefifo_fops;
145/* 144/*
146 * fs_pin.c 145 * fs_pin.c
147 */ 146 */
148extern void sb_pin_kill(struct super_block *sb); 147extern void group_pin_kill(struct hlist_head *p);
149extern void mnt_pin_kill(struct mount *m); 148extern void mnt_pin_kill(struct mount *m);
150 149
151/* 150/*
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 214c3c11fbc2..5d01d2638ca5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode,
379 past_eof = true; 379 past_eof = true;
380 } 380 }
381 cond_resched(); 381 cond_resched();
382 if (fatal_signal_pending(current)) {
383 ret = -EINTR;
384 break;
385 }
386
382 } while (1); 387 } while (1);
383 388
384 /* If ret is 1 then we just hit the end of the extent array */ 389 /* If ret is 1 then we just hit the end of the extent array */
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index 01e1ee7a998b..005a15cfd30a 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -2,6 +2,7 @@
2 * linux/fs/isofs/util.c 2 * linux/fs/isofs/util.c
3 */ 3 */
4 4
5#include <linux/time.h>
5#include "isofs.h" 6#include "isofs.h"
6 7
7/* 8/*
@@ -17,9 +18,9 @@
17int iso_date(char * p, int flag) 18int iso_date(char * p, int flag)
18{ 19{
19 int year, month, day, hour, minute, second, tz; 20 int year, month, day, hour, minute, second, tz;
20 int crtime, days, i; 21 int crtime;
21 22
22 year = p[0] - 70; 23 year = p[0];
23 month = p[1]; 24 month = p[1];
24 day = p[2]; 25 day = p[2];
25 hour = p[3]; 26 hour = p[3];
@@ -31,18 +32,7 @@ int iso_date(char * p, int flag)
31 if (year < 0) { 32 if (year < 0) {
32 crtime = 0; 33 crtime = 0;
33 } else { 34 } else {
34 int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; 35 crtime = mktime64(year+1900, month, day, hour, minute, second);
35
36 days = year * 365;
37 if (year > 2)
38 days += (year+1) / 4;
39 for (i = 1; i < month; i++)
40 days += monlen[i-1];
41 if (((year+2) % 4) == 0 && month > 2)
42 days++;
43 days += day - 1;
44 crtime = ((((days * 24) + hour) * 60 + minute) * 60)
45 + second;
46 36
47 /* sign extend */ 37 /* sign extend */
48 if (tz & 0x80) 38 if (tz & 0x80)
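[Editor's note] Two changes are folded together in this hunk: ISO 9660 stores the year as an offset from 1900, and mktime64() expects a full calendar year, so the old `p[0] - 70` epoch offset becomes a plain `p[0]` with the +1900 applied at the call site; the removed monlen[] loop was hand-rolled leap-year arithmetic that mktime64() already performs. An illustrative call, assuming a disk field of 115:

	/* 17 Feb 2015 12:00:00 UTC; p[0] holds 115 (= 2015 - 1900) */
	time64_t t = mktime64(115 + 1900, 2, 17, 12, 0, 0);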
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 92e0644bf867..556de100ebd5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp)
84 return bit; 84 return bit;
85} 85}
86 86
87static inline int pulledbits(struct pushpull *pp)
88{
89 return pp->ofs;
90}
91
92 87
93static void init_rubin(struct rubin_state *rs, int div, int *bits) 88static void init_rubin(struct rubin_state *rs, int div, int *bits)
94{ 89{
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7654e87b0428..9ad5ba4b299b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
510 sumlen = c->sector_size - je32_to_cpu(sm->offset); 510 sumlen = c->sector_size - je32_to_cpu(sm->offset);
511 sumptr = buf + buf_size - sumlen; 511 sumptr = buf + buf_size - sumlen;
512 512
 513 /* sm->offset may be wrong but MAGIC may be right */
514 if (sumlen > c->sector_size)
515 goto full_scan;
516
513 /* Now, make sure the summary itself is available */ 517 /* Now, make sure the summary itself is available */
514 if (sumlen > buf_size) { 518 if (sumlen > buf_size) {
515 /* Need to kmalloc for this. */ 519 /* Need to kmalloc for this. */
@@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
544 } 548 }
545 } 549 }
546 550
551full_scan:
547 buf_ofs = jeb->offset; 552 buf_ofs = jeb->offset;
548 553
549 if (!buf_size) { 554 if (!buf_size) {
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
deleted file mode 100644
index fa92f7f1d0d0..000000000000
--- a/fs/jfs/endian24.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_ENDIAN24
19#define _H_ENDIAN24
20
21/*
22 * endian24.h:
23 *
24 * Endian conversion for 24-byte data
25 *
26 */
27#define __swab24(x) \
28({ \
29 __u32 __x = (x); \
30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34})
35
36#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
37 #define __cpu_to_le24(x) ((__u32)(x))
38 #define __le24_to_cpu(x) ((__u32)(x))
39#else
40 #define __cpu_to_le24(x) __swab24(x)
41 #define __le24_to_cpu(x) __swab24(x)
42#endif
43
44#ifdef __KERNEL__
45 #define cpu_to_le24 __cpu_to_le24
46 #define le24_to_cpu __le24_to_cpu
47#endif
48
49#endif /* !_H_ENDIAN24 */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc1f8b8..10815f8dfd8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
39 return rc; 39 return rc;
40 40
41 mutex_lock(&inode->i_mutex); 41 mutex_lock(&inode->i_mutex);
42 if (!(inode->i_state & I_DIRTY) || 42 if (!(inode->i_state & I_DIRTY_ALL) ||
43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { 43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
44 /* Make sure committed changes hit the disk */ 44 /* Make sure committed changes hit the disk */
45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); 45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 984c2bbf4f61..d88576e23fe4 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -1040,8 +1040,8 @@ static int dtSplitUp(tid_t tid,
1040 pxdlist.maxnpxd = 1; 1040 pxdlist.maxnpxd = 1;
1041 pxdlist.npxd = 0; 1041 pxdlist.npxd = 0;
1042 pxd = &pxdlist.pxd[0]; 1042 pxd = &pxdlist.pxd[0];
1043 PXDaddress(pxd, nxaddr) 1043 PXDaddress(pxd, nxaddr);
1044 PXDlength(pxd, xlen + n); 1044 PXDlength(pxd, xlen + n);
1045 split->pxdlist = &pxdlist; 1045 split->pxdlist = &pxdlist;
1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) { 1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1047 nxaddr = addressPXD(pxd); 1047 nxaddr = addressPXD(pxd);
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 43ea3713c083..8f602dcb51fa 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -30,8 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/nls.h> 31#include <linux/nls.h>
32 32
33#include "endian24.h"
34
35/* 33/*
36 * transaction and lock id's 34 * transaction and lock id's
37 * 35 *
@@ -59,26 +57,42 @@ struct timestruc_t {
59 57
60/* 58/*
61 * physical xd (pxd) 59 * physical xd (pxd)
60 *
61 * The leftmost 24 bits of len_addr are the extent length.
 62 * The rightmost 8 bits of len_addr are the most significant bits of
 63 * the extent address.
62 */ 64 */
63typedef struct { 65typedef struct {
64 unsigned len:24; 66 __le32 len_addr;
65 unsigned addr1:8;
66 __le32 addr2; 67 __le32 addr2;
67} pxd_t; 68} pxd_t;
68 69
69/* xd_t field construction */ 70/* xd_t field construction */
70 71
71#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) 72static inline void PXDlength(pxd_t *pxd, __u32 len)
72#define PXDaddress(pxd, address64)\ 73{
73{\ 74 pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) |
74 (pxd)->addr1 = ((s64)address64) >> 32;\ 75 cpu_to_le32(len & 0xffffff);
75 (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ 76}
77
78static inline void PXDaddress(pxd_t *pxd, __u64 addr)
79{
80 pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) |
81 cpu_to_le32((addr >> 32)<<24);
82 pxd->addr2 = cpu_to_le32(addr & 0xffffffff);
76} 83}
77 84
78/* xd_t field extraction */ 85/* xd_t field extraction */
79#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) 86static inline __u32 lengthPXD(pxd_t *pxd)
80#define addressPXD(pxd)\ 87{
81 ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) 88 return le32_to_cpu((pxd)->len_addr) & 0xffffff;
89}
90
91static inline __u64 addressPXD(pxd_t *pxd)
92{
93 __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff;
94 return (n << 8) + le32_to_cpu(pxd->addr2);
95}
82 96
83#define MAXTREEHEIGHT 8 97#define MAXTREEHEIGHT 8
84/* pxd list */ 98/* pxd list */
@@ -93,12 +107,10 @@ struct pxdlist {
93 * data extent descriptor (dxd) 107 * data extent descriptor (dxd)
94 */ 108 */
95typedef struct { 109typedef struct {
96 unsigned flag:8; /* 1: flags */ 110 __u8 flag; /* 1: flags */
97 unsigned rsrvd:24; 111 __u8 rsrvd[3];
98 __le32 size; /* 4: size in byte */ 112 __le32 size; /* 4: size in byte */
99 unsigned len:24; /* 3: length in unit of fsblksize */ 113 pxd_t loc; /* 8: address and length in unit of fsblksize */
100 unsigned addr1:8; /* 1: address in unit of fsblksize */
101 __le32 addr2; /* 4: address in unit of fsblksize */
102} dxd_t; /* - 16 - */ 114} dxd_t; /* - 16 - */
103 115
104/* dxd_t flags */ 116/* dxd_t flags */
@@ -109,12 +121,11 @@ typedef struct {
109#define DXD_CORRUPT 0x08 /* Inconsistency detected */ 121#define DXD_CORRUPT 0x08 /* Inconsistency detected */
110 122
111/* dxd_t field construction 123/* dxd_t field construction
112 * Conveniently, the PXD macros work for DXD
113 */ 124 */
114#define DXDlength PXDlength 125#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len)
115#define DXDaddress PXDaddress 126#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr)
116#define lengthDXD lengthPXD 127#define lengthDXD(dxd) lengthPXD(&(dxd)->loc)
117#define addressDXD addressPXD 128#define addressDXD(dxd) addressPXD(&(dxd)->loc)
118#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) 129#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
119#define sizeDXD(dxd) le32_to_cpu((dxd)->size) 130#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
120 131
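[Editor's note] A worked example of the new pxd_t packing, with made-up values. The on-disk layout is unchanged; only the C representation moves from bitfields (whose ordering is compiler- and endian-dependent) to explicit little-endian masking:

	/*
	 * addr = 0x0123456789 (40-bit block address), len = 0xabc:
	 *   len_addr = cpu_to_le32((0x01 << 24) | 0xabc)  high 8 addr bits | 24-bit len
	 *   addr2    = cpu_to_le32(0x23456789)            low 32 addr bits
	 * addressPXD(): (0x01000000ULL << 8) + 0x23456789 = 0x0123456789
	 * lengthPXD():  0x01000abc & 0xffffff             = 0xabc
	 */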
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 08c0c749b986..1e0987986d5f 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -29,13 +29,11 @@
29 * extent allocation descriptor (xad) 29 * extent allocation descriptor (xad)
30 */ 30 */
31typedef struct xad { 31typedef struct xad {
32 unsigned flag:8; /* 1: flag */ 32 __u8 flag; /* 1: flag */
33 unsigned rsvrd:16; /* 2: reserved */ 33 __u8 rsvrd[2]; /* 2: reserved */
34 unsigned off1:8; /* 1: offset in unit of fsblksize */ 34 __u8 off1; /* 1: offset in unit of fsblksize */
35 __le32 off2; /* 4: offset in unit of fsblksize */ 35 __le32 off2; /* 4: offset in unit of fsblksize */
36 unsigned len:24; /* 3: length in unit of fsblksize */ 36 pxd_t loc; /* 8: length and address in unit of fsblksize */
37 unsigned addr1:8; /* 1: address in unit of fsblksize */
38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */ 37} xad_t; /* (16) */
40 38
41#define MAXXLEN ((1 << 24) - 1) 39#define MAXXLEN ((1 << 24) - 1)
@@ -49,19 +47,14 @@ typedef struct xad {
49 (xad)->off1 = ((u64)offset64) >> 32;\ 47 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ 48 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51} 49}
52#define XADaddress(xad, address64)\ 50#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64)
53{\ 51#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32)
54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58 52
59/* xad_t field extraction */ 53/* xad_t field extraction */
60#define offsetXAD(xad)\ 54#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) 55 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\ 56#define addressXAD(xad) addressPXD(&(xad)->loc)
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) 57#define lengthXAD(xad) lengthPXD(&(xad)->loc)
64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
65 58
66/* xad list */ 59/* xad list */
67struct xadlist { 60struct xadlist {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 16c3a9556634..5d30c56ae075 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -619,8 +619,7 @@ out_mount_failed:
619 iput(sbi->direct_inode); 619 iput(sbi->direct_inode);
620 sbi->direct_inode = NULL; 620 sbi->direct_inode = NULL;
621out_unload: 621out_unload:
622 if (sbi->nls_tab) 622 unload_nls(sbi->nls_tab);
623 unload_nls(sbi->nls_tab);
624out_kfree: 623out_kfree:
625 kfree(sbi); 624 kfree(sbi);
626 return ret; 625 return ret;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d881b381d2b..6acc9648f986 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -411,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn)
411 411
412 if (kernfs_type(kn) == KERNFS_LINK) 412 if (kernfs_type(kn) == KERNFS_LINK)
413 kernfs_put(kn->symlink.target_kn); 413 kernfs_put(kn->symlink.target_kn);
414 if (!(kn->flags & KERNFS_STATIC_NAME)) 414
415 kfree(kn->name); 415 kfree_const(kn->name);
416
416 if (kn->iattr) { 417 if (kn->iattr) {
417 if (kn->iattr->ia_secdata) 418 if (kn->iattr->ia_secdata)
418 security_release_secctx(kn->iattr->ia_secdata, 419 security_release_secctx(kn->iattr->ia_secdata,
@@ -506,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
506 const char *name, umode_t mode, 507 const char *name, umode_t mode,
507 unsigned flags) 508 unsigned flags)
508{ 509{
509 char *dup_name = NULL;
510 struct kernfs_node *kn; 510 struct kernfs_node *kn;
511 int ret; 511 int ret;
512 512
513 if (!(flags & KERNFS_STATIC_NAME)) { 513 name = kstrdup_const(name, GFP_KERNEL);
514 name = dup_name = kstrdup(name, GFP_KERNEL); 514 if (!name)
515 if (!name) 515 return NULL;
516 return NULL;
517 }
518 516
519 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); 517 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
520 if (!kn) 518 if (!kn)
@@ -538,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
538 err_out2: 536 err_out2:
539 kmem_cache_free(kernfs_node_cache, kn); 537 kmem_cache_free(kernfs_node_cache, kn);
540 err_out1: 538 err_out1:
541 kfree(dup_name); 539 kfree_const(name);
542 return NULL; 540 return NULL;
543} 541}
544 542
@@ -1264,7 +1262,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1264 /* rename kernfs_node */ 1262 /* rename kernfs_node */
1265 if (strcmp(kn->name, new_name) != 0) { 1263 if (strcmp(kn->name, new_name) != 0) {
1266 error = -ENOMEM; 1264 error = -ENOMEM;
1267 new_name = kstrdup(new_name, GFP_KERNEL); 1265 new_name = kstrdup_const(new_name, GFP_KERNEL);
1268 if (!new_name) 1266 if (!new_name)
1269 goto out; 1267 goto out;
1270 } else { 1268 } else {
@@ -1285,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1285 1283
1286 kn->ns = new_ns; 1284 kn->ns = new_ns;
1287 if (new_name) { 1285 if (new_name) {
1288 if (!(kn->flags & KERNFS_STATIC_NAME)) 1286 old_name = kn->name;
1289 old_name = kn->name;
1290 kn->flags &= ~KERNFS_STATIC_NAME;
1291 kn->name = new_name; 1287 kn->name = new_name;
1292 } 1288 }
1293 1289
@@ -1297,7 +1293,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1297 kernfs_link_sibling(kn); 1293 kernfs_link_sibling(kn);
1298 1294
1299 kernfs_put(old_parent); 1295 kernfs_put(old_parent);
1300 kfree(old_name); 1296 kfree_const(old_name);
1301 1297
1302 error = 0; 1298 error = 0;
1303 out: 1299 out:
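[Editor's note] What makes dropping KERNFS_STATIC_NAME safe is that kstrdup_const()/kfree_const() (mm/util.c, added in the same cycle) detect .rodata strings at runtime, so callers no longer have to declare whether a name is static. Paraphrased from their implementation:

const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;		/* string literal: share it, no copy */
	return kstrdup(s, gfp);
}

void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);		/* only free what was actually duplicated */
}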
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ddc9f9612f16..b684e8a132e6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -901,7 +901,6 @@ const struct file_operations kernfs_file_fops = {
901 * @ops: kernfs operations for the file 901 * @ops: kernfs operations for the file
902 * @priv: private data for the file 902 * @priv: private data for the file
903 * @ns: optional namespace tag of the file 903 * @ns: optional namespace tag of the file
904 * @name_is_static: don't copy file name
905 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep 904 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
906 * 905 *
907 * Returns the created node on success, ERR_PTR() value on error. 906 * Returns the created node on success, ERR_PTR() value on error.
@@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
911 umode_t mode, loff_t size, 910 umode_t mode, loff_t size,
912 const struct kernfs_ops *ops, 911 const struct kernfs_ops *ops,
913 void *priv, const void *ns, 912 void *priv, const void *ns,
914 bool name_is_static,
915 struct lock_class_key *key) 913 struct lock_class_key *key)
916{ 914{
917 struct kernfs_node *kn; 915 struct kernfs_node *kn;
@@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
919 int rc; 917 int rc;
920 918
921 flags = KERNFS_FILE; 919 flags = KERNFS_FILE;
922 if (name_is_static)
923 flags |= KERNFS_STATIC_NAME;
924 920
925 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); 921 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
926 if (!kn) 922 if (!kn)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 985217626e66..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = {
24 .write_end = simple_write_end, 24 .write_end = simple_write_end,
25}; 25};
26 26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = { 27static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission, 28 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr, 29 .setattr = kernfs_iop_setattr,
@@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = {
40 .listxattr = kernfs_iop_listxattr, 34 .listxattr = kernfs_iop_listxattr,
41}; 35};
42 36
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) 37static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{ 38{
51 static DEFINE_MUTEX(iattr_mutex); 39 static DEFINE_MUTEX(iattr_mutex);
@@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
298 kernfs_get(kn); 286 kernfs_get(kn);
299 inode->i_private = kn; 287 inode->i_private = kn;
300 inode->i_mapping->a_ops = &kernfs_aops; 288 inode->i_mapping->a_ops = &kernfs_aops;
301 inode->i_mapping->backing_dev_info = &kernfs_bdi;
302 inode->i_op = &kernfs_iops; 289 inode->i_op = &kernfs_iops;
303 290
304 set_default_inode_attr(inode, kn->mode); 291 set_default_inode_attr(inode, kn->mode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dc84a3ef9ca2..af9fa7499919 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, 88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size); 89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); 90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92 91
93/* 92/*
94 * dir.c 93 * dir.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9b05f1..8eaf417187f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -246,5 +246,4 @@ void __init kernfs_init(void)
246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache", 246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
247 sizeof(struct kernfs_node), 247 sizeof(struct kernfs_node),
248 0, SLAB_PANIC, NULL); 248 0, SLAB_PANIC, NULL);
249 kernfs_inode_init();
250} 249}
diff --git a/fs/libfs.c b/fs/libfs.c
index 005843ce5dbd..b2ffdb045be4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
948 948
949 mutex_lock(&inode->i_mutex); 949 mutex_lock(&inode->i_mutex);
950 ret = sync_mapping_buffers(inode->i_mapping); 950 ret = sync_mapping_buffers(inode->i_mapping);
951 if (!(inode->i_state & I_DIRTY)) 951 if (!(inode->i_state & I_DIRTY_ALL))
952 goto out; 952 goto out;
953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
954 goto out; 954 goto out;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1cc6ec51e6b1..47a32b6d9b90 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
65 return (struct sockaddr *)&nsm->sm_addr; 65 return (struct sockaddr *)&nsm->sm_addr;
66} 66}
67 67
68static struct rpc_clnt *nsm_create(struct net *net) 68static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
69{ 69{
70 struct sockaddr_in sin = { 70 struct sockaddr_in sin = {
71 .sin_family = AF_INET, 71 .sin_family = AF_INET,
@@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
77 .address = (struct sockaddr *)&sin, 77 .address = (struct sockaddr *)&sin,
78 .addrsize = sizeof(sin), 78 .addrsize = sizeof(sin),
79 .servername = "rpc.statd", 79 .servername = "rpc.statd",
80 .nodename = nodename,
80 .program = &nsm_program, 81 .program = &nsm_program,
81 .version = NSM_VERSION, 82 .version = NSM_VERSION,
82 .authflavor = RPC_AUTH_NULL, 83 .authflavor = RPC_AUTH_NULL,
@@ -102,7 +103,7 @@ out:
102 return clnt; 103 return clnt;
103} 104}
104 105
105static struct rpc_clnt *nsm_client_get(struct net *net) 106static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
106{ 107{
107 struct rpc_clnt *clnt, *new; 108 struct rpc_clnt *clnt, *new;
108 struct lockd_net *ln = net_generic(net, lockd_net_id); 109 struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net)
111 if (clnt != NULL) 112 if (clnt != NULL)
112 goto out; 113 goto out;
113 114
114 clnt = new = nsm_create(net); 115 clnt = new = nsm_create(net, nodename);
115 if (IS_ERR(clnt)) 116 if (IS_ERR(clnt))
116 goto out; 117 goto out;
117 118
@@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host)
190 struct nsm_res res; 191 struct nsm_res res;
191 int status; 192 int status;
192 struct rpc_clnt *clnt; 193 struct rpc_clnt *clnt;
194 const char *nodename = NULL;
193 195
194 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 196 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
195 197
196 if (nsm->sm_monitored) 198 if (nsm->sm_monitored)
197 return 0; 199 return 0;
198 200
201 if (host->h_rpcclnt)
202 nodename = host->h_rpcclnt->cl_nodename;
203
199 /* 204 /*
200 * Choose whether to record the caller_name or IP address of 205 * Choose whether to record the caller_name or IP address of
201 * this peer in the local rpc.statd's database. 206 * this peer in the local rpc.statd's database.
202 */ 207 */
203 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 208 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
204 209
205 clnt = nsm_client_get(host->net); 210 clnt = nsm_client_get(host->net, nodename);
206 if (IS_ERR(clnt)) { 211 if (IS_ERR(clnt)) {
207 status = PTR_ERR(clnt); 212 status = PTR_ERR(clnt);
208 dprintk("lockd: failed to create NSM upcall transport, " 213 dprintk("lockd: failed to create NSM upcall transport, "
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) 57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
58{ 58{
59 /* 59 /*
60 * We can get away with a static buffer because we're only 60 * We can get away with a static buffer because this is only called
61 * called with BKL held. 61 * from lockd, which is single-threaded.
62 */ 62 */
63 static char buf[2*NLM_MAXCOOKIELEN+1]; 63 static char buf[2*NLM_MAXCOOKIELEN+1];
64 unsigned int i, len = sizeof(buf); 64 unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d12ff4e2dbe7..665ef5a05183 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
164{ 164{
165 struct inode *inode = nlmsvc_file_inode(file); 165 struct inode *inode = nlmsvc_file_inode(file);
166 struct file_lock *fl; 166 struct file_lock *fl;
167 struct file_lock_context *flctx = inode->i_flctx;
167 struct nlm_host *lockhost; 168 struct nlm_host *lockhost;
168 169
170 if (!flctx || list_empty_careful(&flctx->flc_posix))
171 return 0;
169again: 172again:
170 file->f_locks = 0; 173 file->f_locks = 0;
171 spin_lock(&inode->i_lock); 174 spin_lock(&flctx->flc_lock);
172 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 175 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
173 if (fl->fl_lmops != &nlmsvc_lock_operations) 176 if (fl->fl_lmops != &nlmsvc_lock_operations)
174 continue; 177 continue;
175 178
@@ -180,7 +183,7 @@ again:
180 if (match(lockhost, host)) { 183 if (match(lockhost, host)) {
181 struct file_lock lock = *fl; 184 struct file_lock lock = *fl;
182 185
183 spin_unlock(&inode->i_lock); 186 spin_unlock(&flctx->flc_lock);
184 lock.fl_type = F_UNLCK; 187 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 188 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 189 lock.fl_end = OFFSET_MAX;
@@ -192,7 +195,7 @@ again:
192 goto again; 195 goto again;
193 } 196 }
194 } 197 }
195 spin_unlock(&inode->i_lock); 198 spin_unlock(&flctx->flc_lock);
196 199
197 return 0; 200 return 0;
198} 201}
@@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file)
223{ 226{
224 struct inode *inode = nlmsvc_file_inode(file); 227 struct inode *inode = nlmsvc_file_inode(file);
225 struct file_lock *fl; 228 struct file_lock *fl;
229 struct file_lock_context *flctx = inode->i_flctx;
226 230
227 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 231 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
228 return 1; 232 return 1;
229 233
230 spin_lock(&inode->i_lock); 234 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
231 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 235 spin_lock(&flctx->flc_lock);
232 if (fl->fl_lmops == &nlmsvc_lock_operations) { 236 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
233 spin_unlock(&inode->i_lock); 237 if (fl->fl_lmops == &nlmsvc_lock_operations) {
234 return 1; 238 spin_unlock(&flctx->flc_lock);
239 return 1;
240 }
235 } 241 }
242 spin_unlock(&flctx->flc_lock);
236 } 243 }
237 spin_unlock(&inode->i_lock);
238 file->f_locks = 0; 244 file->f_locks = 0;
239 return 0; 245 return 0;
240} 246}
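[Editor's note] Both hunks above follow the idiom this series introduces everywhere: check i_flctx and the relevant list locklessly before taking flc_lock. A condensed sketch of the reader side:

	struct file_lock_context *flctx = inode->i_flctx;
	struct file_lock *fl;

	if (!flctx || list_empty_careful(&flctx->flc_posix))
		return 0;			/* fast path: no POSIX locks */
	spin_lock(&flctx->flc_lock);
	list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
		/* examine fl under flc_lock */
	}
	spin_unlock(&flctx->flc_lock);

The unlocked check is safe because i_flctx is assigned at most once per inode (see locks_get_lock_context() in the fs/locks.c hunks below) and list_empty_careful() tolerates concurrent updates; losing a race only means taking the spinlock and finding the list empty, or missing locks that are being added concurrently, which these callers already tolerate.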
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(NFS2_FHSIZE); 95 return p + XDR_QUADLEN(NFS2_FHSIZE);
96} 96}
97 97
98static inline __be32 *
99nlm_encode_fh(__be32 *p, struct nfs_fh *f)
100{
101 *p++ = htonl(NFS2_FHSIZE);
102 memcpy(p, f->data, NFS2_FHSIZE);
103 return p + XDR_QUADLEN(NFS2_FHSIZE);
104}
105
106/* 98/*
107 * Encode and decode owner handle 99 * Encode and decode owner handle
108 */ 100 */
diff --git a/fs/locks.c b/fs/locks.c
index 59e2f905e4ff..365c82e1b3a9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
137 137
138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) 141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
142 142
143static bool lease_breaking(struct file_lock *fl) 143static bool lease_breaking(struct file_lock *fl)
@@ -157,14 +157,11 @@ static int target_leasetype(struct file_lock *fl)
157int leases_enable = 1; 157int leases_enable = 1;
158int lease_break_time = 45; 158int lease_break_time = 45;
159 159
160#define for_each_lock(inode, lockp) \
161 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
162
163/* 160/*
164 * The global file_lock_list is only used for displaying /proc/locks, so we 161 * The global file_lock_list is only used for displaying /proc/locks, so we
165 * keep a list on each CPU, with each list protected by its own spinlock via 162 * keep a list on each CPU, with each list protected by its own spinlock via
166 * the file_lock_lglock. Note that alterations to the list also require that 163 * the file_lock_lglock. Note that alterations to the list also require that
167 * the relevant i_lock is held. 164 * the relevant flc_lock is held.
168 */ 165 */
169DEFINE_STATIC_LGLOCK(file_lock_lglock); 166DEFINE_STATIC_LGLOCK(file_lock_lglock);
170static DEFINE_PER_CPU(struct hlist_head, file_lock_list); 167static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
@@ -192,21 +189,68 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
192 * contrast to those that are acting as records of acquired locks). 189 * contrast to those that are acting as records of acquired locks).
193 * 190 *
194 * Note that when we acquire this lock in order to change the above fields, 191 * Note that when we acquire this lock in order to change the above fields,
195 * we often hold the i_lock as well. In certain cases, when reading the fields 192 * we often hold the flc_lock as well. In certain cases, when reading the fields
196 * protected by this lock, we can skip acquiring it iff we already hold the 193 * protected by this lock, we can skip acquiring it iff we already hold the
197 * i_lock. 194 * flc_lock.
198 * 195 *
199 * In particular, adding an entry to the fl_block list requires that you hold 196 * In particular, adding an entry to the fl_block list requires that you hold
200 * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting 197 * both the flc_lock and the blocked_lock_lock (acquired in that order).
201 * an entry from the list however only requires the file_lock_lock. 198 * Deleting an entry from the list however only requires the file_lock_lock.
202 */ 199 */
203static DEFINE_SPINLOCK(blocked_lock_lock); 200static DEFINE_SPINLOCK(blocked_lock_lock);
204 201
202static struct kmem_cache *flctx_cache __read_mostly;
205static struct kmem_cache *filelock_cache __read_mostly; 203static struct kmem_cache *filelock_cache __read_mostly;
206 204
205static struct file_lock_context *
206locks_get_lock_context(struct inode *inode)
207{
208 struct file_lock_context *new;
209
210 if (likely(inode->i_flctx))
211 goto out;
212
213 new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
214 if (!new)
215 goto out;
216
217 spin_lock_init(&new->flc_lock);
218 INIT_LIST_HEAD(&new->flc_flock);
219 INIT_LIST_HEAD(&new->flc_posix);
220 INIT_LIST_HEAD(&new->flc_lease);
221
222 /*
223 * Assign the pointer if it's not already assigned. If it is, then
224 * free the context we just allocated.
225 */
226 spin_lock(&inode->i_lock);
227 if (likely(!inode->i_flctx)) {
228 inode->i_flctx = new;
229 new = NULL;
230 }
231 spin_unlock(&inode->i_lock);
232
233 if (new)
234 kmem_cache_free(flctx_cache, new);
235out:
236 return inode->i_flctx;
237}
238
239void
240locks_free_lock_context(struct file_lock_context *ctx)
241{
242 if (ctx) {
243 WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
244 WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
245 WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
246 kmem_cache_free(flctx_cache, ctx);
247 }
248}
249
207static void locks_init_lock_heads(struct file_lock *fl) 250static void locks_init_lock_heads(struct file_lock *fl)
208{ 251{
209 INIT_HLIST_NODE(&fl->fl_link); 252 INIT_HLIST_NODE(&fl->fl_link);
253 INIT_LIST_HEAD(&fl->fl_list);
210 INIT_LIST_HEAD(&fl->fl_block); 254 INIT_LIST_HEAD(&fl->fl_block);
211 init_waitqueue_head(&fl->fl_wait); 255 init_waitqueue_head(&fl->fl_wait);
212} 256}
@@ -243,6 +287,7 @@ EXPORT_SYMBOL_GPL(locks_release_private);
243void locks_free_lock(struct file_lock *fl) 287void locks_free_lock(struct file_lock *fl)
244{ 288{
245 BUG_ON(waitqueue_active(&fl->fl_wait)); 289 BUG_ON(waitqueue_active(&fl->fl_wait));
290 BUG_ON(!list_empty(&fl->fl_list));
246 BUG_ON(!list_empty(&fl->fl_block)); 291 BUG_ON(!list_empty(&fl->fl_block));
247 BUG_ON(!hlist_unhashed(&fl->fl_link)); 292 BUG_ON(!hlist_unhashed(&fl->fl_link));
248 293
@@ -257,8 +302,8 @@ locks_dispose_list(struct list_head *dispose)
257 struct file_lock *fl; 302 struct file_lock *fl;
258 303
259 while (!list_empty(dispose)) { 304 while (!list_empty(dispose)) {
260 fl = list_first_entry(dispose, struct file_lock, fl_block); 305 fl = list_first_entry(dispose, struct file_lock, fl_list);
261 list_del_init(&fl->fl_block); 306 list_del_init(&fl->fl_list);
262 locks_free_lock(fl); 307 locks_free_lock(fl);
263 } 308 }
264} 309}
@@ -513,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
513 return fl1->fl_owner == fl2->fl_owner; 558 return fl1->fl_owner == fl2->fl_owner;
514} 559}
515 560
516/* Must be called with the i_lock held! */ 561/* Must be called with the flc_lock held! */
517static void locks_insert_global_locks(struct file_lock *fl) 562static void locks_insert_global_locks(struct file_lock *fl)
518{ 563{
519 lg_local_lock(&file_lock_lglock); 564 lg_local_lock(&file_lock_lglock);
@@ -522,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl)
522 lg_local_unlock(&file_lock_lglock); 567 lg_local_unlock(&file_lock_lglock);
523} 568}
524 569
525/* Must be called with the i_lock held! */ 570/* Must be called with the flc_lock held! */
526static void locks_delete_global_locks(struct file_lock *fl) 571static void locks_delete_global_locks(struct file_lock *fl)
527{ 572{
528 /* 573 /*
529 * Avoid taking lock if already unhashed. This is safe since this check 574 * Avoid taking lock if already unhashed. This is safe since this check
530 * is done while holding the i_lock, and new insertions into the list 575 * is done while holding the flc_lock, and new insertions into the list
531 * also require that it be held. 576 * also require that it be held.
532 */ 577 */
533 if (hlist_unhashed(&fl->fl_link)) 578 if (hlist_unhashed(&fl->fl_link))
@@ -579,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter)
579 * the order they blocked. The documentation doesn't require this but 624 * the order they blocked. The documentation doesn't require this but
580 * it seems like the reasonable thing to do. 625 * it seems like the reasonable thing to do.
581 * 626 *
582 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 627 * Must be called with both the flc_lock and blocked_lock_lock held. The
583 * list itself is protected by the blocked_lock_lock, but by ensuring that the 628 * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
584 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 629 * that the flc_lock is also held on insertions we can avoid taking the
585 * in some cases when we see that the fl_block list is empty. 630 * blocked_lock_lock in some cases when we see that the fl_block list is empty.
586 */ 631 */
587static void __locks_insert_block(struct file_lock *blocker, 632static void __locks_insert_block(struct file_lock *blocker,
588 struct file_lock *waiter) 633 struct file_lock *waiter)
@@ -594,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker,
594 locks_insert_global_blocked(waiter); 639 locks_insert_global_blocked(waiter);
595} 640}
596 641
597/* Must be called with i_lock held. */ 642/* Must be called with flc_lock held. */
598static void locks_insert_block(struct file_lock *blocker, 643static void locks_insert_block(struct file_lock *blocker,
599 struct file_lock *waiter) 644 struct file_lock *waiter)
600{ 645{
@@ -606,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker,
606/* 651/*
607 * Wake up processes blocked waiting for blocker. 652 * Wake up processes blocked waiting for blocker.
608 * 653 *
609 * Must be called with the inode->i_lock held! 654 * Must be called with the inode->flc_lock held!
610 */ 655 */
611static void locks_wake_up_blocks(struct file_lock *blocker) 656static void locks_wake_up_blocks(struct file_lock *blocker)
612{ 657{
613 /* 658 /*
614 * Avoid taking global lock if list is empty. This is safe since new 659 * Avoid taking global lock if list is empty. This is safe since new
615 * blocked requests are only added to the list under the i_lock, and 660 * blocked requests are only added to the list under the flc_lock, and
616 * the i_lock is always held here. Note that removal from the fl_block 661 * the flc_lock is always held here. Note that removal from the fl_block
617 * list does not require the i_lock, so we must recheck list_empty() 662 * list does not require the flc_lock, so we must recheck list_empty()
618 * after acquiring the blocked_lock_lock. 663 * after acquiring the blocked_lock_lock.
619 */ 664 */
620 if (list_empty(&blocker->fl_block)) 665 if (list_empty(&blocker->fl_block))
@@ -635,63 +680,32 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
635 spin_unlock(&blocked_lock_lock); 680 spin_unlock(&blocked_lock_lock);
636} 681}
637 682
638/* Insert file lock fl into an inode's lock list at the position indicated 683static void
639 * by pos. At the same time add the lock to the global file lock list. 684locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
640 *
641 * Must be called with the i_lock held!
642 */
643static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
644{ 685{
645 fl->fl_nspid = get_pid(task_tgid(current)); 686 fl->fl_nspid = get_pid(task_tgid(current));
646 687 list_add_tail(&fl->fl_list, before);
647 /* insert into file's list */
648 fl->fl_next = *pos;
649 *pos = fl;
650
651 locks_insert_global_locks(fl); 688 locks_insert_global_locks(fl);
652} 689}
653 690
654/** 691static void
655 * locks_delete_lock - Delete a lock and then free it. 692locks_unlink_lock_ctx(struct file_lock *fl)
656 * @thisfl_p: pointer that points to the fl_next field of the previous
657 * inode->i_flock list entry
658 *
659 * Unlink a lock from all lists and free the namespace reference, but don't
660 * free it yet. Wake up processes that are blocked waiting for this lock and
661 * notify the FS that the lock has been cleared.
662 *
663 * Must be called with the i_lock held!
664 */
665static void locks_unlink_lock(struct file_lock **thisfl_p)
666{ 693{
667 struct file_lock *fl = *thisfl_p;
668
669 locks_delete_global_locks(fl); 694 locks_delete_global_locks(fl);
670 695 list_del_init(&fl->fl_list);
671 *thisfl_p = fl->fl_next;
672 fl->fl_next = NULL;
673
674 if (fl->fl_nspid) { 696 if (fl->fl_nspid) {
675 put_pid(fl->fl_nspid); 697 put_pid(fl->fl_nspid);
676 fl->fl_nspid = NULL; 698 fl->fl_nspid = NULL;
677 } 699 }
678
679 locks_wake_up_blocks(fl); 700 locks_wake_up_blocks(fl);
680} 701}
681 702
682/* 703static void
683 * Unlink a lock from all lists and free it. 704locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
684 *
685 * Must be called with i_lock held!
686 */
687static void locks_delete_lock(struct file_lock **thisfl_p,
688 struct list_head *dispose)
689{ 705{
690 struct file_lock *fl = *thisfl_p; 706 locks_unlink_lock_ctx(fl);
691
692 locks_unlink_lock(thisfl_p);
693 if (dispose) 707 if (dispose)
694 list_add(&fl->fl_block, dispose); 708 list_add(&fl->fl_list, dispose);
695 else 709 else
696 locks_free_lock(fl); 710 locks_free_lock(fl);
697} 711}
@@ -746,22 +760,27 @@ void
746posix_test_lock(struct file *filp, struct file_lock *fl) 760posix_test_lock(struct file *filp, struct file_lock *fl)
747{ 761{
748 struct file_lock *cfl; 762 struct file_lock *cfl;
763 struct file_lock_context *ctx;
749 struct inode *inode = file_inode(filp); 764 struct inode *inode = file_inode(filp);
750 765
751 spin_lock(&inode->i_lock); 766 ctx = inode->i_flctx;
752 for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { 767 if (!ctx || list_empty_careful(&ctx->flc_posix)) {
753 if (!IS_POSIX(cfl))
754 continue;
755 if (posix_locks_conflict(fl, cfl))
756 break;
757 }
758 if (cfl) {
759 locks_copy_conflock(fl, cfl);
760 if (cfl->fl_nspid)
761 fl->fl_pid = pid_vnr(cfl->fl_nspid);
762 } else
763 fl->fl_type = F_UNLCK; 768 fl->fl_type = F_UNLCK;
764 spin_unlock(&inode->i_lock); 769 return;
770 }
771
772 spin_lock(&ctx->flc_lock);
773 list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
774 if (posix_locks_conflict(fl, cfl)) {
775 locks_copy_conflock(fl, cfl);
776 if (cfl->fl_nspid)
777 fl->fl_pid = pid_vnr(cfl->fl_nspid);
778 goto out;
779 }
780 }
781 fl->fl_type = F_UNLCK;
782out:
783 spin_unlock(&ctx->flc_lock);
765 return; 784 return;
766} 785}
767EXPORT_SYMBOL(posix_test_lock); 786EXPORT_SYMBOL(posix_test_lock);
@@ -845,34 +864,34 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
845static int flock_lock_file(struct file *filp, struct file_lock *request) 864static int flock_lock_file(struct file *filp, struct file_lock *request)
846{ 865{
847 struct file_lock *new_fl = NULL; 866 struct file_lock *new_fl = NULL;
848 struct file_lock **before; 867 struct file_lock *fl;
849 struct inode * inode = file_inode(filp); 868 struct file_lock_context *ctx;
869 struct inode *inode = file_inode(filp);
850 int error = 0; 870 int error = 0;
851 int found = 0; 871 bool found = false;
852 LIST_HEAD(dispose); 872 LIST_HEAD(dispose);
853 873
874 ctx = locks_get_lock_context(inode);
875 if (!ctx)
876 return -ENOMEM;
877
854 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { 878 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
855 new_fl = locks_alloc_lock(); 879 new_fl = locks_alloc_lock();
856 if (!new_fl) 880 if (!new_fl)
857 return -ENOMEM; 881 return -ENOMEM;
858 } 882 }
859 883
860 spin_lock(&inode->i_lock); 884 spin_lock(&ctx->flc_lock);
861 if (request->fl_flags & FL_ACCESS) 885 if (request->fl_flags & FL_ACCESS)
862 goto find_conflict; 886 goto find_conflict;
863 887
864 for_each_lock(inode, before) { 888 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
865 struct file_lock *fl = *before;
866 if (IS_POSIX(fl))
867 break;
868 if (IS_LEASE(fl))
869 continue;
870 if (filp != fl->fl_file) 889 if (filp != fl->fl_file)
871 continue; 890 continue;
872 if (request->fl_type == fl->fl_type) 891 if (request->fl_type == fl->fl_type)
873 goto out; 892 goto out;
874 found = 1; 893 found = true;
875 locks_delete_lock(before, &dispose); 894 locks_delete_lock_ctx(fl, &dispose);
876 break; 895 break;
877 } 896 }
878 897
@@ -882,23 +901,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
882 goto out; 901 goto out;
883 } 902 }
884 903
885 /*
886 * If a higher-priority process was blocked on the old file lock,
887 * give it the opportunity to lock the file.
888 */
889 if (found) {
890 spin_unlock(&inode->i_lock);
891 cond_resched();
892 spin_lock(&inode->i_lock);
893 }
894
895find_conflict: 904find_conflict:
896 for_each_lock(inode, before) { 905 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
897 struct file_lock *fl = *before;
898 if (IS_POSIX(fl))
899 break;
900 if (IS_LEASE(fl))
901 continue;
902 if (!flock_locks_conflict(request, fl)) 906 if (!flock_locks_conflict(request, fl))
903 continue; 907 continue;
904 error = -EAGAIN; 908 error = -EAGAIN;
@@ -911,12 +915,12 @@ find_conflict:
911 if (request->fl_flags & FL_ACCESS) 915 if (request->fl_flags & FL_ACCESS)
912 goto out; 916 goto out;
913 locks_copy_lock(new_fl, request); 917 locks_copy_lock(new_fl, request);
914 locks_insert_lock(before, new_fl); 918 locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
915 new_fl = NULL; 919 new_fl = NULL;
916 error = 0; 920 error = 0;
917 921
918out: 922out:
919 spin_unlock(&inode->i_lock); 923 spin_unlock(&ctx->flc_lock);
920 if (new_fl) 924 if (new_fl)
921 locks_free_lock(new_fl); 925 locks_free_lock(new_fl);
922 locks_dispose_list(&dispose); 926 locks_dispose_list(&dispose);
@@ -925,16 +929,20 @@ out:
925 929
926static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) 930static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
927{ 931{
928 struct file_lock *fl; 932 struct file_lock *fl, *tmp;
929 struct file_lock *new_fl = NULL; 933 struct file_lock *new_fl = NULL;
930 struct file_lock *new_fl2 = NULL; 934 struct file_lock *new_fl2 = NULL;
931 struct file_lock *left = NULL; 935 struct file_lock *left = NULL;
932 struct file_lock *right = NULL; 936 struct file_lock *right = NULL;
933 struct file_lock **before; 937 struct file_lock_context *ctx;
934 int error; 938 int error;
935 bool added = false; 939 bool added = false;
936 LIST_HEAD(dispose); 940 LIST_HEAD(dispose);
937 941
942 ctx = locks_get_lock_context(inode);
943 if (!ctx)
944 return -ENOMEM;
945
938 /* 946 /*
939 * We may need two file_lock structures for this operation, 947 * We may need two file_lock structures for this operation,
940 * so we get them in advance to avoid races. 948 * so we get them in advance to avoid races.
@@ -948,15 +956,14 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
948 new_fl2 = locks_alloc_lock(); 956 new_fl2 = locks_alloc_lock();
949 } 957 }
950 958
951 spin_lock(&inode->i_lock); 959 spin_lock(&ctx->flc_lock);
952 /* 960 /*
953 * New lock request. Walk all POSIX locks and look for conflicts. If 961 * New lock request. Walk all POSIX locks and look for conflicts. If
954 * there are any, either return error or put the request on the 962 * there are any, either return error or put the request on the
955 * blocker's list of waiters and the global blocked_hash. 963 * blocker's list of waiters and the global blocked_hash.
956 */ 964 */
957 if (request->fl_type != F_UNLCK) { 965 if (request->fl_type != F_UNLCK) {
958 for_each_lock(inode, before) { 966 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
959 fl = *before;
960 if (!IS_POSIX(fl)) 967 if (!IS_POSIX(fl))
961 continue; 968 continue;
962 if (!posix_locks_conflict(request, fl)) 969 if (!posix_locks_conflict(request, fl))
@@ -986,29 +993,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
986 if (request->fl_flags & FL_ACCESS) 993 if (request->fl_flags & FL_ACCESS)
987 goto out; 994 goto out;
988 995
989 /* 996 /* Find the first old lock with the same owner as the new lock */
990 * Find the first old lock with the same owner as the new lock. 997 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
991 */ 998 if (posix_same_owner(request, fl))
992 999 break;
993 before = &inode->i_flock;
994
995 /* First skip locks owned by other processes. */
996 while ((fl = *before) && (!IS_POSIX(fl) ||
997 !posix_same_owner(request, fl))) {
998 before = &fl->fl_next;
999 } 1000 }
1000 1001
1001 /* Process locks with this owner. */ 1002 /* Process locks with this owner. */
1002 while ((fl = *before) && posix_same_owner(request, fl)) { 1003 list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
1003 /* Detect adjacent or overlapping regions (if same lock type) 1004 if (!posix_same_owner(request, fl))
1004 */ 1005 break;
1006
1007 /* Detect adjacent or overlapping regions (if same lock type) */
1005 if (request->fl_type == fl->fl_type) { 1008 if (request->fl_type == fl->fl_type) {
1006 /* In all comparisons of start vs end, use 1009 /* In all comparisons of start vs end, use
1007 * "start - 1" rather than "end + 1". If end 1010 * "start - 1" rather than "end + 1". If end
1008 * is OFFSET_MAX, end + 1 will become negative. 1011 * is OFFSET_MAX, end + 1 will become negative.
1009 */ 1012 */
1010 if (fl->fl_end < request->fl_start - 1) 1013 if (fl->fl_end < request->fl_start - 1)
1011 goto next_lock; 1014 continue;
1012 /* If the next lock in the list has entirely bigger 1015 /* If the next lock in the list has entirely bigger
1013 * addresses than the new one, insert the lock here. 1016 * addresses than the new one, insert the lock here.
1014 */ 1017 */
@@ -1029,18 +1032,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1029 else 1032 else
1030 request->fl_end = fl->fl_end; 1033 request->fl_end = fl->fl_end;
1031 if (added) { 1034 if (added) {
1032 locks_delete_lock(before, &dispose); 1035 locks_delete_lock_ctx(fl, &dispose);
1033 continue; 1036 continue;
1034 } 1037 }
1035 request = fl; 1038 request = fl;
1036 added = true; 1039 added = true;
1037 } 1040 } else {
1038 else {
1039 /* Processing for different lock types is a bit 1041 /* Processing for different lock types is a bit
1040 * more complex. 1042 * more complex.
1041 */ 1043 */
1042 if (fl->fl_end < request->fl_start) 1044 if (fl->fl_end < request->fl_start)
1043 goto next_lock; 1045 continue;
1044 if (fl->fl_start > request->fl_end) 1046 if (fl->fl_start > request->fl_end)
1045 break; 1047 break;
1046 if (request->fl_type == F_UNLCK) 1048 if (request->fl_type == F_UNLCK)
@@ -1059,7 +1061,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1059 * one (This may happen several times). 1061 * one (This may happen several times).
1060 */ 1062 */
1061 if (added) { 1063 if (added) {
1062 locks_delete_lock(before, &dispose); 1064 locks_delete_lock_ctx(fl, &dispose);
1063 continue; 1065 continue;
1064 } 1066 }
1065 /* 1067 /*
@@ -1075,15 +1077,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1075 locks_copy_lock(new_fl, request); 1077 locks_copy_lock(new_fl, request);
1076 request = new_fl; 1078 request = new_fl;
1077 new_fl = NULL; 1079 new_fl = NULL;
1078 locks_delete_lock(before, &dispose); 1080 locks_insert_lock_ctx(request, &fl->fl_list);
1079 locks_insert_lock(before, request); 1081 locks_delete_lock_ctx(fl, &dispose);
1080 added = true; 1082 added = true;
1081 } 1083 }
1082 } 1084 }
1083 /* Go on to next lock.
1084 */
1085 next_lock:
1086 before = &fl->fl_next;
1087 } 1085 }
1088 1086
1089 /* 1087 /*
@@ -1108,7 +1106,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1108 goto out; 1106 goto out;
1109 } 1107 }
1110 locks_copy_lock(new_fl, request); 1108 locks_copy_lock(new_fl, request);
1111 locks_insert_lock(before, new_fl); 1109 locks_insert_lock_ctx(new_fl, &fl->fl_list);
1110 fl = new_fl;
1112 new_fl = NULL; 1111 new_fl = NULL;
1113 } 1112 }
1114 if (right) { 1113 if (right) {
@@ -1119,7 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1119 left = new_fl2; 1118 left = new_fl2;
1120 new_fl2 = NULL; 1119 new_fl2 = NULL;
1121 locks_copy_lock(left, right); 1120 locks_copy_lock(left, right);
1122 locks_insert_lock(before, left); 1121 locks_insert_lock_ctx(left, &fl->fl_list);
1123 } 1122 }
1124 right->fl_start = request->fl_end + 1; 1123 right->fl_start = request->fl_end + 1;
1125 locks_wake_up_blocks(right); 1124 locks_wake_up_blocks(right);
@@ -1129,7 +1128,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1129 locks_wake_up_blocks(left); 1128 locks_wake_up_blocks(left);
1130 } 1129 }
1131 out: 1130 out:
1132 spin_unlock(&inode->i_lock); 1131 spin_unlock(&ctx->flc_lock);
1133 /* 1132 /*
1134 * Free any unused locks. 1133 * Free any unused locks.
1135 */ 1134 */
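
The hunks above swap the old fl_next pointer walk for list_for_each_entry_safe_from() over ctx->flc_posix, but the merging logic itself is unchanged: adjacent or overlapping same-type locks from one owner still coalesce into a single record. A minimal userspace sketch of that observable behaviour (assumptions: Linux, a local filesystem, and /tmp/lockdemo as a scratch path):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

static void set_wrlock(int fd, off_t start, off_t len)
{
    struct flock fl = {
        .l_type   = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start  = start,
        .l_len    = len,
    };

    if (fcntl(fd, F_SETLK, &fl) == -1) {
        perror("F_SETLK");
        exit(1);
    }
}

int main(void)
{
    int fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644);

    if (fd == -1) {
        perror("open");
        return 1;
    }
    set_wrlock(fd, 0, 100);     /* bytes 0-99 */
    set_wrlock(fd, 100, 100);   /* bytes 100-199, adjacent to the first */
    /* /proc/locks should now show one POSIX WRITE lock covering 0-199,
     * the result of the coalescing done in __posix_lock_file() */
    system("grep POSIX /proc/locks");
    return 0;
}
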
@@ -1199,22 +1198,29 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1199 */ 1198 */
1200int locks_mandatory_locked(struct file *file) 1199int locks_mandatory_locked(struct file *file)
1201{ 1200{
1201 int ret;
1202 struct inode *inode = file_inode(file); 1202 struct inode *inode = file_inode(file);
1203 struct file_lock_context *ctx;
1203 struct file_lock *fl; 1204 struct file_lock *fl;
1204 1205
1206 ctx = inode->i_flctx;
1207 if (!ctx || list_empty_careful(&ctx->flc_posix))
1208 return 0;
1209
1205 /* 1210 /*
1206 * Search the lock list for this inode for any POSIX locks. 1211 * Search the lock list for this inode for any POSIX locks.
1207 */ 1212 */
1208 spin_lock(&inode->i_lock); 1213 spin_lock(&ctx->flc_lock);
1209 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1214 ret = 0;
1210 if (!IS_POSIX(fl)) 1215 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
1211 continue;
1212 if (fl->fl_owner != current->files && 1216 if (fl->fl_owner != current->files &&
1213 fl->fl_owner != file) 1217 fl->fl_owner != file) {
1218 ret = -EAGAIN;
1214 break; 1219 break;
1220 }
1215 } 1221 }
1216 spin_unlock(&inode->i_lock); 1222 spin_unlock(&ctx->flc_lock);
1217 return fl ? -EAGAIN : 0; 1223 return ret;
1218} 1224}
1219 1225
1220/** 1226/**
@@ -1294,9 +1300,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
1294} 1300}
1295 1301
1296/* We already had a lease on this file; just change its type */ 1302/* We already had a lease on this file; just change its type */
1297int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) 1303int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
1298{ 1304{
1299 struct file_lock *fl = *before;
1300 int error = assign_type(fl, arg); 1305 int error = assign_type(fl, arg);
1301 1306
1302 if (error) 1307 if (error)
@@ -1313,7 +1318,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
1313 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); 1318 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
1314 fl->fl_fasync = NULL; 1319 fl->fl_fasync = NULL;
1315 } 1320 }
1316 locks_delete_lock(before, dispose); 1321 locks_delete_lock_ctx(fl, dispose);
1317 } 1322 }
1318 return 0; 1323 return 0;
1319} 1324}
@@ -1329,25 +1334,24 @@ static bool past_time(unsigned long then)
1329 1334
1330static void time_out_leases(struct inode *inode, struct list_head *dispose) 1335static void time_out_leases(struct inode *inode, struct list_head *dispose)
1331{ 1336{
1332 struct file_lock **before; 1337 struct file_lock_context *ctx = inode->i_flctx;
1333 struct file_lock *fl; 1338 struct file_lock *fl, *tmp;
1334 1339
1335 lockdep_assert_held(&inode->i_lock); 1340 lockdep_assert_held(&ctx->flc_lock);
1336 1341
1337 before = &inode->i_flock; 1342 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
1338 while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
1339 trace_time_out_leases(inode, fl); 1343 trace_time_out_leases(inode, fl);
1340 if (past_time(fl->fl_downgrade_time)) 1344 if (past_time(fl->fl_downgrade_time))
1341 lease_modify(before, F_RDLCK, dispose); 1345 lease_modify(fl, F_RDLCK, dispose);
1342 if (past_time(fl->fl_break_time)) 1346 if (past_time(fl->fl_break_time))
1343 lease_modify(before, F_UNLCK, dispose); 1347 lease_modify(fl, F_UNLCK, dispose);
1344 if (fl == *before) /* lease_modify may have freed fl */
1345 before = &fl->fl_next;
1346 } 1348 }
1347} 1349}
1348 1350
1349static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) 1351static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1350{ 1352{
1353 if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
1354 return false;
1351 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) 1355 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
1352 return false; 1356 return false;
1353 return locks_conflict(breaker, lease); 1357 return locks_conflict(breaker, lease);
@@ -1356,11 +1360,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1356static bool 1360static bool
1357any_leases_conflict(struct inode *inode, struct file_lock *breaker) 1361any_leases_conflict(struct inode *inode, struct file_lock *breaker)
1358{ 1362{
1363 struct file_lock_context *ctx = inode->i_flctx;
1359 struct file_lock *fl; 1364 struct file_lock *fl;
1360 1365
1361 lockdep_assert_held(&inode->i_lock); 1366 lockdep_assert_held(&ctx->flc_lock);
1362 1367
1363 for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { 1368 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1364 if (leases_conflict(fl, breaker)) 1369 if (leases_conflict(fl, breaker))
1365 return true; 1370 return true;
1366 } 1371 }
@@ -1384,7 +1389,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1384{ 1389{
1385 int error = 0; 1390 int error = 0;
1386 struct file_lock *new_fl; 1391 struct file_lock *new_fl;
1387 struct file_lock *fl, **before; 1392 struct file_lock_context *ctx = inode->i_flctx;
1393 struct file_lock *fl;
1388 unsigned long break_time; 1394 unsigned long break_time;
1389 int want_write = (mode & O_ACCMODE) != O_RDONLY; 1395 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1390 LIST_HEAD(dispose); 1396 LIST_HEAD(dispose);
@@ -1394,7 +1400,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1394 return PTR_ERR(new_fl); 1400 return PTR_ERR(new_fl);
1395 new_fl->fl_flags = type; 1401 new_fl->fl_flags = type;
1396 1402
1397 spin_lock(&inode->i_lock); 1403 /* typically we will check that ctx is non-NULL before calling */
1404 if (!ctx) {
1405 WARN_ON_ONCE(1);
1406 return error;
1407 }
1408
1409 spin_lock(&ctx->flc_lock);
1398 1410
1399 time_out_leases(inode, &dispose); 1411 time_out_leases(inode, &dispose);
1400 1412
@@ -1408,9 +1420,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1408 break_time++; /* so that 0 means no break time */ 1420 break_time++; /* so that 0 means no break time */
1409 } 1421 }
1410 1422
1411 for (before = &inode->i_flock; 1423 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1412 ((fl = *before) != NULL) && IS_LEASE(fl);
1413 before = &fl->fl_next) {
1414 if (!leases_conflict(fl, new_fl)) 1424 if (!leases_conflict(fl, new_fl))
1415 continue; 1425 continue;
1416 if (want_write) { 1426 if (want_write) {
@@ -1419,17 +1429,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1419 fl->fl_flags |= FL_UNLOCK_PENDING; 1429 fl->fl_flags |= FL_UNLOCK_PENDING;
1420 fl->fl_break_time = break_time; 1430 fl->fl_break_time = break_time;
1421 } else { 1431 } else {
1422 if (lease_breaking(inode->i_flock)) 1432 if (lease_breaking(fl))
1423 continue; 1433 continue;
1424 fl->fl_flags |= FL_DOWNGRADE_PENDING; 1434 fl->fl_flags |= FL_DOWNGRADE_PENDING;
1425 fl->fl_downgrade_time = break_time; 1435 fl->fl_downgrade_time = break_time;
1426 } 1436 }
1427 if (fl->fl_lmops->lm_break(fl)) 1437 if (fl->fl_lmops->lm_break(fl))
1428 locks_delete_lock(before, &dispose); 1438 locks_delete_lock_ctx(fl, &dispose);
1429 } 1439 }
1430 1440
1431 fl = inode->i_flock; 1441 if (list_empty(&ctx->flc_lease))
1432 if (!fl || !IS_LEASE(fl))
1433 goto out; 1442 goto out;
1434 1443
1435 if (mode & O_NONBLOCK) { 1444 if (mode & O_NONBLOCK) {
@@ -1439,18 +1448,19 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1439 } 1448 }
1440 1449
1441restart: 1450restart:
1442 break_time = inode->i_flock->fl_break_time; 1451 fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
1452 break_time = fl->fl_break_time;
1443 if (break_time != 0) 1453 if (break_time != 0)
1444 break_time -= jiffies; 1454 break_time -= jiffies;
1445 if (break_time == 0) 1455 if (break_time == 0)
1446 break_time++; 1456 break_time++;
1447 locks_insert_block(inode->i_flock, new_fl); 1457 locks_insert_block(fl, new_fl);
1448 trace_break_lease_block(inode, new_fl); 1458 trace_break_lease_block(inode, new_fl);
1449 spin_unlock(&inode->i_lock); 1459 spin_unlock(&ctx->flc_lock);
1450 locks_dispose_list(&dispose); 1460 locks_dispose_list(&dispose);
1451 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1461 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1452 !new_fl->fl_next, break_time); 1462 !new_fl->fl_next, break_time);
1453 spin_lock(&inode->i_lock); 1463 spin_lock(&ctx->flc_lock);
1454 trace_break_lease_unblock(inode, new_fl); 1464 trace_break_lease_unblock(inode, new_fl);
1455 locks_delete_block(new_fl); 1465 locks_delete_block(new_fl);
1456 if (error >= 0) { 1466 if (error >= 0) {
@@ -1462,12 +1472,10 @@ restart:
1462 time_out_leases(inode, &dispose); 1472 time_out_leases(inode, &dispose);
1463 if (any_leases_conflict(inode, new_fl)) 1473 if (any_leases_conflict(inode, new_fl))
1464 goto restart; 1474 goto restart;
1465
1466 error = 0; 1475 error = 0;
1467 } 1476 }
1468
1469out: 1477out:
1470 spin_unlock(&inode->i_lock); 1478 spin_unlock(&ctx->flc_lock);
1471 locks_dispose_list(&dispose); 1479 locks_dispose_list(&dispose);
1472 locks_free_lock(new_fl); 1480 locks_free_lock(new_fl);
1473 return error; 1481 return error;
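
__break_lease() now walks ctx->flc_lease under flc_lock, but the userspace contract is untouched: a conflicting open signals the lease holder, which must release or downgrade before the break timer (fs.lease-break-time seconds) fires, at which point time_out_leases() forces the issue. A hedged sketch of the holder's side (assumptions: Linux/glibc, and /tmp/leasedemo owned by the caller):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t lease_broken;

static void on_sigio(int sig)
{
    (void)sig;
    lease_broken = 1;
}

int main(void)
{
    int fd = open("/tmp/leasedemo", O_RDWR | O_CREAT, 0644);

    if (fd == -1) {
        perror("open");
        return 1;
    }
    signal(SIGIO, on_sigio);
    if (fcntl(fd, F_SETLEASE, F_WRLCK) == -1) { /* take a write lease */
        perror("F_SETLEASE");
        return 1;
    }
    printf("write lease held; open the file from another process\n");
    while (!lease_broken)
        pause();
    /* release before fs.lease-break-time expires, or the kernel
     * times the lease out for us via time_out_leases() */
    fcntl(fd, F_SETLEASE, F_UNLCK);
    printf("lease released\n");
    return 0;
}
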
@@ -1487,14 +1495,18 @@ EXPORT_SYMBOL(__break_lease);
1487void lease_get_mtime(struct inode *inode, struct timespec *time) 1495void lease_get_mtime(struct inode *inode, struct timespec *time)
1488{ 1496{
1489 bool has_lease = false; 1497 bool has_lease = false;
1490 struct file_lock *flock; 1498 struct file_lock_context *ctx = inode->i_flctx;
1499 struct file_lock *fl;
1491 1500
1492 if (inode->i_flock) { 1501 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1493 spin_lock(&inode->i_lock); 1502 spin_lock(&ctx->flc_lock);
1494 flock = inode->i_flock; 1503 if (!list_empty(&ctx->flc_lease)) {
1495 if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) 1504 fl = list_first_entry(&ctx->flc_lease,
1496 has_lease = true; 1505 struct file_lock, fl_list);
1497 spin_unlock(&inode->i_lock); 1506 if (fl->fl_type == F_WRLCK)
1507 has_lease = true;
1508 }
1509 spin_unlock(&ctx->flc_lock);
1498 } 1510 }
1499 1511
1500 if (has_lease) 1512 if (has_lease)
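
lease_get_mtime() and the reworked fcntl_getlease() below both lean on the same two-step idiom: list_empty_careful() peeks at the list with no lock held, and only an apparently non-empty list is re-checked under flc_lock before anything is dereferenced. A generic pthread analogue of the idiom (assumption: lease_type is only ever written with the mutex held; the atomic flag stands in for the careful list peek):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool has_lease;   /* stands in for !list_empty(&ctx->flc_lease) */
static int lease_type;          /* protected by lock */

int get_lease_type(void)
{
    int type = -1;                      /* "no lease held" */

    if (!atomic_load(&has_lease))       /* unlocked fast path */
        return type;
    pthread_mutex_lock(&lock);
    if (atomic_load(&has_lease))        /* re-check under the lock */
        type = lease_type;
    pthread_mutex_unlock(&lock);
    return type;
}

The fast path may race with a concurrent setter, exactly as in the kernel version; the worst case is a harmless extra trip through the locked section or a clean early return.
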
@@ -1532,20 +1544,22 @@ int fcntl_getlease(struct file *filp)
1532{ 1544{
1533 struct file_lock *fl; 1545 struct file_lock *fl;
1534 struct inode *inode = file_inode(filp); 1546 struct inode *inode = file_inode(filp);
1547 struct file_lock_context *ctx = inode->i_flctx;
1535 int type = F_UNLCK; 1548 int type = F_UNLCK;
1536 LIST_HEAD(dispose); 1549 LIST_HEAD(dispose);
1537 1550
1538 spin_lock(&inode->i_lock); 1551 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1539 time_out_leases(file_inode(filp), &dispose); 1552 spin_lock(&ctx->flc_lock);
1540 for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); 1553 time_out_leases(file_inode(filp), &dispose);
1541 fl = fl->fl_next) { 1554 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1542 if (fl->fl_file == filp) { 1555 if (fl->fl_file != filp)
1556 continue;
1543 type = target_leasetype(fl); 1557 type = target_leasetype(fl);
1544 break; 1558 break;
1545 } 1559 }
1560 spin_unlock(&ctx->flc_lock);
1561 locks_dispose_list(&dispose);
1546 } 1562 }
1547 spin_unlock(&inode->i_lock);
1548 locks_dispose_list(&dispose);
1549 return type; 1563 return type;
1550} 1564}
1551 1565
@@ -1560,11 +1574,14 @@ int fcntl_getlease(struct file *filp)
1560 * conflict with the lease we're trying to set. 1574 * conflict with the lease we're trying to set.
1561 */ 1575 */
1562static int 1576static int
1563check_conflicting_open(const struct dentry *dentry, const long arg) 1577check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
1564{ 1578{
1565 int ret = 0; 1579 int ret = 0;
1566 struct inode *inode = dentry->d_inode; 1580 struct inode *inode = dentry->d_inode;
1567 1581
1582 if (flags & FL_LAYOUT)
1583 return 0;
1584
1568 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1585 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1569 return -EAGAIN; 1586 return -EAGAIN;
1570 1587
@@ -1578,9 +1595,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
1578static int 1595static int
1579generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) 1596generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1580{ 1597{
1581 struct file_lock *fl, **before, **my_before = NULL, *lease; 1598 struct file_lock *fl, *my_fl = NULL, *lease;
1582 struct dentry *dentry = filp->f_path.dentry; 1599 struct dentry *dentry = filp->f_path.dentry;
1583 struct inode *inode = dentry->d_inode; 1600 struct inode *inode = dentry->d_inode;
1601 struct file_lock_context *ctx;
1584 bool is_deleg = (*flp)->fl_flags & FL_DELEG; 1602 bool is_deleg = (*flp)->fl_flags & FL_DELEG;
1585 int error; 1603 int error;
1586 LIST_HEAD(dispose); 1604 LIST_HEAD(dispose);
@@ -1588,6 +1606,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1588 lease = *flp; 1606 lease = *flp;
1589 trace_generic_add_lease(inode, lease); 1607 trace_generic_add_lease(inode, lease);
1590 1608
1609 ctx = locks_get_lock_context(inode);
1610 if (!ctx)
1611 return -ENOMEM;
1612
1591 /* 1613 /*
1592 * In the delegation case we need mutual exclusion with 1614 * In the delegation case we need mutual exclusion with
1593 * a number of operations that take the i_mutex. We trylock 1615 * a number of operations that take the i_mutex. We trylock
@@ -1606,9 +1628,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1606 return -EINVAL; 1628 return -EINVAL;
1607 } 1629 }
1608 1630
1609 spin_lock(&inode->i_lock); 1631 spin_lock(&ctx->flc_lock);
1610 time_out_leases(inode, &dispose); 1632 time_out_leases(inode, &dispose);
1611 error = check_conflicting_open(dentry, arg); 1633 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1612 if (error) 1634 if (error)
1613 goto out; 1635 goto out;
1614 1636
@@ -1621,13 +1643,13 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1621 * except for this filp. 1643 * except for this filp.
1622 */ 1644 */
1623 error = -EAGAIN; 1645 error = -EAGAIN;
1624 for (before = &inode->i_flock; 1646 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1625 ((fl = *before) != NULL) && IS_LEASE(fl); 1647 if (fl->fl_file == filp &&
1626 before = &fl->fl_next) { 1648 fl->fl_owner == lease->fl_owner) {
1627 if (fl->fl_file == filp) { 1649 my_fl = fl;
1628 my_before = before;
1629 continue; 1650 continue;
1630 } 1651 }
1652
1631 /* 1653 /*
1632 * No exclusive leases if someone else has a lease on 1654 * No exclusive leases if someone else has a lease on
1633 * this file: 1655 * this file:
@@ -1642,9 +1664,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1642 goto out; 1664 goto out;
1643 } 1665 }
1644 1666
1645 if (my_before != NULL) { 1667 if (my_fl != NULL) {
1646 lease = *my_before; 1668 error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
1647 error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
1648 if (error) 1669 if (error)
1649 goto out; 1670 goto out;
1650 goto out_setup; 1671 goto out_setup;
@@ -1654,7 +1675,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1654 if (!leases_enable) 1675 if (!leases_enable)
1655 goto out; 1676 goto out;
1656 1677
1657 locks_insert_lock(before, lease); 1678 locks_insert_lock_ctx(lease, &ctx->flc_lease);
1658 /* 1679 /*
1659 * The check in break_lease() is lockless. It's possible for another 1680 * The check in break_lease() is lockless. It's possible for another
1660 * open to race in after we did the earlier check for a conflicting 1681 * open to race in after we did the earlier check for a conflicting
@@ -1665,46 +1686,51 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1665 * precedes these checks. 1686 * precedes these checks.
1666 */ 1687 */
1667 smp_mb(); 1688 smp_mb();
1668 error = check_conflicting_open(dentry, arg); 1689 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1669 if (error) 1690 if (error) {
1670 goto out_unlink; 1691 locks_unlink_lock_ctx(lease);
1692 goto out;
1693 }
1671 1694
1672out_setup: 1695out_setup:
1673 if (lease->fl_lmops->lm_setup) 1696 if (lease->fl_lmops->lm_setup)
1674 lease->fl_lmops->lm_setup(lease, priv); 1697 lease->fl_lmops->lm_setup(lease, priv);
1675out: 1698out:
1676 spin_unlock(&inode->i_lock); 1699 spin_unlock(&ctx->flc_lock);
1677 locks_dispose_list(&dispose); 1700 locks_dispose_list(&dispose);
1678 if (is_deleg) 1701 if (is_deleg)
1679 mutex_unlock(&inode->i_mutex); 1702 mutex_unlock(&inode->i_mutex);
1680 if (!error && !my_before) 1703 if (!error && !my_fl)
1681 *flp = NULL; 1704 *flp = NULL;
1682 return error; 1705 return error;
1683out_unlink:
1684 locks_unlink_lock(before);
1685 goto out;
1686} 1706}
1687 1707
1688static int generic_delete_lease(struct file *filp) 1708static int generic_delete_lease(struct file *filp, void *owner)
1689{ 1709{
1690 int error = -EAGAIN; 1710 int error = -EAGAIN;
1691 struct file_lock *fl, **before; 1711 struct file_lock *fl, *victim = NULL;
1692 struct dentry *dentry = filp->f_path.dentry; 1712 struct dentry *dentry = filp->f_path.dentry;
1693 struct inode *inode = dentry->d_inode; 1713 struct inode *inode = dentry->d_inode;
1714 struct file_lock_context *ctx = inode->i_flctx;
1694 LIST_HEAD(dispose); 1715 LIST_HEAD(dispose);
1695 1716
1696 spin_lock(&inode->i_lock); 1717 if (!ctx) {
1697 time_out_leases(inode, &dispose); 1718 trace_generic_delete_lease(inode, NULL);
1698 for (before = &inode->i_flock; 1719 return error;
1699 ((fl = *before) != NULL) && IS_LEASE(fl); 1720 }
1700 before = &fl->fl_next) { 1721
1701 if (fl->fl_file == filp) 1722 spin_lock(&ctx->flc_lock);
1723 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1724 if (fl->fl_file == filp &&
1725 fl->fl_owner == owner) {
1726 victim = fl;
1702 break; 1727 break;
1728 }
1703 } 1729 }
1704 trace_generic_delete_lease(inode, fl); 1730 trace_generic_delete_lease(inode, fl);
1705 if (fl && IS_LEASE(fl)) 1731 if (victim)
1706 error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); 1732 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
1707 spin_unlock(&inode->i_lock); 1733 spin_unlock(&ctx->flc_lock);
1708 locks_dispose_list(&dispose); 1734 locks_dispose_list(&dispose);
1709 return error; 1735 return error;
1710} 1736}
@@ -1737,13 +1763,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
1737 1763
1738 switch (arg) { 1764 switch (arg) {
1739 case F_UNLCK: 1765 case F_UNLCK:
1740 return generic_delete_lease(filp); 1766 return generic_delete_lease(filp, *priv);
1741 case F_RDLCK: 1767 case F_RDLCK:
1742 case F_WRLCK: 1768 case F_WRLCK:
1743 if (!(*flp)->fl_lmops->lm_break) { 1769 if (!(*flp)->fl_lmops->lm_break) {
1744 WARN_ON_ONCE(1); 1770 WARN_ON_ONCE(1);
1745 return -ENOLCK; 1771 return -ENOLCK;
1746 } 1772 }
1773
1747 return generic_add_lease(filp, arg, flp, priv); 1774 return generic_add_lease(filp, arg, flp, priv);
1748 default: 1775 default:
1749 return -EINVAL; 1776 return -EINVAL;
@@ -1816,7 +1843,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1816int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1843int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1817{ 1844{
1818 if (arg == F_UNLCK) 1845 if (arg == F_UNLCK)
1819 return vfs_setlease(filp, F_UNLCK, NULL, NULL); 1846 return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
1820 return do_fcntl_add_lease(fd, filp, arg); 1847 return do_fcntl_add_lease(fd, filp, arg);
1821} 1848}
1822 1849
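
With generic_delete_lease() now matching on both fl_file and fl_owner, fcntl_setlease() passes the filp itself as owner, so F_UNLCK drops only the lease set through that open file description. A small sketch of the round trip (assumptions: /tmp/leasedemo2 is owned by the caller and is illustrative; a read lease requires a read-only descriptor):

#define _GNU_SOURCE
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
    int fd = open("/tmp/leasedemo2", O_RDONLY | O_CREAT, 0644);

    if (fd == -1) {
        perror("open");
        return 1;
    }
    if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1) {
        perror("F_SETLEASE");
        return 1;
    }
    assert(fcntl(fd, F_GETLEASE) == F_RDLCK);
    fcntl(fd, F_SETLEASE, F_UNLCK);     /* unlock via the same description */
    assert(fcntl(fd, F_GETLEASE) == F_UNLCK);
    puts("lease taken and dropped on one open file description");
    return 0;
}
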
@@ -2171,7 +2198,7 @@ again:
2171 */ 2198 */
2172 /* 2199 /*
2173 * we need that spin_lock here - it prevents reordering between 2200 * we need that spin_lock here - it prevents reordering between
2174 * update of inode->i_flock and check for it done in close(). 2201 * update of i_flctx->flc_posix and check for it done in close().
2175 * rcu_read_lock() wouldn't do. 2202 * rcu_read_lock() wouldn't do.
2176 */ 2203 */
2177 spin_lock(&current->files->file_lock); 2204 spin_lock(&current->files->file_lock);
@@ -2331,13 +2358,14 @@ out:
2331void locks_remove_posix(struct file *filp, fl_owner_t owner) 2358void locks_remove_posix(struct file *filp, fl_owner_t owner)
2332{ 2359{
2333 struct file_lock lock; 2360 struct file_lock lock;
2361 struct file_lock_context *ctx = file_inode(filp)->i_flctx;
2334 2362
2335 /* 2363 /*
2336 * If there are no locks held on this file, we don't need to call 2364 * If there are no locks held on this file, we don't need to call
2337 * posix_lock_file(). Another process could be setting a lock on this 2365 * posix_lock_file(). Another process could be setting a lock on this
2338 * file at the same time, but we wouldn't remove that lock anyway. 2366 * file at the same time, but we wouldn't remove that lock anyway.
2339 */ 2367 */
2340 if (!file_inode(filp)->i_flock) 2368 if (!ctx || list_empty(&ctx->flc_posix))
2341 return; 2369 return;
2342 2370
2343 lock.fl_type = F_UNLCK; 2371 lock.fl_type = F_UNLCK;
@@ -2358,67 +2386,68 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
2358 2386
2359EXPORT_SYMBOL(locks_remove_posix); 2387EXPORT_SYMBOL(locks_remove_posix);
2360 2388
2389/* The i_flctx must be valid when calling into here */
2390static void
2391locks_remove_flock(struct file *filp)
2392{
2393 struct file_lock fl = {
2394 .fl_owner = filp,
2395 .fl_pid = current->tgid,
2396 .fl_file = filp,
2397 .fl_flags = FL_FLOCK,
2398 .fl_type = F_UNLCK,
2399 .fl_end = OFFSET_MAX,
2400 };
2401 struct file_lock_context *flctx = file_inode(filp)->i_flctx;
2402
2403 if (list_empty(&flctx->flc_flock))
2404 return;
2405
2406 if (filp->f_op->flock)
2407 filp->f_op->flock(filp, F_SETLKW, &fl);
2408 else
2409 flock_lock_file(filp, &fl);
2410
2411 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2412 fl.fl_ops->fl_release_private(&fl);
2413}
2414
2415/* The i_flctx must be valid when calling into here */
2416static void
2417locks_remove_lease(struct file *filp)
2418{
2419 struct inode *inode = file_inode(filp);
2420 struct file_lock_context *ctx = inode->i_flctx;
2421 struct file_lock *fl, *tmp;
2422 LIST_HEAD(dispose);
2423
2424 if (list_empty(&ctx->flc_lease))
2425 return;
2426
2427 spin_lock(&ctx->flc_lock);
2428 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
2429 if (filp == fl->fl_file)
2430 lease_modify(fl, F_UNLCK, &dispose);
2431 spin_unlock(&ctx->flc_lock);
2432 locks_dispose_list(&dispose);
2433}
2434
2361/* 2435/*
2362 * This function is called on the last close of an open file. 2436 * This function is called on the last close of an open file.
2363 */ 2437 */
2364void locks_remove_file(struct file *filp) 2438void locks_remove_file(struct file *filp)
2365{ 2439{
2366 struct inode * inode = file_inode(filp); 2440 if (!file_inode(filp)->i_flctx)
2367 struct file_lock *fl;
2368 struct file_lock **before;
2369 LIST_HEAD(dispose);
2370
2371 if (!inode->i_flock)
2372 return; 2441 return;
2373 2442
2443 /* remove any OFD locks */
2374 locks_remove_posix(filp, filp); 2444 locks_remove_posix(filp, filp);
2375 2445
2376 if (filp->f_op->flock) { 2446 /* remove flock locks */
2377 struct file_lock fl = { 2447 locks_remove_flock(filp);
2378 .fl_owner = filp,
2379 .fl_pid = current->tgid,
2380 .fl_file = filp,
2381 .fl_flags = FL_FLOCK,
2382 .fl_type = F_UNLCK,
2383 .fl_end = OFFSET_MAX,
2384 };
2385 filp->f_op->flock(filp, F_SETLKW, &fl);
2386 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2387 fl.fl_ops->fl_release_private(&fl);
2388 }
2389
2390 spin_lock(&inode->i_lock);
2391 before = &inode->i_flock;
2392 2448
2393 while ((fl = *before) != NULL) { 2449 /* remove any leases */
2394 if (fl->fl_file == filp) { 2450 locks_remove_lease(filp);
2395 if (IS_LEASE(fl)) {
2396 lease_modify(before, F_UNLCK, &dispose);
2397 continue;
2398 }
2399
2400 /*
2401 * There's a leftover lock on the list of a type that
2402 * we didn't expect to see. Most likely a classic
2403 * POSIX lock that ended up not getting released
2404 * properly, or that raced onto the list somehow. Log
2405 * some info about it and then just remove it from
2406 * the list.
2407 */
2408 WARN(!IS_FLOCK(fl),
2409 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2410 MAJOR(inode->i_sb->s_dev),
2411 MINOR(inode->i_sb->s_dev), inode->i_ino,
2412 fl->fl_type, fl->fl_flags,
2413 fl->fl_start, fl->fl_end);
2414
2415 locks_delete_lock(before, &dispose);
2416 continue;
2417 }
2418 before = &fl->fl_next;
2419 }
2420 spin_unlock(&inode->i_lock);
2421 locks_dispose_list(&dispose);
2422} 2451}
2423 2452
2424/** 2453/**
@@ -2621,6 +2650,9 @@ static int __init filelock_init(void)
2621{ 2650{
2622 int i; 2651 int i;
2623 2652
2653 flctx_cache = kmem_cache_create("file_lock_ctx",
2654 sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
2655
2624 filelock_cache = kmem_cache_create("file_lock_cache", 2656 filelock_cache = kmem_cache_create("file_lock_cache",
2625 sizeof(struct file_lock), 0, SLAB_PANIC, NULL); 2657 sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
2626 2658
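
filelock_init() now also sizes a slab for the per-inode context object everything above hangs off. The layout itself is not shown in this hunk; inferred from the call sites (the flc_lock spinlock plus the three flc_* lists used throughout locks.c), it is roughly:

/* sketch inferred from usage above, not copied from the header */
struct file_lock_context {
    spinlock_t          flc_lock;
    struct list_head    flc_flock;
    struct list_head    flc_posix;
    struct list_head    flc_lease;
};
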
diff --git a/fs/mount.h b/fs/mount.h
index 0ad6f760ce52..6a61c2b3e385 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -2,6 +2,7 @@
2#include <linux/seq_file.h> 2#include <linux/seq_file.h>
3#include <linux/poll.h> 3#include <linux/poll.h>
4#include <linux/ns_common.h> 4#include <linux/ns_common.h>
5#include <linux/fs_pin.h>
5 6
6struct mnt_namespace { 7struct mnt_namespace {
7 atomic_t count; 8 atomic_t count;
@@ -62,7 +63,8 @@ struct mount {
62 int mnt_group_id; /* peer group identifier */ 63 int mnt_group_id; /* peer group identifier */
63 int mnt_expiry_mark; /* true if marked for expiry */ 64 int mnt_expiry_mark; /* true if marked for expiry */
64 struct hlist_head mnt_pins; 65 struct hlist_head mnt_pins;
65 struct path mnt_ex_mountpoint; 66 struct fs_pin mnt_umount;
67 struct dentry *mnt_ex_mountpoint;
66}; 68};
67 69
68#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ 70#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
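
mnt_umount is the new hook by which a dying mount keeps a pin on its ex-mountpoint until namespace teardown. The structure lives in the new <linux/fs_pin.h>; a rough sketch of its shape, inferred from how drop_mountpoint() and init_fs_pin() are used in fs/namespace.c below:

/* approximate layout; see include/linux/fs_pin.h for the real one */
struct fs_pin {
    wait_queue_head_t   wait;
    int                 done;
    struct hlist_node   s_list;     /* per-group list */
    struct hlist_node   m_list;     /* per-mount list */
    void                (*kill)(struct fs_pin *);
};

static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
{
    init_waitqueue_head(&p->wait);
    INIT_HLIST_NODE(&p->s_list);
    INIT_HLIST_NODE(&p->m_list);
    p->kill = kill;
}
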
diff --git a/fs/namei.c b/fs/namei.c
index bc35b02883bb..96ca11dea4a2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -118,15 +118,6 @@
118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
119 * PATH_MAX includes the nul terminator --RR. 119 * PATH_MAX includes the nul terminator --RR.
120 */ 120 */
121void final_putname(struct filename *name)
122{
123 if (name->separate) {
124 __putname(name->name);
125 kfree(name);
126 } else {
127 __putname(name);
128 }
129}
130 121
131#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) 122#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
132 123
@@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
145 result = __getname(); 136 result = __getname();
146 if (unlikely(!result)) 137 if (unlikely(!result))
147 return ERR_PTR(-ENOMEM); 138 return ERR_PTR(-ENOMEM);
139 result->refcnt = 1;
148 140
149 /* 141 /*
150 * First, try to embed the struct filename inside the names_cache 142 * First, try to embed the struct filename inside the names_cache
@@ -179,6 +171,7 @@ recopy:
179 } 171 }
180 result->name = kname; 172 result->name = kname;
181 result->separate = true; 173 result->separate = true;
174 result->refcnt = 1;
182 max = PATH_MAX; 175 max = PATH_MAX;
183 goto recopy; 176 goto recopy;
184 } 177 }
@@ -202,7 +195,7 @@ recopy:
202 return result; 195 return result;
203 196
204error: 197error:
205 final_putname(result); 198 putname(result);
206 return err; 199 return err;
207} 200}
208 201
@@ -212,43 +205,56 @@ getname(const char __user * filename)
212 return getname_flags(filename, 0, NULL); 205 return getname_flags(filename, 0, NULL);
213} 206}
214 207
215/*
216 * The "getname_kernel()" interface doesn't do pathnames longer
217 * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
218 */
219struct filename * 208struct filename *
220getname_kernel(const char * filename) 209getname_kernel(const char * filename)
221{ 210{
222 struct filename *result; 211 struct filename *result;
223 char *kname; 212 int len = strlen(filename) + 1;
224 int len;
225
226 len = strlen(filename);
227 if (len >= EMBEDDED_NAME_MAX)
228 return ERR_PTR(-ENAMETOOLONG);
229 213
230 result = __getname(); 214 result = __getname();
231 if (unlikely(!result)) 215 if (unlikely(!result))
232 return ERR_PTR(-ENOMEM); 216 return ERR_PTR(-ENOMEM);
233 217
234 kname = (char *)result + sizeof(*result); 218 if (len <= EMBEDDED_NAME_MAX) {
235 result->name = kname; 219 result->name = (char *)(result) + sizeof(*result);
220 result->separate = false;
221 } else if (len <= PATH_MAX) {
222 struct filename *tmp;
223
224 tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
225 if (unlikely(!tmp)) {
226 __putname(result);
227 return ERR_PTR(-ENOMEM);
228 }
229 tmp->name = (char *)result;
230 tmp->separate = true;
231 result = tmp;
232 } else {
233 __putname(result);
234 return ERR_PTR(-ENAMETOOLONG);
235 }
236 memcpy((char *)result->name, filename, len);
236 result->uptr = NULL; 237 result->uptr = NULL;
237 result->aname = NULL; 238 result->aname = NULL;
238 result->separate = false; 239 result->refcnt = 1;
240 audit_getname(result);
239 241
240 strlcpy(kname, filename, EMBEDDED_NAME_MAX);
241 return result; 242 return result;
242} 243}
243 244
244#ifdef CONFIG_AUDITSYSCALL
245void putname(struct filename *name) 245void putname(struct filename *name)
246{ 246{
247 if (unlikely(!audit_dummy_context())) 247 BUG_ON(name->refcnt <= 0);
248 return audit_putname(name); 248
249 final_putname(name); 249 if (--name->refcnt > 0)
250 return;
251
252 if (name->separate) {
253 __putname(name->name);
254 kfree(name);
255 } else
256 __putname(name);
250} 257}
251#endif
252 258
253static int check_acl(struct inode *inode, int mask) 259static int check_acl(struct inode *inode, int mask)
254{ 260{
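
putname() is now unconditional and pairs with the refcnt that getname(), getname_kernel(), and audit manage, so audit can hold a name past the end of the syscall by taking a reference instead of copying it. A minimal userspace analogue of that ownership model (assumption: single-threaded, as the kernel side relies on the name belonging to one task):

#include <stdlib.h>
#include <string.h>

struct name {
    char    *str;
    int     refcnt;
};

static struct name *name_new(const char *s)
{
    struct name *n = malloc(sizeof(*n));

    if (!n)
        return NULL;
    n->str = strdup(s);
    if (!n->str) {
        free(n);
        return NULL;
    }
    n->refcnt = 1;          /* the caller's reference */
    return n;
}

static struct name *name_get(struct name *n)
{
    n->refcnt++;            /* e.g. audit keeping the name alive */
    return n;
}

static void name_put(struct name *n)
{
    if (--n->refcnt > 0)
        return;
    free(n->str);
    free(n);
}
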
@@ -2036,31 +2042,47 @@ static int filename_lookup(int dfd, struct filename *name,
2036static int do_path_lookup(int dfd, const char *name, 2042static int do_path_lookup(int dfd, const char *name,
2037 unsigned int flags, struct nameidata *nd) 2043 unsigned int flags, struct nameidata *nd)
2038{ 2044{
2039 struct filename filename = { .name = name }; 2045 struct filename *filename = getname_kernel(name);
2046 int retval = PTR_ERR(filename);
2040 2047
2041 return filename_lookup(dfd, &filename, flags, nd); 2048 if (!IS_ERR(filename)) {
2049 retval = filename_lookup(dfd, filename, flags, nd);
2050 putname(filename);
2051 }
2052 return retval;
2042} 2053}
2043 2054
2044/* does lookup, returns the object with parent locked */ 2055/* does lookup, returns the object with parent locked */
2045struct dentry *kern_path_locked(const char *name, struct path *path) 2056struct dentry *kern_path_locked(const char *name, struct path *path)
2046{ 2057{
2058 struct filename *filename = getname_kernel(name);
2047 struct nameidata nd; 2059 struct nameidata nd;
2048 struct dentry *d; 2060 struct dentry *d;
2049 int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); 2061 int err;
2050 if (err) 2062
2051 return ERR_PTR(err); 2063 if (IS_ERR(filename))
2064 return ERR_CAST(filename);
2065
2066 err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
2067 if (err) {
2068 d = ERR_PTR(err);
2069 goto out;
2070 }
2052 if (nd.last_type != LAST_NORM) { 2071 if (nd.last_type != LAST_NORM) {
2053 path_put(&nd.path); 2072 path_put(&nd.path);
2054 return ERR_PTR(-EINVAL); 2073 d = ERR_PTR(-EINVAL);
2074 goto out;
2055 } 2075 }
2056 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2076 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2057 d = __lookup_hash(&nd.last, nd.path.dentry, 0); 2077 d = __lookup_hash(&nd.last, nd.path.dentry, 0);
2058 if (IS_ERR(d)) { 2078 if (IS_ERR(d)) {
2059 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2079 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2060 path_put(&nd.path); 2080 path_put(&nd.path);
2061 return d; 2081 goto out;
2062 } 2082 }
2063 *path = nd.path; 2083 *path = nd.path;
2084out:
2085 putname(filename);
2064 return d; 2086 return d;
2065} 2087}
2066 2088
@@ -2351,13 +2373,17 @@ static int
2351filename_mountpoint(int dfd, struct filename *s, struct path *path, 2373filename_mountpoint(int dfd, struct filename *s, struct path *path,
2352 unsigned int flags) 2374 unsigned int flags)
2353{ 2375{
2354 int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); 2376 int error;
2377 if (IS_ERR(s))
2378 return PTR_ERR(s);
2379 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
2355 if (unlikely(error == -ECHILD)) 2380 if (unlikely(error == -ECHILD))
2356 error = path_mountpoint(dfd, s->name, path, flags); 2381 error = path_mountpoint(dfd, s->name, path, flags);
2357 if (unlikely(error == -ESTALE)) 2382 if (unlikely(error == -ESTALE))
2358 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); 2383 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
2359 if (likely(!error)) 2384 if (likely(!error))
2360 audit_inode(s, path->dentry, 0); 2385 audit_inode(s, path->dentry, 0);
2386 putname(s);
2361 return error; 2387 return error;
2362} 2388}
2363 2389
@@ -2379,21 +2405,14 @@ int
2379user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, 2405user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2380 struct path *path) 2406 struct path *path)
2381{ 2407{
2382 struct filename *s = getname(name); 2408 return filename_mountpoint(dfd, getname(name), path, flags);
2383 int error;
2384 if (IS_ERR(s))
2385 return PTR_ERR(s);
2386 error = filename_mountpoint(dfd, s, path, flags);
2387 putname(s);
2388 return error;
2389} 2409}
2390 2410
2391int 2411int
2392kern_path_mountpoint(int dfd, const char *name, struct path *path, 2412kern_path_mountpoint(int dfd, const char *name, struct path *path,
2393 unsigned int flags) 2413 unsigned int flags)
2394{ 2414{
2395 struct filename s = {.name = name}; 2415 return filename_mountpoint(dfd, getname_kernel(name), path, flags);
2396 return filename_mountpoint(dfd, &s, path, flags);
2397} 2416}
2398EXPORT_SYMBOL(kern_path_mountpoint); 2417EXPORT_SYMBOL(kern_path_mountpoint);
2399 2418
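
user_path_mountpoint_at() and kern_path_mountpoint() can shrink to one line because filename_mountpoint() now owns both the IS_ERR() check and the final putname(). The error-pointer convention that makes this composition work, as a freestanding sketch:

#include <errno.h>
#include <stdio.h>

static inline void *ERR_PTR(long err)      { return (void *)err; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)
{
    return (unsigned long)p >= (unsigned long)-4095;
}

static int consume(void *obj)   /* stands in for filename_mountpoint() */
{
    if (IS_ERR(obj))
        return (int)PTR_ERR(obj);   /* the error travelled as the pointer */
    /* ... use obj, then release it ... */
    return 0;
}

int main(void)
{
    printf("%d\n", consume(ERR_PTR(-ENOMEM)));  /* prints -12 */
    return 0;
}
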
@@ -3273,7 +3292,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3273{ 3292{
3274 struct nameidata nd; 3293 struct nameidata nd;
3275 struct file *file; 3294 struct file *file;
3276 struct filename filename = { .name = name }; 3295 struct filename *filename;
3277 int flags = op->lookup_flags | LOOKUP_ROOT; 3296 int flags = op->lookup_flags | LOOKUP_ROOT;
3278 3297
3279 nd.root.mnt = mnt; 3298 nd.root.mnt = mnt;
@@ -3282,15 +3301,20 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3282 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) 3301 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3283 return ERR_PTR(-ELOOP); 3302 return ERR_PTR(-ELOOP);
3284 3303
3285 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); 3304 filename = getname_kernel(name);
3305 if (unlikely(IS_ERR(filename)))
3306 return ERR_CAST(filename);
3307
3308 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
3286 if (unlikely(file == ERR_PTR(-ECHILD))) 3309 if (unlikely(file == ERR_PTR(-ECHILD)))
3287 file = path_openat(-1, &filename, &nd, op, flags); 3310 file = path_openat(-1, filename, &nd, op, flags);
3288 if (unlikely(file == ERR_PTR(-ESTALE))) 3311 if (unlikely(file == ERR_PTR(-ESTALE)))
3289 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); 3312 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
3313 putname(filename);
3290 return file; 3314 return file;
3291} 3315}
3292 3316
3293struct dentry *kern_path_create(int dfd, const char *pathname, 3317static struct dentry *filename_create(int dfd, struct filename *name,
3294 struct path *path, unsigned int lookup_flags) 3318 struct path *path, unsigned int lookup_flags)
3295{ 3319{
3296 struct dentry *dentry = ERR_PTR(-EEXIST); 3320 struct dentry *dentry = ERR_PTR(-EEXIST);
@@ -3305,7 +3329,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
3305 */ 3329 */
3306 lookup_flags &= LOOKUP_REVAL; 3330 lookup_flags &= LOOKUP_REVAL;
3307 3331
3308 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); 3332 error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
3309 if (error) 3333 if (error)
3310 return ERR_PTR(error); 3334 return ERR_PTR(error);
3311 3335
@@ -3359,6 +3383,19 @@ out:
3359 path_put(&nd.path); 3383 path_put(&nd.path);
3360 return dentry; 3384 return dentry;
3361} 3385}
3386
3387struct dentry *kern_path_create(int dfd, const char *pathname,
3388 struct path *path, unsigned int lookup_flags)
3389{
3390 struct filename *filename = getname_kernel(pathname);
3391 struct dentry *res;
3392
3393 if (IS_ERR(filename))
3394 return ERR_CAST(filename);
3395 res = filename_create(dfd, filename, path, lookup_flags);
3396 putname(filename);
3397 return res;
3398}
3362EXPORT_SYMBOL(kern_path_create); 3399EXPORT_SYMBOL(kern_path_create);
3363 3400
3364void done_path_create(struct path *path, struct dentry *dentry) 3401void done_path_create(struct path *path, struct dentry *dentry)
@@ -3377,7 +3414,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname,
3377 struct dentry *res; 3414 struct dentry *res;
3378 if (IS_ERR(tmp)) 3415 if (IS_ERR(tmp))
3379 return ERR_CAST(tmp); 3416 return ERR_CAST(tmp);
3380 res = kern_path_create(dfd, tmp->name, path, lookup_flags); 3417 res = filename_create(dfd, tmp, path, lookup_flags);
3381 putname(tmp); 3418 putname(tmp);
3382 return res; 3419 return res;
3383} 3420}
diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e9681a0cf..72a286e0d33e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -190,6 +190,14 @@ unsigned int mnt_get_count(struct mount *mnt)
190#endif 190#endif
191} 191}
192 192
193static void drop_mountpoint(struct fs_pin *p)
194{
195 struct mount *m = container_of(p, struct mount, mnt_umount);
196 dput(m->mnt_ex_mountpoint);
197 pin_remove(p);
198 mntput(&m->mnt);
199}
200
193static struct mount *alloc_vfsmnt(const char *name) 201static struct mount *alloc_vfsmnt(const char *name)
194{ 202{
195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 203 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -201,7 +209,7 @@ static struct mount *alloc_vfsmnt(const char *name)
201 goto out_free_cache; 209 goto out_free_cache;
202 210
203 if (name) { 211 if (name) {
204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL); 212 mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
205 if (!mnt->mnt_devname) 213 if (!mnt->mnt_devname)
206 goto out_free_id; 214 goto out_free_id;
207 } 215 }
@@ -229,12 +237,13 @@ static struct mount *alloc_vfsmnt(const char *name)
229#ifdef CONFIG_FSNOTIFY 237#ifdef CONFIG_FSNOTIFY
230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 238 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
231#endif 239#endif
240 init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
232 } 241 }
233 return mnt; 242 return mnt;
234 243
235#ifdef CONFIG_SMP 244#ifdef CONFIG_SMP
236out_free_devname: 245out_free_devname:
237 kfree(mnt->mnt_devname); 246 kfree_const(mnt->mnt_devname);
238#endif 247#endif
239out_free_id: 248out_free_id:
240 mnt_free_id(mnt); 249 mnt_free_id(mnt);
@@ -568,7 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
568 577
569static void free_vfsmnt(struct mount *mnt) 578static void free_vfsmnt(struct mount *mnt)
570{ 579{
571 kfree(mnt->mnt_devname); 580 kfree_const(mnt->mnt_devname);
572#ifdef CONFIG_SMP 581#ifdef CONFIG_SMP
573 free_percpu(mnt->mnt_pcp); 582 free_percpu(mnt->mnt_pcp);
574#endif 583#endif
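
alloc_vfsmnt() and free_vfsmnt() switch to the kstrdup_const()/kfree_const() pair, which skips the copy when the source string already sits in kernel rodata, as most device names passed as literals do. Their behaviour is roughly:

/* rough semantics; the real helpers live in mm/util.c */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
    if (is_kernel_rodata((unsigned long)s))
        return s;               /* share the literal, no copy */
    return kstrdup(s, gfp);
}

void kfree_const(const void *x)
{
    if (!is_kernel_rodata((unsigned long)x))
        kfree(x);               /* only free what was duplicated */
}
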
@@ -1289,7 +1298,6 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1289 1298
1290static void namespace_unlock(void) 1299static void namespace_unlock(void)
1291{ 1300{
1292 struct mount *mnt;
1293 struct hlist_head head = unmounted; 1301 struct hlist_head head = unmounted;
1294 1302
1295 if (likely(hlist_empty(&head))) { 1303 if (likely(hlist_empty(&head))) {
@@ -1299,23 +1307,11 @@ static void namespace_unlock(void)
1299 1307
1300 head.first->pprev = &head.first; 1308 head.first->pprev = &head.first;
1301 INIT_HLIST_HEAD(&unmounted); 1309 INIT_HLIST_HEAD(&unmounted);
1302
1303 /* undo decrements we'd done in umount_tree() */
1304 hlist_for_each_entry(mnt, &head, mnt_hash)
1305 if (mnt->mnt_ex_mountpoint.mnt)
1306 mntget(mnt->mnt_ex_mountpoint.mnt);
1307
1308 up_write(&namespace_sem); 1310 up_write(&namespace_sem);
1309 1311
1310 synchronize_rcu(); 1312 synchronize_rcu();
1311 1313
1312 while (!hlist_empty(&head)) { 1314 group_pin_kill(&head);
1313 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1314 hlist_del_init(&mnt->mnt_hash);
1315 if (mnt->mnt_ex_mountpoint.mnt)
1316 path_put(&mnt->mnt_ex_mountpoint);
1317 mntput(&mnt->mnt);
1318 }
1319} 1315}
1320 1316
1321static inline void namespace_lock(void) 1317static inline void namespace_lock(void)
@@ -1334,7 +1330,6 @@ void umount_tree(struct mount *mnt, int how)
1334{ 1330{
1335 HLIST_HEAD(tmp_list); 1331 HLIST_HEAD(tmp_list);
1336 struct mount *p; 1332 struct mount *p;
1337 struct mount *last = NULL;
1338 1333
1339 for (p = mnt; p; p = next_mnt(p, mnt)) { 1334 for (p = mnt; p; p = next_mnt(p, mnt)) {
1340 hlist_del_init_rcu(&p->mnt_hash); 1335 hlist_del_init_rcu(&p->mnt_hash);
@@ -1347,33 +1342,28 @@ void umount_tree(struct mount *mnt, int how)
1347 if (how) 1342 if (how)
1348 propagate_umount(&tmp_list); 1343 propagate_umount(&tmp_list);
1349 1344
1350 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1345 while (!hlist_empty(&tmp_list)) {
1346 p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
1347 hlist_del_init_rcu(&p->mnt_hash);
1351 list_del_init(&p->mnt_expire); 1348 list_del_init(&p->mnt_expire);
1352 list_del_init(&p->mnt_list); 1349 list_del_init(&p->mnt_list);
1353 __touch_mnt_namespace(p->mnt_ns); 1350 __touch_mnt_namespace(p->mnt_ns);
1354 p->mnt_ns = NULL; 1351 p->mnt_ns = NULL;
1355 if (how < 2) 1352 if (how < 2)
1356 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1353 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1354
1355 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
1357 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1358 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1359 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1360 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1361 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* old mountpoint will be dropped when we can do that */
1362 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint = p->mnt_mountpoint;
1363 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1364 p->mnt_mountpoint = p->mnt.mnt_root; 1362 p->mnt_mountpoint = p->mnt.mnt_root;
1365 p->mnt_parent = p; 1363 p->mnt_parent = p;
1366 p->mnt_mp = NULL; 1364 p->mnt_mp = NULL;
1367 } 1365 }
1368 change_mnt_propagation(p, MS_PRIVATE); 1366 change_mnt_propagation(p, MS_PRIVATE);
1369 last = p;
1370 }
1371 if (last) {
1372 last->mnt_hash.next = unmounted.first;
1373 if (unmounted.first)
1374 unmounted.first->pprev = &last->mnt_hash.next;
1375 unmounted.first = tmp_list.first;
1376 unmounted.first->pprev = &unmounted.first;
1377 } 1367 }
1378} 1368}
1379 1369
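
umount_tree() now parks each victim on the shared unmounted list through its mnt_umount pin, and namespace_unlock() hands the whole batch to group_pin_kill() after the RCU grace period. Assuming fs/fs_pin.c keeps the obvious shape, the consumer side amounts to something like:

/* assumed sketch of the group kill loop in fs/fs_pin.c */
void group_pin_kill(struct hlist_head *p)
{
    while (!hlist_empty(p)) {
        struct fs_pin *pin;

        pin = hlist_entry(p->first, struct fs_pin, s_list);
        pin_kill(pin);  /* ends in pin->kill(pin), i.e. drop_mountpoint() */
    }
}
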
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 008960101520..e7ca827d7694 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -77,6 +77,7 @@ static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77static int ncp_compare_dentry(const struct dentry *, const struct dentry *, 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
79static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
80static void ncp_d_prune(struct dentry *dentry);
80 81
81const struct dentry_operations ncp_dentry_operations = 82const struct dentry_operations ncp_dentry_operations =
82{ 83{
@@ -84,6 +85,7 @@ const struct dentry_operations ncp_dentry_operations =
84 .d_hash = ncp_hash_dentry, 85 .d_hash = ncp_hash_dentry,
85 .d_compare = ncp_compare_dentry, 86 .d_compare = ncp_compare_dentry,
86 .d_delete = ncp_delete_dentry, 87 .d_delete = ncp_delete_dentry,
88 .d_prune = ncp_d_prune,
87}; 89};
88 90
89#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
@@ -384,42 +386,6 @@ finished:
384 return val; 386 return val;
385} 387}
386 388
387static struct dentry *
388ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
389{
390 struct dentry *dent = dentry;
391
392 if (d_validate(dent, parent)) {
393 if (dent->d_name.len <= NCP_MAXPATHLEN &&
394 (unsigned long)dent->d_fsdata == fpos) {
395 if (!dent->d_inode) {
396 dput(dent);
397 dent = NULL;
398 }
399 return dent;
400 }
401 dput(dent);
402 }
403
404 /* If a pointer is invalid, we search the dentry. */
405 spin_lock(&parent->d_lock);
406 list_for_each_entry(dent, &parent->d_subdirs, d_child) {
407 if ((unsigned long)dent->d_fsdata == fpos) {
408 if (dent->d_inode)
409 dget(dent);
410 else
411 dent = NULL;
412 spin_unlock(&parent->d_lock);
413 goto out;
414 }
415 }
416 spin_unlock(&parent->d_lock);
417 return NULL;
418
419out:
420 return dent;
421}
422
423static time_t ncp_obtain_mtime(struct dentry *dentry) 389static time_t ncp_obtain_mtime(struct dentry *dentry)
424{ 390{
425 struct inode *inode = dentry->d_inode; 391 struct inode *inode = dentry->d_inode;
@@ -435,6 +401,20 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
435 return ncp_date_dos2unix(i.modifyTime, i.modifyDate); 401 return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
436} 402}
437 403
404static inline void
405ncp_invalidate_dircache_entries(struct dentry *parent)
406{
407 struct ncp_server *server = NCP_SERVER(parent->d_inode);
408 struct dentry *dentry;
409
410 spin_lock(&parent->d_lock);
411 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
412 dentry->d_fsdata = NULL;
413 ncp_age_dentry(server, dentry);
414 }
415 spin_unlock(&parent->d_lock);
416}
417
438static int ncp_readdir(struct file *file, struct dir_context *ctx) 418static int ncp_readdir(struct file *file, struct dir_context *ctx)
439{ 419{
440 struct dentry *dentry = file->f_path.dentry; 420 struct dentry *dentry = file->f_path.dentry;
@@ -500,10 +480,21 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
500 struct dentry *dent; 480 struct dentry *dent;
501 bool over; 481 bool over;
502 482
503 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], 483 spin_lock(&dentry->d_lock);
504 dentry, ctx->pos); 484 if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) {
505 if (!dent) 485 spin_unlock(&dentry->d_lock);
486 goto invalid_cache;
487 }
488 dent = ctl.cache->dentry[ctl.idx];
489 if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) {
490 spin_unlock(&dentry->d_lock);
491 goto invalid_cache;
492 }
493 spin_unlock(&dentry->d_lock);
494 if (!dent->d_inode) {
495 dput(dent);
506 goto invalid_cache; 496 goto invalid_cache;
497 }
507 over = !dir_emit(ctx, dent->d_name.name, 498 over = !dir_emit(ctx, dent->d_name.name,
508 dent->d_name.len, 499 dent->d_name.len,
509 dent->d_inode->i_ino, DT_UNKNOWN); 500 dent->d_inode->i_ino, DT_UNKNOWN);
@@ -548,6 +539,9 @@ init_cache:
548 ctl.filled = 0; 539 ctl.filled = 0;
549 ctl.valid = 1; 540 ctl.valid = 1;
550read_really: 541read_really:
542 spin_lock(&dentry->d_lock);
543 NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE;
544 spin_unlock(&dentry->d_lock);
551 if (ncp_is_server_root(inode)) { 545 if (ncp_is_server_root(inode)) {
552 ncp_read_volume_list(file, ctx, &ctl); 546 ncp_read_volume_list(file, ctx, &ctl);
553 } else { 547 } else {
@@ -573,6 +567,13 @@ out:
573 return result; 567 return result;
574} 568}
575 569
570static void ncp_d_prune(struct dentry *dentry)
571{
572 if (!dentry->d_fsdata) /* not referenced from page cache */
573 return;
574 NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
575}
576
576static int 577static int
577ncp_fill_cache(struct file *file, struct dir_context *ctx, 578ncp_fill_cache(struct file *file, struct dir_context *ctx,
578 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, 579 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
@@ -630,6 +631,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
630 d_instantiate(newdent, inode); 631 d_instantiate(newdent, inode);
631 if (!hashed) 632 if (!hashed)
632 d_rehash(newdent); 633 d_rehash(newdent);
634 } else {
635 spin_lock(&dentry->d_lock);
636 NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
637 spin_unlock(&dentry->d_lock);
633 } 638 }
634 } else { 639 } else {
635 struct inode *inode = newdent->d_inode; 640 struct inode *inode = newdent->d_inode;
@@ -639,12 +644,6 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
639 mutex_unlock(&inode->i_mutex); 644 mutex_unlock(&inode->i_mutex);
640 } 645 }
641 646
642 if (newdent->d_inode) {
643 ino = newdent->d_inode->i_ino;
644 newdent->d_fsdata = (void *) ctl.fpos;
645 ncp_new_dentry(newdent);
646 }
647
648 if (ctl.idx >= NCP_DIRCACHE_SIZE) { 647 if (ctl.idx >= NCP_DIRCACHE_SIZE) {
649 if (ctl.page) { 648 if (ctl.page) {
650 kunmap(ctl.page); 649 kunmap(ctl.page);
@@ -660,8 +659,13 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
660 ctl.cache = kmap(ctl.page); 659 ctl.cache = kmap(ctl.page);
661 } 660 }
662 if (ctl.cache) { 661 if (ctl.cache) {
663 ctl.cache->dentry[ctl.idx] = newdent; 662 if (newdent->d_inode) {
664 valid = 1; 663 newdent->d_fsdata = newdent;
664 ctl.cache->dentry[ctl.idx] = newdent;
665 ino = newdent->d_inode->i_ino;
666 ncp_new_dentry(newdent);
667 }
668 valid = 1;
665 } 669 }
666 dput(newdent); 670 dput(newdent);
667end_advance: 671end_advance:
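
Taken together, the dir.c hunks replace d_validate() with a flag handshake: ncp_readdir() sets NCPI_DIR_CACHE while (re)filling the cache, ncp_d_prune() clears it when any cached child dentry is evicted, and the lookup side trusts a cached dentry pointer only after re-checking the flag and pinning the entry with lockref_get_not_dead() under d_lock. The same pattern in a generic userspace sketch (all names here are illustrative; one lock guards both the flag and the slots):

#include <pthread.h>
#include <stdbool.h>

#define SLOTS 64

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool cache_valid;
static void *slot[SLOTS];

void *lookup(int i)
{
    void *p = NULL;

    pthread_mutex_lock(&lock);
    if (cache_valid)            /* cleared by evict() below */
        p = slot[i];            /* a real cache would also pin p here */
    pthread_mutex_unlock(&lock);
    return p;                   /* NULL means: rebuild the cache */
}

void evict(int i)               /* plays the role of ->d_prune */
{
    pthread_mutex_lock(&lock);
    slot[i] = NULL;
    cache_valid = false;        /* one eviction invalidates the lot */
    pthread_mutex_unlock(&lock);
}
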
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
267 if (inode) { 267 if (inode) {
268 atomic_set(&NCP_FINFO(inode)->opened, info->opened); 268 atomic_set(&NCP_FINFO(inode)->opened, info->opened);
269 269
270 inode->i_mapping->backing_dev_info = sb->s_bdi;
271 inode->i_ino = info->ino; 270 inode->i_ino = info->ino;
272 ncp_set_attr(inode, info); 271 ncp_set_attr(inode, info);
273 if (S_ISREG(inode->i_mode)) { 272 if (S_ISREG(inode->i_mode)) {
@@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
560 server = NCP_SBP(sb); 559 server = NCP_SBP(sb);
561 memset(server, 0, sizeof(*server)); 560 memset(server, 0, sizeof(*server));
562 561
563 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); 562 error = bdi_setup_and_register(&server->bdi, "ncpfs");
564 if (error) 563 if (error)
565 goto out_fput; 564 goto out_fput;
566 565
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
index 4b0bec477846..c4794504f843 100644
--- a/fs/ncpfs/ncp_fs_i.h
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -22,6 +22,7 @@ struct ncp_inode_info {
22 int access; 22 int access;
23 int flags; 23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001 24#define NCPI_KLUDGE_SYMLINK 0x0001
25#define NCPI_DIR_CACHE 0x0002
25 __u8 file_handle[6]; 26 __u8 file_handle[6];
26 struct inode vfs_inode; 27 struct inode vfs_inode;
27}; 28};
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index b785f74bfe3c..250e443a07f3 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -184,36 +184,6 @@ ncp_new_dentry(struct dentry* dentry)
184 dentry->d_time = jiffies; 184 dentry->d_time = jiffies;
185} 185}
186 186
187static inline void
188ncp_renew_dentries(struct dentry *parent)
189{
190 struct ncp_server *server = NCP_SERVER(parent->d_inode);
191 struct dentry *dentry;
192
193 spin_lock(&parent->d_lock);
194 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
195 if (dentry->d_fsdata == NULL)
196 ncp_age_dentry(server, dentry);
197 else
198 ncp_new_dentry(dentry);
199 }
200 spin_unlock(&parent->d_lock);
201}
202
203static inline void
204ncp_invalidate_dircache_entries(struct dentry *parent)
205{
206 struct ncp_server *server = NCP_SERVER(parent->d_inode);
207 struct dentry *dentry;
208
209 spin_lock(&parent->d_lock);
210 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
211 dentry->d_fsdata = NULL;
212 ncp_age_dentry(server, dentry);
213 }
214 spin_unlock(&parent->d_lock);
215}
216
217struct ncp_cache_head { 187struct ncp_cache_head {
218 time_t mtime; 188 time_t mtime;
219 unsigned long time; /* cache age */ 189 unsigned long time; /* cache age */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 3dece03f2fc8..c7abc10279af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT
128 depends on NFS_V4_1 && SCSI_OSD_ULD 128 depends on NFS_V4_1 && SCSI_OSD_ULD
129 default NFS_V4 129 default NFS_V4
130 130
131config PNFS_FLEXFILE_LAYOUT
132 tristate
133 depends on NFS_V4_1 && NFS_V3
134 default m
135
131config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN 136config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
132 string "NFSv4.1 Implementation ID Domain" 137 string "NFSv4.1 Implementation ID Domain"
133 depends on NFS_V4_1 138 depends on NFS_V4_1
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 04cb830fa09f..1e987acf20c9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -27,9 +27,10 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
27 dns_resolve.o nfs4trace.o 27 dns_resolve.o nfs4trace.o
28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o 28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o 31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
32 32
33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ 33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ 35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
36obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 77fec6a55f57..1cac3c175d18 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -860,12 +860,14 @@ static const struct nfs_pageio_ops bl_pg_read_ops = {
860 .pg_init = bl_pg_init_read, 860 .pg_init = bl_pg_init_read,
861 .pg_test = bl_pg_test_read, 861 .pg_test = bl_pg_test_read,
862 .pg_doio = pnfs_generic_pg_readpages, 862 .pg_doio = pnfs_generic_pg_readpages,
863 .pg_cleanup = pnfs_generic_pg_cleanup,
863}; 864};
864 865
865static const struct nfs_pageio_ops bl_pg_write_ops = { 866static const struct nfs_pageio_ops bl_pg_write_ops = {
866 .pg_init = bl_pg_init_write, 867 .pg_init = bl_pg_init_write,
867 .pg_test = bl_pg_test_write, 868 .pg_test = bl_pg_test_write,
868 .pg_doio = pnfs_generic_pg_writepages, 869 .pg_doio = pnfs_generic_pg_writepages,
870 .pg_cleanup = pnfs_generic_pg_cleanup,
869}; 871};
870 872
871static struct pnfs_layoutdriver_type blocklayout_type = { 873static struct pnfs_layoutdriver_type blocklayout_type = {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index b8fb3a4ef649..351be9205bf8 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp)
128 if (try_to_freeze()) 128 if (try_to_freeze())
129 continue; 129 continue;
130 130
131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
132 spin_lock_bh(&serv->sv_cb_lock); 132 spin_lock_bh(&serv->sv_cb_lock);
133 if (!list_empty(&serv->sv_cb_list)) { 133 if (!list_empty(&serv->sv_cb_list)) {
134 req = list_first_entry(&serv->sv_cb_list, 134 req = list_first_entry(&serv->sv_cb_list,
135 struct rpc_rqst, rq_bc_list); 135 struct rpc_rqst, rq_bc_list);
136 list_del(&req->rq_bc_list); 136 list_del(&req->rq_bc_list);
137 spin_unlock_bh(&serv->sv_cb_lock); 137 spin_unlock_bh(&serv->sv_cb_lock);
138 finish_wait(&serv->sv_cb_waitq, &wq);
138 dprintk("Invoking bc_svc_process()\n"); 139 dprintk("Invoking bc_svc_process()\n");
139 error = bc_svc_process(serv, req, rqstp); 140 error = bc_svc_process(serv, req, rqstp);
140 dprintk("bc_svc_process() returned w/ error code= %d\n", 141 dprintk("bc_svc_process() returned w/ error code= %d\n",
141 error); 142 error);
142 } else { 143 } else {
143 spin_unlock_bh(&serv->sv_cb_lock); 144 spin_unlock_bh(&serv->sv_cb_lock);
144 schedule(); 145 /* schedule_timeout to game the hung task watchdog */
146 schedule_timeout(60 * HZ);
147 finish_wait(&serv->sv_cb_waitq, &wq);
145 } 148 }
146 finish_wait(&serv->sv_cb_waitq, &wq);
147 } 149 }
148 return 0; 150 return 0;
149} 151}
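The rework above switches the callback thread to TASK_UNINTERRUPTIBLE and bounds each sleep with schedule_timeout(60 * HZ); as the added comment notes, the finite timeout keeps the hung-task watchdog (which flags overly long uninterruptible sleeps) quiet, and a spurious timeout is harmless because the loop simply re-checks the request list. A rough userspace analogue of that bounded-wait loop, using a condition variable with illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool have_request;       /* !list_empty(&serv->sv_cb_list) analogue */

static void wait_for_request(void)
{
        struct timespec ts;

        pthread_mutex_lock(&lock);
        while (!have_request) {
                clock_gettime(CLOCK_REALTIME, &ts);
                ts.tv_sec += 60;        /* like schedule_timeout(60 * HZ) */
                /* timing out is fine: we just re-test the condition */
                pthread_cond_timedwait(&cond, &lock, &ts);
        }
        have_request = false;           /* consume one request */
        pthread_mutex_unlock(&lock);
}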
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f3f60641344..da5433230bb1 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -85,25 +85,30 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
85{ 85{
86 struct inode *inode = state->inode; 86 struct inode *inode = state->inode;
87 struct file_lock *fl; 87 struct file_lock *fl;
88 struct file_lock_context *flctx = inode->i_flctx;
89 struct list_head *list;
88 int status = 0; 90 int status = 0;
89 91
90 if (inode->i_flock == NULL) 92 if (flctx == NULL)
91 goto out; 93 goto out;
92 94
93 /* Protect inode->i_flock using the i_lock */ 95 list = &flctx->flc_posix;
94 spin_lock(&inode->i_lock); 96 spin_lock(&flctx->flc_lock);
95 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 97restart:
96 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 98 list_for_each_entry(fl, list, fl_list) {
97 continue;
98 if (nfs_file_open_context(fl->fl_file) != ctx) 99 if (nfs_file_open_context(fl->fl_file) != ctx)
99 continue; 100 continue;
100 spin_unlock(&inode->i_lock); 101 spin_unlock(&flctx->flc_lock);
101 status = nfs4_lock_delegation_recall(fl, state, stateid); 102 status = nfs4_lock_delegation_recall(fl, state, stateid);
102 if (status < 0) 103 if (status < 0)
103 goto out; 104 goto out;
104 spin_lock(&inode->i_lock); 105 spin_lock(&flctx->flc_lock);
105 } 106 }
106 spin_unlock(&inode->i_lock); 107 if (list == &flctx->flc_posix) {
108 list = &flctx->flc_flock;
109 goto restart;
110 }
111 spin_unlock(&flctx->flc_lock);
107out: 112out:
108 return status; 113 return status;
109} 114}
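The converted walk above visits flctx->flc_posix first, then re-runs the same loop body over flctx->flc_flock by swapping the list pointer and jumping back to the restart label, all under flc_lock. A standalone sketch of that two-list restart idiom, using a hypothetical node type rather than the kernel's file_lock:

struct node { struct node *next; int v; };

static int visit(struct node *n) { return n->v < 0 ? -1 : 0; }

static int walk_both(struct node *list_a, struct node *list_b)
{
        struct node *list = list_a, *n;
        int status = 0;

restart:
        for (n = list; n; n = n->next) {
                status = visit(n);
                if (status < 0)
                        return status;
        }
        if (list == list_a) {   /* first pass done, redo on second list */
                list = list_b;
                goto restart;
        }
        return status;
}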
@@ -301,6 +306,17 @@ nfs_inode_detach_delegation(struct inode *inode)
301 return nfs_detach_delegation(nfsi, delegation, server); 306 return nfs_detach_delegation(nfsi, delegation, server);
302} 307}
303 308
309static void
310nfs_update_inplace_delegation(struct nfs_delegation *delegation,
311 const struct nfs_delegation *update)
312{
313 if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
314 delegation->stateid.seqid = update->stateid.seqid;
315 smp_wmb();
316 delegation->type = update->type;
317 }
318}
319
304/** 320/**
305 * nfs_inode_set_delegation - set up a delegation on an inode 321 * nfs_inode_set_delegation - set up a delegation on an inode
306 * @inode: inode to which delegation applies 322 * @inode: inode to which delegation applies
@@ -334,9 +350,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
334 old_delegation = rcu_dereference_protected(nfsi->delegation, 350 old_delegation = rcu_dereference_protected(nfsi->delegation,
335 lockdep_is_held(&clp->cl_lock)); 351 lockdep_is_held(&clp->cl_lock));
336 if (old_delegation != NULL) { 352 if (old_delegation != NULL) {
337 if (nfs4_stateid_match(&delegation->stateid, 353 /* Is this an update of the existing delegation? */
338 &old_delegation->stateid) && 354 if (nfs4_stateid_match_other(&old_delegation->stateid,
339 delegation->type == old_delegation->type) { 355 &delegation->stateid)) {
356 nfs_update_inplace_delegation(old_delegation,
357 delegation);
358 nfsi->delegation_state = old_delegation->type;
340 goto out; 359 goto out;
341 } 360 }
342 /* 361 /*
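nfs_update_inplace_delegation above orders its two stores deliberately: the new seqid is written first, smp_wmb() publishes it, and only then is the delegation type updated, so a reader that observes the new type also sees the matching seqid (assuming the read side pairs with a read barrier). A C11 analogue of that store-store ordering using release/acquire semantics, with an illustrative struct rather than the kernel's:

#include <stdatomic.h>
#include <stdint.h>

struct delegation {
        uint32_t seqid;
        _Atomic uint32_t type;
};

/* Writer: the seqid must be visible before the new type is. */
static void update(struct delegation *d, uint32_t seqid, uint32_t type)
{
        d->seqid = seqid;
        /* release plays the role of smp_wmb() for the acquire below */
        atomic_store_explicit(&d->type, type, memory_order_release);
}

/* Reader: observing the new type guarantees seeing the new seqid. */
static uint32_t read_state(struct delegation *d, uint32_t *type)
{
        *type = atomic_load_explicit(&d->type, memory_order_acquire);
        return d->seqid;
}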
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 294692ff83b1..7077521acdf4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -66,6 +66,10 @@ static struct kmem_cache *nfs_direct_cachep;
66/* 66/*
67 * This represents a set of asynchronous requests that we're waiting on 67 * This represents a set of asynchronous requests that we're waiting on
68 */ 68 */
69struct nfs_direct_mirror {
70 ssize_t count;
71};
72
69struct nfs_direct_req { 73struct nfs_direct_req {
70 struct kref kref; /* release manager */ 74 struct kref kref; /* release manager */
71 75
@@ -78,8 +82,13 @@ struct nfs_direct_req {
78 /* completion state */ 82 /* completion state */
79 atomic_t io_count; /* i/os we're waiting for */ 83 atomic_t io_count; /* i/os we're waiting for */
80 spinlock_t lock; /* protect completion state */ 84 spinlock_t lock; /* protect completion state */
85
86 struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
87 int mirror_count;
88
81 ssize_t count, /* bytes actually processed */ 89 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */ 90 bytes_left, /* bytes left to be sent */
91 io_start, /* start of IO */
83 error; /* any reported error */ 92 error; /* any reported error */
84 struct completion completion; /* wait for i/o completion */ 93 struct completion completion; /* wait for i/o completion */
85 94
@@ -108,26 +117,56 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
108 return atomic_dec_and_test(&dreq->io_count); 117 return atomic_dec_and_test(&dreq->io_count);
109} 118}
110 119
120void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
121{
122 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
123}
124EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
125
126static void
127nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
128{
129 int i;
130 ssize_t count;
131
132 WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
133
134 count = dreq->mirrors[hdr->pgio_mirror_idx].count;
135 if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
136 count = hdr->io_start + hdr->good_bytes - dreq->io_start;
137 dreq->mirrors[hdr->pgio_mirror_idx].count = count;
138 }
139
140 /* update the dreq->count by finding the minimum agreed count from all
141 * mirrors */
142 count = dreq->mirrors[0].count;
143
144 for (i = 1; i < dreq->mirror_count; i++)
145 count = min(count, dreq->mirrors[i].count);
146
147 dreq->count = count;
148}
149
111/* 150/*
112 * nfs_direct_select_verf - select the right verifier 151 * nfs_direct_select_verf - select the right verifier
113 * @dreq - direct request possibly spanning multiple servers 152 * @dreq - direct request possibly spanning multiple servers
114 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs 153 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
115 * @ds_idx - index of data server in data server list, only valid if ds_clp set 154 * @commit_idx - commit bucket index for the DS
116 * 155 *
117 * returns the correct verifier to use given the role of the server 156 * returns the correct verifier to use given the role of the server
118 */ 157 */
119static struct nfs_writeverf * 158static struct nfs_writeverf *
120nfs_direct_select_verf(struct nfs_direct_req *dreq, 159nfs_direct_select_verf(struct nfs_direct_req *dreq,
121 struct nfs_client *ds_clp, 160 struct nfs_client *ds_clp,
122 int ds_idx) 161 int commit_idx)
123{ 162{
124 struct nfs_writeverf *verfp = &dreq->verf; 163 struct nfs_writeverf *verfp = &dreq->verf;
125 164
126#ifdef CONFIG_NFS_V4_1 165#ifdef CONFIG_NFS_V4_1
127 if (ds_clp) { 166 if (ds_clp) {
128 /* pNFS is in use, use the DS verf */ 167 /* pNFS is in use, use the DS verf */
129 if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) 168 if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
130 verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; 169 verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
131 else 170 else
132 WARN_ON_ONCE(1); 171 WARN_ON_ONCE(1);
133 } 172 }
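nfs_direct_good_bytes above is the heart of mirrored O_DIRECT accounting: each mirror's completed-byte count is advanced to cover the header's good bytes (measured from dreq->io_start), and the request's overall count becomes the minimum across all mirrors, since data is only durable once every mirror holds it. A simplified standalone version of the arithmetic:

struct mirror { long count; };

/* Record that hdr_good bytes starting at hdr_start completed on mirror
 * idx; return the number of bytes now complete on *all* mirrors. */
static long good_bytes(struct mirror *m, int nmirrors, int idx,
                       long io_start, long hdr_start, long hdr_good)
{
        long min;
        int i;

        if (m[idx].count + io_start < hdr_start + hdr_good)
                m[idx].count = hdr_start + hdr_good - io_start;

        min = m[0].count;
        for (i = 1; i < nmirrors; i++)
                if (m[i].count < min)
                        min = m[i].count;
        return min;
}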
@@ -148,8 +187,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
148{ 187{
149 struct nfs_writeverf *verfp; 188 struct nfs_writeverf *verfp;
150 189
151 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 190 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
152 hdr->ds_idx);
153 WARN_ON_ONCE(verfp->committed >= 0); 191 WARN_ON_ONCE(verfp->committed >= 0);
154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 192 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
155 WARN_ON_ONCE(verfp->committed < 0); 193 WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +207,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
169{ 207{
170 struct nfs_writeverf *verfp; 208 struct nfs_writeverf *verfp;
171 209
172 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 210 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
173 hdr->ds_idx);
174 if (verfp->committed < 0) { 211 if (verfp->committed < 0) {
175 nfs_direct_set_hdr_verf(dreq, hdr); 212 nfs_direct_set_hdr_verf(dreq, hdr);
176 return 0; 213 return 0;
@@ -193,7 +230,11 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
193 230
194 verfp = nfs_direct_select_verf(dreq, data->ds_clp, 231 verfp = nfs_direct_select_verf(dreq, data->ds_clp,
195 data->ds_commit_index); 232 data->ds_commit_index);
196 WARN_ON_ONCE(verfp->committed < 0); 233
234 /* verifier not set so always fail */
235 if (verfp->committed < 0)
236 return 1;
237
197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 238 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
198} 239}
199 240
@@ -249,6 +290,18 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
249 cinfo->completion_ops = &nfs_direct_commit_completion_ops; 290 cinfo->completion_ops = &nfs_direct_commit_completion_ops;
250} 291}
251 292
293static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
294 struct nfs_pageio_descriptor *pgio,
295 struct nfs_page *req)
296{
297 int mirror_count = 1;
298
299 if (pgio->pg_ops->pg_get_mirror_count)
300 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
301
302 dreq->mirror_count = mirror_count;
303}
304
252static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 305static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
253{ 306{
254 struct nfs_direct_req *dreq; 307 struct nfs_direct_req *dreq;
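nfs_direct_setup_mirroring above uses the usual optional-callback idiom: ask the pageio ops for a mirror count only when the layout supplies a pg_get_mirror_count hook, and default to a single mirror otherwise. A minimal sketch of the dispatch, with a hypothetical ops struct:

struct req;

struct pgio_ops {
        /* optional: layouts that do not mirror leave this NULL */
        int (*get_mirror_count)(struct req *req);
};

static int mirror_count(const struct pgio_ops *ops, struct req *req)
{
        if (ops && ops->get_mirror_count)
                return ops->get_mirror_count(req);
        return 1;       /* default: unmirrored I/O */
}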
@@ -263,6 +316,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
263 INIT_LIST_HEAD(&dreq->mds_cinfo.list); 316 INIT_LIST_HEAD(&dreq->mds_cinfo.list);
264 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ 317 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
265 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); 318 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
319 dreq->mirror_count = 1;
266 spin_lock_init(&dreq->lock); 320 spin_lock_init(&dreq->lock);
267 321
268 return dreq; 322 return dreq;
@@ -369,7 +423,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
369 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) 423 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
370 dreq->error = hdr->error; 424 dreq->error = hdr->error;
371 else 425 else
372 dreq->count += hdr->good_bytes; 426 nfs_direct_good_bytes(dreq, hdr);
427
373 spin_unlock(&dreq->lock); 428 spin_unlock(&dreq->lock);
374 429
375 while (!list_empty(&hdr->pages)) { 430 while (!list_empty(&hdr->pages)) {
@@ -547,6 +602,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
547 602
548 dreq->inode = inode; 603 dreq->inode = inode;
549 dreq->bytes_left = count; 604 dreq->bytes_left = count;
605 dreq->io_start = pos;
550 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 606 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
551 l_ctx = nfs_get_lock_context(dreq->ctx); 607 l_ctx = nfs_get_lock_context(dreq->ctx);
552 if (IS_ERR(l_ctx)) { 608 if (IS_ERR(l_ctx)) {
@@ -579,6 +635,20 @@ out:
579 return result; 635 return result;
580} 636}
581 637
638static void
639nfs_direct_write_scan_commit_list(struct inode *inode,
640 struct list_head *list,
641 struct nfs_commit_info *cinfo)
642{
643 spin_lock(cinfo->lock);
644#ifdef CONFIG_NFS_V4_1
645 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
646 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
647#endif
648 nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
649 spin_unlock(cinfo->lock);
650}
651
582static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 652static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
583{ 653{
584 struct nfs_pageio_descriptor desc; 654 struct nfs_pageio_descriptor desc;
@@ -586,20 +656,23 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
586 LIST_HEAD(reqs); 656 LIST_HEAD(reqs);
587 struct nfs_commit_info cinfo; 657 struct nfs_commit_info cinfo;
588 LIST_HEAD(failed); 658 LIST_HEAD(failed);
659 int i;
589 660
590 nfs_init_cinfo_from_dreq(&cinfo, dreq); 661 nfs_init_cinfo_from_dreq(&cinfo, dreq);
591 pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); 662 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
592 spin_lock(cinfo.lock);
593 nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
594 spin_unlock(cinfo.lock);
595 663
596 dreq->count = 0; 664 dreq->count = 0;
665 for (i = 0; i < dreq->mirror_count; i++)
666 dreq->mirrors[i].count = 0;
597 get_dreq(dreq); 667 get_dreq(dreq);
598 668
599 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, 669 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
600 &nfs_direct_write_completion_ops); 670 &nfs_direct_write_completion_ops);
601 desc.pg_dreq = dreq; 671 desc.pg_dreq = dreq;
602 672
673 req = nfs_list_entry(reqs.next);
674 nfs_direct_setup_mirroring(dreq, &desc, req);
675
603 list_for_each_entry_safe(req, tmp, &reqs, wb_list) { 676 list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
604 if (!nfs_pageio_add_request(&desc, req)) { 677 if (!nfs_pageio_add_request(&desc, req)) {
605 nfs_list_remove_request(req); 678 nfs_list_remove_request(req);
@@ -646,7 +719,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
646 nfs_list_remove_request(req); 719 nfs_list_remove_request(req);
647 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { 720 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
648 /* Note the rewrite will go through mds */ 721 /* Note the rewrite will go through mds */
649 nfs_mark_request_commit(req, NULL, &cinfo); 722 nfs_mark_request_commit(req, NULL, &cinfo, 0);
650 } else 723 } else
651 nfs_release_request(req); 724 nfs_release_request(req);
652 nfs_unlock_and_release_request(req); 725 nfs_unlock_and_release_request(req);
@@ -721,7 +794,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
721 dreq->error = hdr->error; 794 dreq->error = hdr->error;
722 } 795 }
723 if (dreq->error == 0) { 796 if (dreq->error == 0) {
724 dreq->count += hdr->good_bytes; 797 nfs_direct_good_bytes(dreq, hdr);
725 if (nfs_write_need_commit(hdr)) { 798 if (nfs_write_need_commit(hdr)) {
726 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) 799 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
727 request_commit = true; 800 request_commit = true;
@@ -745,7 +818,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
745 nfs_list_remove_request(req); 818 nfs_list_remove_request(req);
746 if (request_commit) { 819 if (request_commit) {
747 kref_get(&req->wb_kref); 820 kref_get(&req->wb_kref);
748 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 821 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
822 hdr->ds_commit_idx);
749 } 823 }
750 nfs_unlock_and_release_request(req); 824 nfs_unlock_and_release_request(req);
751 } 825 }
@@ -826,6 +900,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
826 result = PTR_ERR(req); 900 result = PTR_ERR(req);
827 break; 901 break;
828 } 902 }
903
904 nfs_direct_setup_mirroring(dreq, &desc, req);
905
829 nfs_lock_request(req); 906 nfs_lock_request(req);
830 req->wb_index = pos >> PAGE_SHIFT; 907 req->wb_index = pos >> PAGE_SHIFT;
831 req->wb_offset = pos & ~PAGE_MASK; 908 req->wb_offset = pos & ~PAGE_MASK;
@@ -934,6 +1011,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
934 1011
935 dreq->inode = inode; 1012 dreq->inode = inode;
936 dreq->bytes_left = count; 1013 dreq->bytes_left = count;
1014 dreq->io_start = pos;
937 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1015 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
938 l_ctx = nfs_get_lock_context(dreq->ctx); 1016 l_ctx = nfs_get_lock_context(dreq->ctx);
939 if (IS_ERR(l_ctx)) { 1017 if (IS_ERR(l_ctx)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2ab6f00dba5b..94712fc781fa 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
646 .fault = filemap_fault, 646 .fault = filemap_fault,
647 .map_pages = filemap_map_pages, 647 .map_pages = filemap_map_pages,
648 .page_mkwrite = nfs_vm_page_mkwrite, 648 .page_mkwrite = nfs_vm_page_mkwrite,
649 .remap_pages = generic_file_remap_pages,
650}; 649};
651 650
652static int nfs_need_sync_write(struct file *filp, struct inode *inode) 651static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7afb52f6a25a..7ae1c263c5cf 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -118,13 +118,6 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
118 } 118 }
119} 119}
120 120
121static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
122{
123 if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
124 return;
125 pnfs_return_layout(inode);
126}
127
128static int filelayout_async_handle_error(struct rpc_task *task, 121static int filelayout_async_handle_error(struct rpc_task *task,
129 struct nfs4_state *state, 122 struct nfs4_state *state,
130 struct nfs_client *clp, 123 struct nfs_client *clp,
@@ -207,7 +200,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
207 dprintk("%s DS connection error %d\n", __func__, 200 dprintk("%s DS connection error %d\n", __func__,
208 task->tk_status); 201 task->tk_status);
209 nfs4_mark_deviceid_unavailable(devid); 202 nfs4_mark_deviceid_unavailable(devid);
210 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 203 pnfs_error_mark_layout_for_return(inode, lseg);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 204 rpc_wake_up(&tbl->slot_tbl_waitq);
212 /* fall through */ 205 /* fall through */
213 default: 206 default:
@@ -339,16 +332,6 @@ static void filelayout_read_count_stats(struct rpc_task *task, void *data)
339 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 332 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
340} 333}
341 334
342static void filelayout_read_release(void *data)
343{
344 struct nfs_pgio_header *hdr = data;
345 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
346
347 filelayout_fenceme(lo->plh_inode, lo);
348 nfs_put_client(hdr->ds_clp);
349 hdr->mds_ops->rpc_release(data);
350}
351
352static int filelayout_write_done_cb(struct rpc_task *task, 335static int filelayout_write_done_cb(struct rpc_task *task,
353 struct nfs_pgio_header *hdr) 336 struct nfs_pgio_header *hdr)
354{ 337{
@@ -371,17 +354,6 @@ static int filelayout_write_done_cb(struct rpc_task *task,
371 return 0; 354 return 0;
372} 355}
373 356
374/* Fake up some data that will cause nfs_commit_release to retry the writes. */
375static void prepare_to_resend_writes(struct nfs_commit_data *data)
376{
377 struct nfs_page *first = nfs_list_entry(data->pages.next);
378
379 data->task.tk_status = 0;
380 memcpy(&data->verf.verifier, &first->wb_verf,
381 sizeof(data->verf.verifier));
382 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
383}
384
385static int filelayout_commit_done_cb(struct rpc_task *task, 357static int filelayout_commit_done_cb(struct rpc_task *task,
386 struct nfs_commit_data *data) 358 struct nfs_commit_data *data)
387{ 359{
@@ -393,7 +365,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
393 365
394 switch (err) { 366 switch (err) {
395 case -NFS4ERR_RESET_TO_MDS: 367 case -NFS4ERR_RESET_TO_MDS:
396 prepare_to_resend_writes(data); 368 pnfs_generic_prepare_to_resend_writes(data);
397 return -EAGAIN; 369 return -EAGAIN;
398 case -EAGAIN: 370 case -EAGAIN:
399 rpc_restart_call_prepare(task); 371 rpc_restart_call_prepare(task);
@@ -451,16 +423,6 @@ static void filelayout_write_count_stats(struct rpc_task *task, void *data)
451 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 423 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
452} 424}
453 425
454static void filelayout_write_release(void *data)
455{
456 struct nfs_pgio_header *hdr = data;
457 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
458
459 filelayout_fenceme(lo->plh_inode, lo);
460 nfs_put_client(hdr->ds_clp);
461 hdr->mds_ops->rpc_release(data);
462}
463
464static void filelayout_commit_prepare(struct rpc_task *task, void *data) 426static void filelayout_commit_prepare(struct rpc_task *task, void *data)
465{ 427{
466 struct nfs_commit_data *wdata = data; 428 struct nfs_commit_data *wdata = data;
@@ -471,14 +433,6 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
471 task); 433 task);
472} 434}
473 435
474static void filelayout_write_commit_done(struct rpc_task *task, void *data)
475{
476 struct nfs_commit_data *wdata = data;
477
478 /* Note this may cause RPC to be resent */
479 wdata->mds_ops->rpc_call_done(task, data);
480}
481
482static void filelayout_commit_count_stats(struct rpc_task *task, void *data) 436static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
483{ 437{
484 struct nfs_commit_data *cdata = data; 438 struct nfs_commit_data *cdata = data;
@@ -486,35 +440,25 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
486 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); 440 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
487} 441}
488 442
489static void filelayout_commit_release(void *calldata)
490{
491 struct nfs_commit_data *data = calldata;
492
493 data->completion_ops->completion(data);
494 pnfs_put_lseg(data->lseg);
495 nfs_put_client(data->ds_clp);
496 nfs_commitdata_release(data);
497}
498
499static const struct rpc_call_ops filelayout_read_call_ops = { 443static const struct rpc_call_ops filelayout_read_call_ops = {
500 .rpc_call_prepare = filelayout_read_prepare, 444 .rpc_call_prepare = filelayout_read_prepare,
501 .rpc_call_done = filelayout_read_call_done, 445 .rpc_call_done = filelayout_read_call_done,
502 .rpc_count_stats = filelayout_read_count_stats, 446 .rpc_count_stats = filelayout_read_count_stats,
503 .rpc_release = filelayout_read_release, 447 .rpc_release = pnfs_generic_rw_release,
504}; 448};
505 449
506static const struct rpc_call_ops filelayout_write_call_ops = { 450static const struct rpc_call_ops filelayout_write_call_ops = {
507 .rpc_call_prepare = filelayout_write_prepare, 451 .rpc_call_prepare = filelayout_write_prepare,
508 .rpc_call_done = filelayout_write_call_done, 452 .rpc_call_done = filelayout_write_call_done,
509 .rpc_count_stats = filelayout_write_count_stats, 453 .rpc_count_stats = filelayout_write_count_stats,
510 .rpc_release = filelayout_write_release, 454 .rpc_release = pnfs_generic_rw_release,
511}; 455};
512 456
513static const struct rpc_call_ops filelayout_commit_call_ops = { 457static const struct rpc_call_ops filelayout_commit_call_ops = {
514 .rpc_call_prepare = filelayout_commit_prepare, 458 .rpc_call_prepare = filelayout_commit_prepare,
515 .rpc_call_done = filelayout_write_commit_done, 459 .rpc_call_done = pnfs_generic_write_commit_done,
516 .rpc_count_stats = filelayout_commit_count_stats, 460 .rpc_count_stats = filelayout_commit_count_stats,
517 .rpc_release = filelayout_commit_release, 461 .rpc_release = pnfs_generic_commit_release,
518}; 462};
519 463
520static enum pnfs_try_status 464static enum pnfs_try_status
@@ -548,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
548 /* No multipath support. Use first DS */ 492 /* No multipath support. Use first DS */
549 atomic_inc(&ds->ds_clp->cl_count); 493 atomic_inc(&ds->ds_clp->cl_count);
550 hdr->ds_clp = ds->ds_clp; 494 hdr->ds_clp = ds->ds_clp;
551 hdr->ds_idx = idx; 495 hdr->ds_commit_idx = idx;
552 fh = nfs4_fl_select_ds_fh(lseg, j); 496 fh = nfs4_fl_select_ds_fh(lseg, j);
553 if (fh) 497 if (fh)
554 hdr->args.fh = fh; 498 hdr->args.fh = fh;
@@ -557,8 +501,9 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
557 hdr->mds_offset = offset; 501 hdr->mds_offset = offset;
558 502
559 /* Perform an asynchronous read to ds */ 503 /* Perform an asynchronous read to ds */
560 nfs_initiate_pgio(ds_clnt, hdr, 504 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
561 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 505 NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
506 0, RPC_TASK_SOFTCONN);
562 return PNFS_ATTEMPTED; 507 return PNFS_ATTEMPTED;
563} 508}
564 509
@@ -591,16 +536,16 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
591 hdr->pgio_done_cb = filelayout_write_done_cb; 536 hdr->pgio_done_cb = filelayout_write_done_cb;
592 atomic_inc(&ds->ds_clp->cl_count); 537 atomic_inc(&ds->ds_clp->cl_count);
593 hdr->ds_clp = ds->ds_clp; 538 hdr->ds_clp = ds->ds_clp;
594 hdr->ds_idx = idx; 539 hdr->ds_commit_idx = idx;
595 fh = nfs4_fl_select_ds_fh(lseg, j); 540 fh = nfs4_fl_select_ds_fh(lseg, j);
596 if (fh) 541 if (fh)
597 hdr->args.fh = fh; 542 hdr->args.fh = fh;
598 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); 543 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
599 544
600 /* Perform an asynchronous write */ 545 /* Perform an asynchronous write */
601 nfs_initiate_pgio(ds_clnt, hdr, 546 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
602 &filelayout_write_call_ops, sync, 547 NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
603 RPC_TASK_SOFTCONN); 548 sync, RPC_TASK_SOFTCONN);
604 return PNFS_ATTEMPTED; 549 return PNFS_ATTEMPTED;
605} 550}
606 551
@@ -988,12 +933,14 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = {
988 .pg_init = filelayout_pg_init_read, 933 .pg_init = filelayout_pg_init_read,
989 .pg_test = filelayout_pg_test, 934 .pg_test = filelayout_pg_test,
990 .pg_doio = pnfs_generic_pg_readpages, 935 .pg_doio = pnfs_generic_pg_readpages,
936 .pg_cleanup = pnfs_generic_pg_cleanup,
991}; 937};
992 938
993static const struct nfs_pageio_ops filelayout_pg_write_ops = { 939static const struct nfs_pageio_ops filelayout_pg_write_ops = {
994 .pg_init = filelayout_pg_init_write, 940 .pg_init = filelayout_pg_init_write,
995 .pg_test = filelayout_pg_test, 941 .pg_test = filelayout_pg_test,
996 .pg_doio = pnfs_generic_pg_writepages, 942 .pg_doio = pnfs_generic_pg_writepages,
943 .pg_cleanup = pnfs_generic_pg_cleanup,
997}; 944};
998 945
999static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) 946static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
@@ -1004,37 +951,11 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1004 return j; 951 return j;
1005} 952}
1006 953
1007/* The generic layer is about to remove the req from the commit list.
1008 * If this will make the bucket empty, it will need to put the lseg reference.
1009 * Note this must be called holding the inode (/cinfo) lock
1010 */
1011static void
1012filelayout_clear_request_commit(struct nfs_page *req,
1013 struct nfs_commit_info *cinfo)
1014{
1015 struct pnfs_layout_segment *freeme = NULL;
1016
1017 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1018 goto out;
1019 cinfo->ds->nwritten--;
1020 if (list_is_singular(&req->wb_list)) {
1021 struct pnfs_commit_bucket *bucket;
1022
1023 bucket = list_first_entry(&req->wb_list,
1024 struct pnfs_commit_bucket,
1025 written);
1026 freeme = bucket->wlseg;
1027 bucket->wlseg = NULL;
1028 }
1029out:
1030 nfs_request_remove_commit_list(req, cinfo);
1031 pnfs_put_lseg_locked(freeme);
1032}
1033
1034static void 954static void
1035filelayout_mark_request_commit(struct nfs_page *req, 955filelayout_mark_request_commit(struct nfs_page *req,
1036 struct pnfs_layout_segment *lseg, 956 struct pnfs_layout_segment *lseg,
1037 struct nfs_commit_info *cinfo) 957 struct nfs_commit_info *cinfo,
958 u32 ds_commit_idx)
1038 959
1039{ 960{
1040 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 961 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
@@ -1064,7 +985,7 @@ filelayout_mark_request_commit(struct nfs_page *req,
1064 * is normally transferred to the COMMIT call and released 985 * is normally transferred to the COMMIT call and released
1065 * there. It could also be released if the last req is pulled 986 * there. It could also be released if the last req is pulled
1066 * off due to a rewrite, in which case it will be done in 987 * off due to a rewrite, in which case it will be done in
1067 * filelayout_clear_request_commit 988 * pnfs_generic_clear_request_commit
1068 */ 989 */
1069 buckets[i].wlseg = pnfs_get_lseg(lseg); 990 buckets[i].wlseg = pnfs_get_lseg(lseg);
1070 } 991 }
@@ -1081,7 +1002,7 @@ mds_commit:
1081 spin_unlock(cinfo->lock); 1002 spin_unlock(cinfo->lock);
1082 if (!cinfo->dreq) { 1003 if (!cinfo->dreq) {
1083 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1004 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1084 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, 1005 inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
1085 BDI_RECLAIMABLE); 1006 BDI_RECLAIMABLE);
1086 __mark_inode_dirty(req->wb_context->dentry->d_inode, 1007 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1087 I_DIRTY_DATASYNC); 1008 I_DIRTY_DATASYNC);
@@ -1138,101 +1059,15 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1138 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); 1059 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1139 if (fh) 1060 if (fh)
1140 data->args.fh = fh; 1061 data->args.fh = fh;
1141 return nfs_initiate_commit(ds_clnt, data, 1062 return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
1142 &filelayout_commit_call_ops, how, 1063 &filelayout_commit_call_ops, how,
1143 RPC_TASK_SOFTCONN); 1064 RPC_TASK_SOFTCONN);
1144out_err: 1065out_err:
1145 prepare_to_resend_writes(data); 1066 pnfs_generic_prepare_to_resend_writes(data);
1146 filelayout_commit_release(data); 1067 pnfs_generic_commit_release(data);
1147 return -EAGAIN; 1068 return -EAGAIN;
1148} 1069}
1149 1070
1150static int
1151transfer_commit_list(struct list_head *src, struct list_head *dst,
1152 struct nfs_commit_info *cinfo, int max)
1153{
1154 struct nfs_page *req, *tmp;
1155 int ret = 0;
1156
1157 list_for_each_entry_safe(req, tmp, src, wb_list) {
1158 if (!nfs_lock_request(req))
1159 continue;
1160 kref_get(&req->wb_kref);
1161 if (cond_resched_lock(cinfo->lock))
1162 list_safe_reset_next(req, tmp, wb_list);
1163 nfs_request_remove_commit_list(req, cinfo);
1164 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1165 nfs_list_add_request(req, dst);
1166 ret++;
1167 if ((ret == max) && !cinfo->dreq)
1168 break;
1169 }
1170 return ret;
1171}
1172
1173/* Note called with cinfo->lock held. */
1174static int
1175filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1176 struct nfs_commit_info *cinfo,
1177 int max)
1178{
1179 struct list_head *src = &bucket->written;
1180 struct list_head *dst = &bucket->committing;
1181 int ret;
1182
1183 ret = transfer_commit_list(src, dst, cinfo, max);
1184 if (ret) {
1185 cinfo->ds->nwritten -= ret;
1186 cinfo->ds->ncommitting += ret;
1187 bucket->clseg = bucket->wlseg;
1188 if (list_empty(src))
1189 bucket->wlseg = NULL;
1190 else
1191 pnfs_get_lseg(bucket->clseg);
1192 }
1193 return ret;
1194}
1195
1196/* Move reqs from written to committing lists, returning the number moved.
1197 * Note called with cinfo->lock held.
1198 */
1199static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
1200 int max)
1201{
1202 int i, rv = 0, cnt;
1203
1204 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
1205 cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
1206 cinfo, max);
1207 max -= cnt;
1208 rv += cnt;
1209 }
1210 return rv;
1211}
1212
1213/* Pull everything off the committing lists and dump into @dst */
1214static void filelayout_recover_commit_reqs(struct list_head *dst,
1215 struct nfs_commit_info *cinfo)
1216{
1217 struct pnfs_commit_bucket *b;
1218 struct pnfs_layout_segment *freeme;
1219 int i;
1220
1221restart:
1222 spin_lock(cinfo->lock);
1223 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1224 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1225 freeme = b->wlseg;
1226 b->wlseg = NULL;
1227 spin_unlock(cinfo->lock);
1228 pnfs_put_lseg(freeme);
1229 goto restart;
1230 }
1231 }
1232 cinfo->ds->nwritten = 0;
1233 spin_unlock(cinfo->lock);
1234}
1235
1236/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request 1071
1237 * for @page 1072 * for @page
1238 * @cinfo - commit info for current inode 1073 * @cinfo - commit info for current inode
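Most of the commit-list machinery deleted above reappears in common pNFS code as the pnfs_generic_* helpers this patch wires in. Its core move, transfer_commit_list, shifts up to max unlocked requests from a bucket's written list to its committing list while holding cinfo->lock. A userspace sketch of that bounded, locked transfer (simplified: the kernel version additionally drops the lock periodically via cond_resched_lock() and revalidates its cursor with list_safe_reset_next()):

#include <pthread.h>
#include <stddef.h>

struct req { struct req *next; int locked; };

/* Move up to max unlocked reqs from *src to *dst; return the count. */
static int transfer(struct req **src, struct req **dst,
                    pthread_mutex_t *lock, int max)
{
        struct req **pp, *r;
        int moved = 0;

        pthread_mutex_lock(lock);
        pp = src;
        while ((r = *pp) != NULL && moved < max) {
                if (r->locked) {        /* skip reqs someone else holds */
                        pp = &r->next;
                        continue;
                }
                *pp = r->next;          /* unlink from the written list */
                r->next = *dst;         /* push onto the committing list */
                *dst = r;
                moved++;
        }
        pthread_mutex_unlock(lock);
        return moved;
}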
@@ -1263,108 +1098,14 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1263 return NULL; 1098 return NULL;
1264} 1099}
1265 1100
1266static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1267{
1268 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1269 struct pnfs_commit_bucket *bucket;
1270 struct pnfs_layout_segment *freeme;
1271 int i;
1272
1273 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1274 bucket = &fl_cinfo->buckets[i];
1275 if (list_empty(&bucket->committing))
1276 continue;
1277 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1278 spin_lock(cinfo->lock);
1279 freeme = bucket->clseg;
1280 bucket->clseg = NULL;
1281 spin_unlock(cinfo->lock);
1282 pnfs_put_lseg(freeme);
1283 }
1284}
1285
1286static unsigned int
1287alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1288{
1289 struct pnfs_ds_commit_info *fl_cinfo;
1290 struct pnfs_commit_bucket *bucket;
1291 struct nfs_commit_data *data;
1292 int i;
1293 unsigned int nreq = 0;
1294
1295 fl_cinfo = cinfo->ds;
1296 bucket = fl_cinfo->buckets;
1297 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
1298 if (list_empty(&bucket->committing))
1299 continue;
1300 data = nfs_commitdata_alloc();
1301 if (!data)
1302 break;
1303 data->ds_commit_index = i;
1304 spin_lock(cinfo->lock);
1305 data->lseg = bucket->clseg;
1306 bucket->clseg = NULL;
1307 spin_unlock(cinfo->lock);
1308 list_add(&data->pages, list);
1309 nreq++;
1310 }
1311
1312 /* Clean up on error */
1313 filelayout_retry_commit(cinfo, i);
1314 /* Caller will clean up entries put on list */
1315 return nreq;
1316}
1317
1318/* This follows nfs_commit_list pretty closely */
1319static int 1101static int
1320filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, 1102filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1321 int how, struct nfs_commit_info *cinfo) 1103 int how, struct nfs_commit_info *cinfo)
1322{ 1104{
1323 struct nfs_commit_data *data, *tmp; 1105 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1324 LIST_HEAD(list); 1106 filelayout_initiate_commit);
1325 unsigned int nreq = 0;
1326
1327 if (!list_empty(mds_pages)) {
1328 data = nfs_commitdata_alloc();
1329 if (data != NULL) {
1330 data->lseg = NULL;
1331 list_add(&data->pages, &list);
1332 nreq++;
1333 } else {
1334 nfs_retry_commit(mds_pages, NULL, cinfo);
1335 filelayout_retry_commit(cinfo, 0);
1336 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1337 return -ENOMEM;
1338 }
1339 }
1340
1341 nreq += alloc_ds_commits(cinfo, &list);
1342
1343 if (nreq == 0) {
1344 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1345 goto out;
1346 }
1347
1348 atomic_add(nreq, &cinfo->mds->rpcs_out);
1349
1350 list_for_each_entry_safe(data, tmp, &list, pages) {
1351 list_del_init(&data->pages);
1352 if (!data->lseg) {
1353 nfs_init_commit(data, mds_pages, NULL, cinfo);
1354 nfs_initiate_commit(NFS_CLIENT(inode), data,
1355 data->mds_ops, how, 0);
1356 } else {
1357 struct pnfs_commit_bucket *buckets;
1358
1359 buckets = cinfo->ds->buckets;
1360 nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
1361 filelayout_initiate_commit(data, how);
1362 }
1363 }
1364out:
1365 cinfo->ds->ncommitting = 0;
1366 return PNFS_ATTEMPTED;
1367} 1107}
1108
1368static struct nfs4_deviceid_node * 1109static struct nfs4_deviceid_node *
1369filelayout_alloc_deviceid_node(struct nfs_server *server, 1110filelayout_alloc_deviceid_node(struct nfs_server *server,
1370 struct pnfs_device *pdev, gfp_t gfp_flags) 1111 struct pnfs_device *pdev, gfp_t gfp_flags)
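After the consolidation above, filelayout_commit_pagelist is a one-line wrapper: pnfs_generic_commit_pagelist owns allocation, bucket iteration, and error cleanup, and calls back through a function pointer for the one layout-specific step, issuing the commit RPC. That is the usual template-method shape in C; a hedged sketch with invented names:

struct commit_data { int bucket; };

typedef int (*initiate_fn)(struct commit_data *data, int how);

/* Generic driver: iterates the buckets, delegates the RPC step. */
static int generic_commit_pagelist(struct commit_data *bufs, int n,
                                   int how, initiate_fn initiate)
{
        int i, err;

        for (i = 0; i < n; i++) {
                err = initiate(&bufs[i], how);  /* layout-specific */
                if (err)
                        return err;
        }
        return 0;
}

/* Layout-specific step, analogous to filelayout_initiate_commit(). */
static int my_initiate_commit(struct commit_data *data, int how)
{
        (void)data; (void)how;
        return 0;
}

/* usage: generic_commit_pagelist(bufs, n, how, my_initiate_commit); */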
@@ -1421,9 +1162,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1421 .pg_write_ops = &filelayout_pg_write_ops, 1162 .pg_write_ops = &filelayout_pg_write_ops,
1422 .get_ds_info = &filelayout_get_ds_info, 1163 .get_ds_info = &filelayout_get_ds_info,
1423 .mark_request_commit = filelayout_mark_request_commit, 1164 .mark_request_commit = filelayout_mark_request_commit,
1424 .clear_request_commit = filelayout_clear_request_commit, 1165 .clear_request_commit = pnfs_generic_clear_request_commit,
1425 .scan_commit_lists = filelayout_scan_commit_lists, 1166 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1426 .recover_commit_reqs = filelayout_recover_commit_reqs, 1167 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1427 .search_commit_reqs = filelayout_search_commit_reqs, 1168 .search_commit_reqs = filelayout_search_commit_reqs,
1428 .commit_pagelist = filelayout_commit_pagelist, 1169 .commit_pagelist = filelayout_commit_pagelist,
1429 .read_pagelist = filelayout_read_pagelist, 1170 .read_pagelist = filelayout_read_pagelist,
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index 7c9f800c49d7..2896cb833a11 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -33,13 +33,6 @@
33#include "../pnfs.h" 33#include "../pnfs.h"
34 34
35/* 35/*
36 * Default data server connection timeout and retrans values.
37 * Set by module parameters dataserver_timeo and dataserver_retrans.
38 */
39#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
40#define NFS4_DEF_DS_RETRANS 5
41
42/*
43 * Field testing shows we need to support up to 4096 stripe indices. 36 * Field testing shows we need to support up to 4096 stripe indices.
44 * We store each index as a u8 (u32 on the wire) to keep the memory footprint 37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
45 * reasonable. This in turn means we support a maximum of 256 38 * reasonable. This in turn means we support a maximum of 256
@@ -48,32 +41,11 @@
48#define NFS4_PNFS_MAX_STRIPE_CNT 4096 41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
49#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ 42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
50 43
51/* error codes for internal use */
52#define NFS4ERR_RESET_TO_MDS 12001
53
54enum stripetype4 { 44enum stripetype4 {
55 STRIPE_SPARSE = 1, 45 STRIPE_SPARSE = 1,
56 STRIPE_DENSE = 2 46 STRIPE_DENSE = 2
57}; 47};
58 48
59/* Individual ip address */
60struct nfs4_pnfs_ds_addr {
61 struct sockaddr_storage da_addr;
62 size_t da_addrlen;
63 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
64 char *da_remotestr; /* human readable addr+port */
65};
66
67struct nfs4_pnfs_ds {
68 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
69 char *ds_remotestr; /* comma sep list of addrs */
70 struct list_head ds_addrs;
71 struct nfs_client *ds_clp;
72 atomic_t ds_count;
73 unsigned long ds_state;
74#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
75};
76
77struct nfs4_file_layout_dsaddr { 49struct nfs4_file_layout_dsaddr {
78 struct nfs4_deviceid_node id_node; 50 struct nfs4_deviceid_node id_node;
79 u32 stripe_count; 51 u32 stripe_count;
@@ -119,17 +91,6 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
119 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; 91 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
120} 92}
121 93
122static inline void
123filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
124{
125 u32 *p = (u32 *)&node->deviceid;
126
127 printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
128 p[0], p[1], p[2], p[3]);
129
130 set_bit(NFS_DEVICEID_INVALID, &node->flags);
131}
132
133static inline bool 94static inline bool
134filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 95filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
135{ 96{
@@ -142,7 +103,6 @@ filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
142extern struct nfs_fh * 103extern struct nfs_fh *
143nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 104nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
144 105
145extern void print_ds(struct nfs4_pnfs_ds *ds);
146u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 106u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 107u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 108struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index bfecac781f19..4f372e224603 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -31,7 +31,6 @@
31#include <linux/nfs_fs.h> 31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/sunrpc/addr.h>
35 34
36#include "../internal.h" 35#include "../internal.h"
37#include "../nfs4session.h" 36#include "../nfs4session.h"
@@ -42,183 +41,6 @@
42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; 41static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; 42static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
44 43
45/*
46 * Data server cache
47 *
48 * Data servers can be mapped to different device ids.
49 * nfs4_pnfs_ds reference counting
50 * - set to 1 on allocation
51 * - incremented when a device id maps a data server already in the cache.
52 * - decremented when deviceid is removed from the cache.
53 */
54static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55static LIST_HEAD(nfs4_data_server_cache);
56
57/* Debug routines */
58void
59print_ds(struct nfs4_pnfs_ds *ds)
60{
61 if (ds == NULL) {
62 printk("%s NULL device\n", __func__);
63 return;
64 }
65 printk(" ds %s\n"
66 " ref count %d\n"
67 " client %p\n"
68 " cl_exchange_flags %x\n",
69 ds->ds_remotestr,
70 atomic_read(&ds->ds_count), ds->ds_clp,
71 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
72}
73
74static bool
75same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76{
77 struct sockaddr_in *a, *b;
78 struct sockaddr_in6 *a6, *b6;
79
80 if (addr1->sa_family != addr2->sa_family)
81 return false;
82
83 switch (addr1->sa_family) {
84 case AF_INET:
85 a = (struct sockaddr_in *)addr1;
86 b = (struct sockaddr_in *)addr2;
87
88 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
89 a->sin_port == b->sin_port)
90 return true;
91 break;
92
93 case AF_INET6:
94 a6 = (struct sockaddr_in6 *)addr1;
95 b6 = (struct sockaddr_in6 *)addr2;
96
97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false;
102
103 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
104 a6->sin6_port == b6->sin6_port)
105 return true;
106 break;
107
108 default:
109 dprintk("%s: unhandled address family: %u\n",
110 __func__, addr1->sa_family);
111 return false;
112 }
113
114 return false;
115}
116
117static bool
118_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
119 const struct list_head *dsaddrs2)
120{
121 struct nfs4_pnfs_ds_addr *da1, *da2;
122
123 /* step through both lists, comparing as we go */
124 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
125 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
126 da1 != NULL && da2 != NULL;
127 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
128 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
129 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
130 (struct sockaddr *)&da2->da_addr))
131 return false;
132 }
133 if (da1 == NULL && da2 == NULL)
134 return true;
135
136 return false;
137}
138
139/*
140 * Lookup DS by addresses. nfs4_ds_cache_lock is held
141 */
142static struct nfs4_pnfs_ds *
143_data_server_lookup_locked(const struct list_head *dsaddrs)
144{
145 struct nfs4_pnfs_ds *ds;
146
147 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
148 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
149 return ds;
150 return NULL;
151}
152
153/*
154 * Create an rpc connection to the nfs4_pnfs_ds data server
155 * Currently only supports IPv4 and IPv6 addresses
156 */
157static int
158nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
159{
160 struct nfs_client *clp = ERR_PTR(-EIO);
161 struct nfs4_pnfs_ds_addr *da;
162 int status = 0;
163
164 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
165 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
166
167 list_for_each_entry(da, &ds->ds_addrs, da_node) {
168 dprintk("%s: DS %s: trying address %s\n",
169 __func__, ds->ds_remotestr, da->da_remotestr);
170
171 clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 (struct sockaddr *)&da->da_addr,
173 da->da_addrlen, IPPROTO_TCP,
174 dataserver_timeo, dataserver_retrans);
175 if (!IS_ERR(clp))
176 break;
177 }
178
179 if (IS_ERR(clp)) {
180 status = PTR_ERR(clp);
181 goto out;
182 }
183
184 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
185 if (status)
186 goto out_put;
187
188 smp_wmb();
189 ds->ds_clp = clp;
190 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
191out:
192 return status;
193out_put:
194 nfs_put_client(clp);
195 goto out;
196}
197
198static void
199destroy_ds(struct nfs4_pnfs_ds *ds)
200{
201 struct nfs4_pnfs_ds_addr *da;
202
203 dprintk("--> %s\n", __func__);
204 ifdebug(FACILITY)
205 print_ds(ds);
206
207 nfs_put_client(ds->ds_clp);
208
209 while (!list_empty(&ds->ds_addrs)) {
210 da = list_first_entry(&ds->ds_addrs,
211 struct nfs4_pnfs_ds_addr,
212 da_node);
213 list_del_init(&da->da_node);
214 kfree(da->da_remotestr);
215 kfree(da);
216 }
217
218 kfree(ds->ds_remotestr);
219 kfree(ds);
220}
221
222void 44void
223nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 45nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
224{ 46{
@@ -229,259 +51,13 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
229 51
230 for (i = 0; i < dsaddr->ds_num; i++) { 52 for (i = 0; i < dsaddr->ds_num; i++) {
231 ds = dsaddr->ds_list[i]; 53 ds = dsaddr->ds_list[i];
232 if (ds != NULL) { 54 if (ds != NULL)
233 if (atomic_dec_and_lock(&ds->ds_count, 55 nfs4_pnfs_ds_put(ds);
234 &nfs4_ds_cache_lock)) {
235 list_del_init(&ds->ds_node);
236 spin_unlock(&nfs4_ds_cache_lock);
237 destroy_ds(ds);
238 }
239 }
240 } 56 }
241 kfree(dsaddr->stripe_indices); 57 kfree(dsaddr->stripe_indices);
242 kfree(dsaddr); 58 kfree(dsaddr);
243} 59}
244 60
245/*
246 * Create a string with a human readable address and port to avoid
247 * complicated setup around many dprintks.
248 */
249static char *
250nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
251{
252 struct nfs4_pnfs_ds_addr *da;
253 char *remotestr;
254 size_t len;
255 char *p;
256
257 len = 3; /* '{', '}' and eol */
258 list_for_each_entry(da, dsaddrs, da_node) {
259 len += strlen(da->da_remotestr) + 1; /* string plus comma */
260 }
261
262 remotestr = kzalloc(len, gfp_flags);
263 if (!remotestr)
264 return NULL;
265
266 p = remotestr;
267 *(p++) = '{';
268 len--;
269 list_for_each_entry(da, dsaddrs, da_node) {
270 size_t ll = strlen(da->da_remotestr);
271
272 if (ll > len)
273 goto out_err;
274
275 memcpy(p, da->da_remotestr, ll);
276 p += ll;
277 len -= ll;
278
279 if (len < 1)
280 goto out_err;
281 (*p++) = ',';
282 len--;
283 }
284 if (len < 2)
285 goto out_err;
286 *(p++) = '}';
287 *p = '\0';
288 return remotestr;
289out_err:
290 kfree(remotestr);
291 return NULL;
292}
293
294static struct nfs4_pnfs_ds *
295nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
296{
297 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
298 char *remotestr;
299
300 if (list_empty(dsaddrs)) {
301 dprintk("%s: no addresses defined\n", __func__);
302 goto out;
303 }
304
305 ds = kzalloc(sizeof(*ds), gfp_flags);
306 if (!ds)
307 goto out;
308
309 /* this is only used for debugging, so it's ok if it's NULL */
310 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
311
312 spin_lock(&nfs4_ds_cache_lock);
313 tmp_ds = _data_server_lookup_locked(dsaddrs);
314 if (tmp_ds == NULL) {
315 INIT_LIST_HEAD(&ds->ds_addrs);
316 list_splice_init(dsaddrs, &ds->ds_addrs);
317 ds->ds_remotestr = remotestr;
318 atomic_set(&ds->ds_count, 1);
319 INIT_LIST_HEAD(&ds->ds_node);
320 ds->ds_clp = NULL;
321 list_add(&ds->ds_node, &nfs4_data_server_cache);
322 dprintk("%s add new data server %s\n", __func__,
323 ds->ds_remotestr);
324 } else {
325 kfree(remotestr);
326 kfree(ds);
327 atomic_inc(&tmp_ds->ds_count);
328 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
329 __func__, tmp_ds->ds_remotestr,
330 atomic_read(&tmp_ds->ds_count));
331 ds = tmp_ds;
332 }
333 spin_unlock(&nfs4_ds_cache_lock);
334out:
335 return ds;
336}
337
338/*
339 * Currently only supports ipv4, ipv6 and one multi-path address.
340 */
341static struct nfs4_pnfs_ds_addr *
342decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
343{
344 struct nfs4_pnfs_ds_addr *da = NULL;
345 char *buf, *portstr;
346 __be16 port;
347 int nlen, rlen;
348 int tmp[2];
349 __be32 *p;
350 char *netid, *match_netid;
351 size_t len, match_netid_len;
352 char *startsep = "";
353 char *endsep = "";
354
355
356 /* r_netid */
357 p = xdr_inline_decode(streamp, 4);
358 if (unlikely(!p))
359 goto out_err;
360 nlen = be32_to_cpup(p++);
361
362 p = xdr_inline_decode(streamp, nlen);
363 if (unlikely(!p))
364 goto out_err;
365
366 netid = kmalloc(nlen+1, gfp_flags);
367 if (unlikely(!netid))
368 goto out_err;
369
370 netid[nlen] = '\0';
371 memcpy(netid, p, nlen);
372
373 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
374 p = xdr_inline_decode(streamp, 4);
375 if (unlikely(!p))
376 goto out_free_netid;
377 rlen = be32_to_cpup(p);
378
379 p = xdr_inline_decode(streamp, rlen);
380 if (unlikely(!p))
381 goto out_free_netid;
382
383 /* port is ".ABC.DEF", 8 chars max */
384 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
385 dprintk("%s: Invalid address, length %d\n", __func__,
386 rlen);
387 goto out_free_netid;
388 }
389 buf = kmalloc(rlen + 1, gfp_flags);
390 if (!buf) {
391 dprintk("%s: Not enough memory\n", __func__);
392 goto out_free_netid;
393 }
394 buf[rlen] = '\0';
395 memcpy(buf, p, rlen);
396
397 /* replace port '.' with '-' */
398 portstr = strrchr(buf, '.');
399 if (!portstr) {
400 dprintk("%s: Failed finding expected dot in port\n",
401 __func__);
402 goto out_free_buf;
403 }
404 *portstr = '-';
405
406 /* find '.' between address and port */
407 portstr = strrchr(buf, '.');
408 if (!portstr) {
409 dprintk("%s: Failed finding expected dot between address and "
410 "port\n", __func__);
411 goto out_free_buf;
412 }
413 *portstr = '\0';
414
415 da = kzalloc(sizeof(*da), gfp_flags);
416 if (unlikely(!da))
417 goto out_free_buf;
418
419 INIT_LIST_HEAD(&da->da_node);
420
421 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
422 sizeof(da->da_addr))) {
423 dprintk("%s: error parsing address %s\n", __func__, buf);
424 goto out_free_da;
425 }
426
427 portstr++;
428 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
429 port = htons((tmp[0] << 8) | (tmp[1]));
430
431 switch (da->da_addr.ss_family) {
432 case AF_INET:
433 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
434 da->da_addrlen = sizeof(struct sockaddr_in);
435 match_netid = "tcp";
436 match_netid_len = 3;
437 break;
438
439 case AF_INET6:
440 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
441 da->da_addrlen = sizeof(struct sockaddr_in6);
442 match_netid = "tcp6";
443 match_netid_len = 4;
444 startsep = "[";
445 endsep = "]";
446 break;
447
448 default:
449 dprintk("%s: unsupported address family: %u\n",
450 __func__, da->da_addr.ss_family);
451 goto out_free_da;
452 }
453
454 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
455 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
456 __func__, netid, match_netid);
457 goto out_free_da;
458 }
459
460 /* save human readable address */
461 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
462 da->da_remotestr = kzalloc(len, gfp_flags);
463
464 /* NULL is ok, only used for dprintk */
465 if (da->da_remotestr)
466 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
467 buf, endsep, ntohs(port));
468
469 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
470 kfree(buf);
471 kfree(netid);
472 return da;
473
474out_free_da:
475 kfree(da);
476out_free_buf:
477 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
478 kfree(buf);
479out_free_netid:
480 kfree(netid);
481out_err:
482 return NULL;
483}
484
485/* Decode opaque device data and return the result */ 61/* Decode opaque device data and return the result */
486struct nfs4_file_layout_dsaddr * 62struct nfs4_file_layout_dsaddr *
487nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, 63nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
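The data-server cache removed above (it moves to shared pNFS code, reached here through nfs4_pnfs_ds_put() and nfs4_decode_mp_ds_addr()) follows the classic alloc-outside, lookup-under-lock, insert-or-discard pattern: the candidate entry is allocated before nfs4_ds_cache_lock is taken; on a hit the cached entry's refcount is bumped and the candidate freed, on a miss the candidate is published. A userspace rendering of the same pattern:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct ds {
        struct ds *next;
        char addr[64];
        int refcount;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ds *cache;

static struct ds *ds_get(const char *addr)
{
        struct ds *ds, *fresh = calloc(1, sizeof(*fresh));

        if (!fresh)
                return NULL;
        strncpy(fresh->addr, addr, sizeof(fresh->addr) - 1);
        fresh->refcount = 1;

        pthread_mutex_lock(&cache_lock);
        for (ds = cache; ds; ds = ds->next)
                if (!strcmp(ds->addr, addr))
                        break;
        if (ds) {
                ds->refcount++;         /* hit: reuse, drop our copy */
                free(fresh);
        } else {
                fresh->next = cache;    /* miss: publish the new entry */
                cache = fresh;
                ds = fresh;
        }
        pthread_mutex_unlock(&cache_lock);
        return ds;
}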
@@ -584,8 +160,8 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
584 160
585 mp_count = be32_to_cpup(p); /* multipath count */ 161 mp_count = be32_to_cpup(p); /* multipath count */
586 for (j = 0; j < mp_count; j++) { 162 for (j = 0; j < mp_count; j++) {
587 da = decode_ds_addr(server->nfs_client->cl_net, 163 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
588 &stream, gfp_flags); 164 &stream, gfp_flags);
589 if (da) 165 if (da)
590 list_add_tail(&da->da_node, &dsaddrs); 166 list_add_tail(&da->da_node, &dsaddrs);
591 } 167 }
@@ -681,22 +257,7 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
 	return flseg->fh_array[i];
 }
 
-static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
-{
-	might_sleep();
-	wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
-			   nfs_wait_bit_killable, TASK_KILLABLE);
-}
-
-static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
-{
-	smp_mb__before_atomic();
-	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
-	smp_mb__after_atomic();
-	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
-}
-
-
+/* Upon return, either ds is connected, or ds is NULL */
 struct nfs4_pnfs_ds *
 nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
@@ -704,29 +265,23 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
 	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 	struct nfs4_pnfs_ds *ret = ds;
+	struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 
 	if (ds == NULL) {
 		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
-		filelayout_mark_devid_invalid(devid);
+		pnfs_generic_mark_devid_invalid(devid);
 		goto out;
 	}
 	smp_rmb();
 	if (ds->ds_clp)
 		goto out_test_devid;
 
-	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
-		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
-		int err;
-
-		err = nfs4_ds_connect(s, ds);
-		if (err)
-			nfs4_mark_deviceid_unavailable(devid);
-		nfs4_clear_ds_conn_bit(ds);
-	} else {
-		/* Either ds is connected, or ds is NULL */
-		nfs4_wait_ds_connect(ds);
-	}
+	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+			     dataserver_retrans, 4,
+			     s->nfs_client->cl_minorversion,
+			     s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
 out_test_devid:
 	if (filelayout_test_devid_unavailable(devid))
 		ret = NULL;
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000000..1d2c9f6bbcd4
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Flexfile Layout Driver kernel module
3#
4obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
5nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000000..c22ecaa86c1c
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,1574 @@
1/*
2 * Module for pnfs flexfile layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/nfs_page.h>
11#include <linux/module.h>
12
13#include <linux/sunrpc/metrics.h>
14#include <linux/nfs_idmap.h>
15
16#include "flexfilelayout.h"
17#include "../nfs4session.h"
18#include "../internal.h"
19#include "../delegation.h"
20#include "../nfs4trace.h"
21#include "../iostat.h"
22#include "../nfs.h"
23
24#define NFSDBG_FACILITY NFSDBG_PNFS_LD
25
26#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
27
28static struct pnfs_layout_hdr *
29ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
30{
31 struct nfs4_flexfile_layout *ffl;
32
33 ffl = kzalloc(sizeof(*ffl), gfp_flags);
34 if (ffl) {
35 INIT_LIST_HEAD(&ffl->error_list);
36 return &ffl->generic_hdr;
37 } else
38 return NULL;
39}
40
41static void
42ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
43{
44 struct nfs4_ff_layout_ds_err *err, *n;
45
46 list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
47 list) {
48 list_del(&err->list);
49 kfree(err);
50 }
51 kfree(FF_LAYOUT_FROM_HDR(lo));
52}
53
54static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
55{
56 __be32 *p;
57
58 p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
59 if (unlikely(p == NULL))
60 return -ENOBUFS;
61 memcpy(stateid, p, NFS4_STATEID_SIZE);
62 dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
63 p[0], p[1], p[2], p[3]);
64 return 0;
65}
66
67static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
68{
69 __be32 *p;
70
71 p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
72 if (unlikely(!p))
73 return -ENOBUFS;
74 memcpy(devid, p, NFS4_DEVICEID4_SIZE);
75 nfs4_print_deviceid(devid);
76 return 0;
77}
78
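/*
 * The filehandle is an XDR opaque<>: a 4-byte length followed by the
 * data, padded to a 4-byte boundary (xdr_inline_decode() rounds up to
 * quad alignment when advancing the stream).
 */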
79static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
80{
81 __be32 *p;
82
83 p = xdr_inline_decode(xdr, 4);
84 if (unlikely(!p))
85 return -ENOBUFS;
86 fh->size = be32_to_cpup(p++);
87 if (fh->size > sizeof(struct nfs_fh)) {
88 printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
89 fh->size);
90 return -EOVERFLOW;
91 }
92 /* fh.data */
93 p = xdr_inline_decode(xdr, fh->size);
94 if (unlikely(!p))
95 return -ENOBUFS;
96 memcpy(&fh->data, p, fh->size);
97 dprintk("%s: fh len %d\n", __func__, fh->size);
98
99 return 0;
100}
101
102/*
103 * Currently only stringified uids and gids are accepted.
104 * I.e., kerberos is not supported to the DSes, so no principals.
105 *
106 * That means that one common function will suffice, but when
107 * principals are added, this should be split to accommodate
108 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
109 */
110static int
111decode_name(struct xdr_stream *xdr, u32 *id)
112{
113 __be32 *p;
114 int len;
115
116 /* opaque_length(4)*/
117 p = xdr_inline_decode(xdr, 4);
118 if (unlikely(!p))
119 return -ENOBUFS;
120 len = be32_to_cpup(p++);
121 if (len < 0)
122 return -EINVAL;
123
124 dprintk("%s: len %u\n", __func__, len);
125
126 /* opaque body */
127 p = xdr_inline_decode(xdr, len);
128 if (unlikely(!p))
129 return -ENOBUFS;
130
131 if (!nfs_map_string_to_numeric((char *)p, len, id))
132 return -EINVAL;
133
134 return 0;
135}
136
137static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
138{
139 int i;
140
141 if (fls->mirror_array) {
142 for (i = 0; i < fls->mirror_array_cnt; i++) {
143 /* mirror_ds is normally freed in
144 * .free_deviceid_node, but we still do it here
145 * for the .alloc_lseg error path */
146 if (fls->mirror_array[i]) {
147 kfree(fls->mirror_array[i]->fh_versions);
148 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
149 kfree(fls->mirror_array[i]);
150 }
151 }
152 kfree(fls->mirror_array);
153 fls->mirror_array = NULL;
154 }
155}
156
157static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
158{
159 int ret = 0;
160
161 dprintk("--> %s\n", __func__);
162
163 /* FIXME: remove this check when layout segment support is added */
164 if (lgr->range.offset != 0 ||
165 lgr->range.length != NFS4_MAX_UINT64) {
166 dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
167 __func__);
168 ret = -EINVAL;
169 }
170
171 dprintk("--> %s returns %d\n", __func__, ret);
172 return ret;
173}
174
175static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
176{
177 if (fls) {
178 ff_layout_free_mirror_array(fls);
179 kfree(fls);
180 }
181}
182
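/*
 * Simple O(n^2) exchange sort, ordering the mirrors by descending
 * efficiency so readers can take the first usable entry; mirror
 * counts are expected to be small.
 */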
183static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
184{
185 struct nfs4_ff_layout_mirror *tmp;
186 int i, j;
187
188 for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
189 for (j = i + 1; j < fls->mirror_array_cnt; j++)
190 if (fls->mirror_array[i]->efficiency <
191 fls->mirror_array[j]->efficiency) {
192 tmp = fls->mirror_array[i];
193 fls->mirror_array[i] = fls->mirror_array[j];
194 fls->mirror_array[j] = tmp;
195 }
196 }
197}
198
199static struct pnfs_layout_segment *
200ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
201 struct nfs4_layoutget_res *lgr,
202 gfp_t gfp_flags)
203{
204 struct pnfs_layout_segment *ret;
205 struct nfs4_ff_layout_segment *fls = NULL;
206 struct xdr_stream stream;
207 struct xdr_buf buf;
208 struct page *scratch;
209 u64 stripe_unit;
210 u32 mirror_array_cnt;
211 __be32 *p;
212 int i, rc;
213
214 dprintk("--> %s\n", __func__);
215 scratch = alloc_page(gfp_flags);
216 if (!scratch)
217 return ERR_PTR(-ENOMEM);
218
219 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
220 lgr->layoutp->len);
221 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
222
223 /* stripe unit and mirror_array_cnt */
224 rc = -EIO;
225 p = xdr_inline_decode(&stream, 8 + 4);
226 if (!p)
227 goto out_err_free;
228
229 p = xdr_decode_hyper(p, &stripe_unit);
230 mirror_array_cnt = be32_to_cpup(p++);
231 dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
232 stripe_unit, mirror_array_cnt);
233
234 if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
235 mirror_array_cnt == 0)
236 goto out_err_free;
237
238 rc = -ENOMEM;
239 fls = kzalloc(sizeof(*fls), gfp_flags);
240 if (!fls)
241 goto out_err_free;
242
243 fls->mirror_array_cnt = mirror_array_cnt;
244 fls->stripe_unit = stripe_unit;
245 fls->mirror_array = kcalloc(fls->mirror_array_cnt,
246 sizeof(fls->mirror_array[0]), gfp_flags);
247 if (fls->mirror_array == NULL)
248 goto out_err_free;
249
250 for (i = 0; i < fls->mirror_array_cnt; i++) {
251 struct nfs4_deviceid devid;
252 struct nfs4_deviceid_node *idnode;
253 u32 ds_count;
254 u32 fh_count;
255 int j;
256
257 rc = -EIO;
258 p = xdr_inline_decode(&stream, 4);
259 if (!p)
260 goto out_err_free;
261 ds_count = be32_to_cpup(p);
262
263 /* FIXME: allow for striping? */
264 if (ds_count != 1)
265 goto out_err_free;
266
267 fls->mirror_array[i] =
268 kzalloc(sizeof(struct nfs4_ff_layout_mirror),
269 gfp_flags);
270 if (fls->mirror_array[i] == NULL) {
271 rc = -ENOMEM;
272 goto out_err_free;
273 }
274
275 spin_lock_init(&fls->mirror_array[i]->lock);
276 fls->mirror_array[i]->ds_count = ds_count;
277
278 /* deviceid */
279 rc = decode_deviceid(&stream, &devid);
280 if (rc)
281 goto out_err_free;
282
283 idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
284 &devid, lh->plh_lc_cred,
285 gfp_flags);
286 /*
287 * Upon success, mirror_ds was allocated either by a previous
288 * GETDEVICEINFO or newly by .alloc_deviceid_node;
289 * a nfs4_find_get_deviceid() failure is in fact a GETDEVICEINFO failure
290 */
291 if (idnode)
292 fls->mirror_array[i]->mirror_ds =
293 FF_LAYOUT_MIRROR_DS(idnode);
294 else
295 goto out_err_free;
296
297 /* efficiency */
298 rc = -EIO;
299 p = xdr_inline_decode(&stream, 4);
300 if (!p)
301 goto out_err_free;
302 fls->mirror_array[i]->efficiency = be32_to_cpup(p);
303
304 /* stateid */
305 rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
306 if (rc)
307 goto out_err_free;
308
309 /* fh */
310 p = xdr_inline_decode(&stream, 4);
311 if (!p)
312 goto out_err_free;
313 fh_count = be32_to_cpup(p);
314
315 fls->mirror_array[i]->fh_versions =
316 kcalloc(fh_count, sizeof(struct nfs_fh),
317 gfp_flags);
318 if (fls->mirror_array[i]->fh_versions == NULL) {
319 rc = -ENOMEM;
320 goto out_err_free;
321 }
322
323 for (j = 0; j < fh_count; j++) {
324 rc = decode_nfs_fh(&stream,
325 &fls->mirror_array[i]->fh_versions[j]);
326 if (rc)
327 goto out_err_free;
328 }
329
330 fls->mirror_array[i]->fh_versions_cnt = fh_count;
331
332 /* user */
333 rc = decode_name(&stream, &fls->mirror_array[i]->uid);
334 if (rc)
335 goto out_err_free;
336
337 /* group */
338 rc = decode_name(&stream, &fls->mirror_array[i]->gid);
339 if (rc)
340 goto out_err_free;
341
342 dprintk("%s: uid %d gid %d\n", __func__,
343 fls->mirror_array[i]->uid,
344 fls->mirror_array[i]->gid);
345 }
346
347 ff_layout_sort_mirrors(fls);
348 rc = ff_layout_check_layout(lgr);
349 if (rc)
350 goto out_err_free;
351
352 ret = &fls->generic_hdr;
353 dprintk("<-- %s (success)\n", __func__);
354out_free_page:
355 __free_page(scratch);
356 return ret;
357out_err_free:
358 _ff_layout_free_lseg(fls);
359 ret = ERR_PTR(rc);
360 dprintk("<-- %s (%d)\n", __func__, rc);
361 goto out_free_page;
362}
363
364static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
365{
366 struct pnfs_layout_segment *lseg;
367
368 list_for_each_entry(lseg, &layout->plh_segs, pls_list)
369 if (lseg->pls_range.iomode == IOMODE_RW)
370 return true;
371
372 return false;
373}
374
375static void
376ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
377{
378 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
379 int i;
380
381 dprintk("--> %s\n", __func__);
382
383 for (i = 0; i < fls->mirror_array_cnt; i++) {
384 if (fls->mirror_array[i]) {
385 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
386 fls->mirror_array[i]->mirror_ds = NULL;
387 if (fls->mirror_array[i]->cred) {
388 put_rpccred(fls->mirror_array[i]->cred);
389 fls->mirror_array[i]->cred = NULL;
390 }
391 }
392 }
393
394 if (lseg->pls_range.iomode == IOMODE_RW) {
395 struct nfs4_flexfile_layout *ffl;
396 struct inode *inode;
397
398 ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
399 inode = ffl->generic_hdr.plh_inode;
400 spin_lock(&inode->i_lock);
401 if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
402 ffl->commit_info.nbuckets = 0;
403 kfree(ffl->commit_info.buckets);
404 ffl->commit_info.buckets = NULL;
405 }
406 spin_unlock(&inode->i_lock);
407 }
408 _ff_layout_free_lseg(fls);
409}
410
411/* Return 1 until we support multiple lsegs */
412static int
413ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
414{
415 return 1;
416}
417
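/*
 * Allocate the commit buckets outside cinfo->lock, then recheck
 * nbuckets under the lock: if another thread won the race, drop our
 * allocation and use theirs.
 */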
418static int
419ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
420 struct nfs_commit_info *cinfo,
421 gfp_t gfp_flags)
422{
423 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
424 struct pnfs_commit_bucket *buckets;
425 int size;
426
427 if (cinfo->ds->nbuckets != 0) {
428 /* This assumes there is only one RW lseg per file.
429 * To support multiple lseg per file, we need to
430 * change struct pnfs_commit_bucket to allow dynamic
431 * increasing nbuckets.
432 */
433 return 0;
434 }
435
436 size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
437
438 buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
439 gfp_flags);
440 if (!buckets)
441 return -ENOMEM;
442 else {
443 int i;
444
445 spin_lock(cinfo->lock);
446 if (cinfo->ds->nbuckets != 0)
447 kfree(buckets);
448 else {
449 cinfo->ds->buckets = buckets;
450 cinfo->ds->nbuckets = size;
451 for (i = 0; i < size; i++) {
452 INIT_LIST_HEAD(&buckets[i].written);
453 INIT_LIST_HEAD(&buckets[i].committing);
454 /* mark direct verifier as unset */
455 buckets[i].direct_verf.committed =
456 NFS_INVALID_STABLE_HOW;
457 }
458 }
459 spin_unlock(cinfo->lock);
460 return 0;
461 }
462}
463
464static struct nfs4_pnfs_ds *
465ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
466 int *best_idx)
467{
468 struct nfs4_ff_layout_segment *fls;
469 struct nfs4_pnfs_ds *ds;
470 int idx;
471
472 fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
473 /* mirrors are sorted by efficiency */
474 for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
475 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
476 if (ds) {
477 *best_idx = idx;
478 return ds;
479 }
480 }
481
482 return NULL;
483}
484
485static void
486ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
487 struct nfs_page *req)
488{
489 struct nfs_pgio_mirror *pgm;
490 struct nfs4_ff_layout_mirror *mirror;
491 struct nfs4_pnfs_ds *ds;
492 int ds_idx;
493
494 /* Use full layout for now */
495 if (!pgio->pg_lseg)
496 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
497 req->wb_context,
498 0,
499 NFS4_MAX_UINT64,
500 IOMODE_READ,
501 GFP_KERNEL);
502 /* If no lseg, fall back to read through mds */
503 if (pgio->pg_lseg == NULL)
504 goto out_mds;
505
506 ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
507 if (!ds)
508 goto out_mds;
509 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
510
511 pgio->pg_mirror_idx = ds_idx;
512
513 /* read always uses only one mirror - idx 0 for pgio layer */
514 pgm = &pgio->pg_mirrors[0];
515 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
516
517 return;
518out_mds:
519 pnfs_put_lseg(pgio->pg_lseg);
520 pgio->pg_lseg = NULL;
521 nfs_pageio_reset_read_mds(pgio);
522}
523
524static void
525ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
526 struct nfs_page *req)
527{
528 struct nfs4_ff_layout_mirror *mirror;
529 struct nfs_pgio_mirror *pgm;
530 struct nfs_commit_info cinfo;
531 struct nfs4_pnfs_ds *ds;
532 int i;
533 int status;
534
535 if (!pgio->pg_lseg)
536 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
537 req->wb_context,
538 0,
539 NFS4_MAX_UINT64,
540 IOMODE_RW,
541 GFP_NOFS);
542 /* If no lseg, fall back to write through mds */
543 if (pgio->pg_lseg == NULL)
544 goto out_mds;
545
546 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
547 status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
548 if (status < 0)
549 goto out_mds;
550
551 /* Use a direct mapping of ds_idx to pgio mirror_idx */
552 if (WARN_ON_ONCE(pgio->pg_mirror_count !=
553 FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
554 goto out_mds;
555
556 for (i = 0; i < pgio->pg_mirror_count; i++) {
557 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
558 if (!ds)
559 goto out_mds;
560 pgm = &pgio->pg_mirrors[i];
561 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
562 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
563 }
564
565 return;
566
567out_mds:
568 pnfs_put_lseg(pgio->pg_lseg);
569 pgio->pg_lseg = NULL;
570 nfs_pageio_reset_write_mds(pgio);
571}
572
573static unsigned int
574ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
575 struct nfs_page *req)
576{
577 if (!pgio->pg_lseg)
578 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
579 req->wb_context,
580 0,
581 NFS4_MAX_UINT64,
582 IOMODE_RW,
583 GFP_NOFS);
584 if (pgio->pg_lseg)
585 return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
586
587 /* no lseg means that pnfs is not in use, so no mirroring here */
588 pnfs_put_lseg(pgio->pg_lseg);
589 pgio->pg_lseg = NULL;
590 nfs_pageio_reset_write_mds(pgio);
591 return 1;
592}
593
594static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
595 .pg_init = ff_layout_pg_init_read,
596 .pg_test = pnfs_generic_pg_test,
597 .pg_doio = pnfs_generic_pg_readpages,
598 .pg_cleanup = pnfs_generic_pg_cleanup,
599};
600
601static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
602 .pg_init = ff_layout_pg_init_write,
603 .pg_test = pnfs_generic_pg_test,
604 .pg_doio = pnfs_generic_pg_writepages,
605 .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
606 .pg_cleanup = pnfs_generic_pg_cleanup,
607};
608
609static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
610{
611 struct rpc_task *task = &hdr->task;
612
613 pnfs_layoutcommit_inode(hdr->inode, false);
614
615 if (retry_pnfs) {
616 dprintk("%s Reset task %5u for i/o through pNFS "
617 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
618 hdr->task.tk_pid,
619 hdr->inode->i_sb->s_id,
620 (unsigned long long)NFS_FILEID(hdr->inode),
621 hdr->args.count,
622 (unsigned long long)hdr->args.offset);
623
624 if (!hdr->dreq) {
625 struct nfs_open_context *ctx;
626
627 ctx = nfs_list_entry(hdr->pages.next)->wb_context;
628 set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
629 hdr->completion_ops->error_cleanup(&hdr->pages);
630 } else {
631 nfs_direct_set_resched_writes(hdr->dreq);
632 /* fake unstable write to let common nfs resend pages */
633 hdr->verf.committed = NFS_UNSTABLE;
634 hdr->good_bytes = 0;
635 }
636 return;
637 }
638
639 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
640 dprintk("%s Reset task %5u for i/o through MDS "
641 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
642 hdr->task.tk_pid,
643 hdr->inode->i_sb->s_id,
644 (unsigned long long)NFS_FILEID(hdr->inode),
645 hdr->args.count,
646 (unsigned long long)hdr->args.offset);
647
648 task->tk_status = pnfs_write_done_resend_to_mds(hdr);
649 }
650}
651
652static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
653{
654 struct rpc_task *task = &hdr->task;
655
656 pnfs_layoutcommit_inode(hdr->inode, false);
657
658 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
659 dprintk("%s Reset task %5u for i/o through MDS "
660 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
661 hdr->task.tk_pid,
662 hdr->inode->i_sb->s_id,
663 (unsigned long long)NFS_FILEID(hdr->inode),
664 hdr->args.count,
665 (unsigned long long)hdr->args.offset);
666
667 task->tk_status = pnfs_read_done_resend_to_mds(hdr);
668 }
669}
670
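/*
 * Returns 0 when the error is final (or already handled), -EAGAIN to
 * restart the RPC, and -NFS4ERR_RESET_TO_PNFS / -NFS4ERR_RESET_TO_MDS
 * to make the caller redrive the I/O through another DS or the MDS.
 */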
671static int ff_layout_async_handle_error_v4(struct rpc_task *task,
672 struct nfs4_state *state,
673 struct nfs_client *clp,
674 struct pnfs_layout_segment *lseg,
675 int idx)
676{
677 struct pnfs_layout_hdr *lo = lseg->pls_layout;
678 struct inode *inode = lo->plh_inode;
679 struct nfs_server *mds_server = NFS_SERVER(inode);
680
681 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
682 struct nfs_client *mds_client = mds_server->nfs_client;
683 struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
684
685 if (task->tk_status >= 0)
686 return 0;
687
688 switch (task->tk_status) {
689 /* MDS state errors */
690 case -NFS4ERR_DELEG_REVOKED:
691 case -NFS4ERR_ADMIN_REVOKED:
692 case -NFS4ERR_BAD_STATEID:
693 if (state == NULL)
694 break;
695 nfs_remove_bad_delegation(state->inode);
696 case -NFS4ERR_OPENMODE:
697 if (state == NULL)
698 break;
699 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
700 goto out_bad_stateid;
701 goto wait_on_recovery;
702 case -NFS4ERR_EXPIRED:
703 if (state != NULL) {
704 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
705 goto out_bad_stateid;
706 }
707 nfs4_schedule_lease_recovery(mds_client);
708 goto wait_on_recovery;
709 /* DS session errors */
710 case -NFS4ERR_BADSESSION:
711 case -NFS4ERR_BADSLOT:
712 case -NFS4ERR_BAD_HIGH_SLOT:
713 case -NFS4ERR_DEADSESSION:
714 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
715 case -NFS4ERR_SEQ_FALSE_RETRY:
716 case -NFS4ERR_SEQ_MISORDERED:
717 dprintk("%s ERROR %d, Reset session. Exchangeid "
718 "flags 0x%x\n", __func__, task->tk_status,
719 clp->cl_exchange_flags);
720 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
721 break;
722 case -NFS4ERR_DELAY:
723 case -NFS4ERR_GRACE:
724 rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
725 break;
726 case -NFS4ERR_RETRY_UNCACHED_REP:
727 break;
728 /* Invalidate Layout errors */
729 case -NFS4ERR_PNFS_NO_LAYOUT:
730 case -ESTALE: /* mapped NFS4ERR_STALE */
731 case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
732 case -EISDIR: /* mapped NFS4ERR_ISDIR */
733 case -NFS4ERR_FHEXPIRED:
734 case -NFS4ERR_WRONG_TYPE:
735 dprintk("%s Invalid layout error %d\n", __func__,
736 task->tk_status);
737 /*
738 * Destroy layout so new i/o will get a new layout.
739 * Layout will not be destroyed until all current lseg
740 * references are put. Mark layout as invalid to resend failed
741 * i/o and all i/o waiting on the slot table to the MDS until
742 * layout is destroyed and a new valid layout is obtained.
743 */
744 pnfs_destroy_layout(NFS_I(inode));
745 rpc_wake_up(&tbl->slot_tbl_waitq);
746 goto reset;
747 /* RPC connection errors */
748 case -ECONNREFUSED:
749 case -EHOSTDOWN:
750 case -EHOSTUNREACH:
751 case -ENETUNREACH:
752 case -EIO:
753 case -ETIMEDOUT:
754 case -EPIPE:
755 dprintk("%s DS connection error %d\n", __func__,
756 task->tk_status);
757 nfs4_mark_deviceid_unavailable(devid);
758 rpc_wake_up(&tbl->slot_tbl_waitq);
759 /* fall through */
760 default:
761 if (ff_layout_has_available_ds(lseg))
762 return -NFS4ERR_RESET_TO_PNFS;
763reset:
764 dprintk("%s Retry through MDS. Error %d\n", __func__,
765 task->tk_status);
766 return -NFS4ERR_RESET_TO_MDS;
767 }
768out:
769 task->tk_status = 0;
770 return -EAGAIN;
771out_bad_stateid:
772 task->tk_status = -EIO;
773 return 0;
774wait_on_recovery:
775 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
776 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
777 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
778 goto out;
779}
780
781/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
782static int ff_layout_async_handle_error_v3(struct rpc_task *task,
783 struct pnfs_layout_segment *lseg,
784 int idx)
785{
786 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
787
788 if (task->tk_status >= 0)
789 return 0;
790
791 if (task->tk_status != -EJUKEBOX) {
792 dprintk("%s DS connection error %d\n", __func__,
793 task->tk_status);
794 nfs4_mark_deviceid_unavailable(devid);
795 if (ff_layout_has_available_ds(lseg))
796 return -NFS4ERR_RESET_TO_PNFS;
797 else
798 return -NFS4ERR_RESET_TO_MDS;
799 }
800
801 if (task->tk_status == -EJUKEBOX)
802 nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
803 task->tk_status = 0;
804 rpc_restart_call(task);
805 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
806 return -EAGAIN;
807}
808
809static int ff_layout_async_handle_error(struct rpc_task *task,
810 struct nfs4_state *state,
811 struct nfs_client *clp,
812 struct pnfs_layout_segment *lseg,
813 int idx)
814{
815 int vers = clp->cl_nfs_mod->rpc_vers->number;
816
817 switch (vers) {
818 case 3:
819 return ff_layout_async_handle_error_v3(task, lseg, idx);
820 case 4:
821 return ff_layout_async_handle_error_v4(task, state, clp,
822 lseg, idx);
823 default:
824 /* should never happen */
825 WARN_ON_ONCE(1);
826 return 0;
827 }
828}
829
830static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
831 int idx, u64 offset, u64 length,
832 u32 status, int opnum)
833{
834 struct nfs4_ff_layout_mirror *mirror;
835 int err;
836
837 mirror = FF_LAYOUT_COMP(lseg, idx);
838 err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
839 mirror, offset, length, status, opnum,
840 GFP_NOIO);
841 dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
842}
843
844/* NFS_PROTO call done callback routines */
845
846static int ff_layout_read_done_cb(struct rpc_task *task,
847 struct nfs_pgio_header *hdr)
848{
849 struct inode *inode;
850 int err;
851
852 trace_nfs4_pnfs_read(hdr, task->tk_status);
853 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
854 hdr->res.op_status = NFS4ERR_NXIO;
855 if (task->tk_status < 0 && hdr->res.op_status)
856 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
857 hdr->args.offset, hdr->args.count,
858 hdr->res.op_status, OP_READ);
859 err = ff_layout_async_handle_error(task, hdr->args.context->state,
860 hdr->ds_clp, hdr->lseg,
861 hdr->pgio_mirror_idx);
862
863 switch (err) {
864 case -NFS4ERR_RESET_TO_PNFS:
865 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
866 &hdr->lseg->pls_layout->plh_flags);
867 pnfs_read_resend_pnfs(hdr);
868 return task->tk_status;
869 case -NFS4ERR_RESET_TO_MDS:
870 inode = hdr->lseg->pls_layout->plh_inode;
871 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
872 ff_layout_reset_read(hdr);
873 return task->tk_status;
874 case -EAGAIN:
875 rpc_restart_call_prepare(task);
876 return -EAGAIN;
877 }
878
879 return 0;
880}
881
882/*
883 * We reference the rpc_cred of the first WRITE that triggers the need for
884 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
885 * rfc5661 is not clear about which credential should be used.
886 *
887 * A flexfiles client should treat a FILE_SYNC reply from the DS as
888 * DATA_SYNC, so per http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
889 * we always send a layoutcommit after DS writes.
890 */
891static void
892ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
893{
894 pnfs_set_layoutcommit(hdr);
895 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
896 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
897}
898
899static bool
900ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
901{
902 /* No mirroring for now */
903 struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
904
905 return ff_layout_test_devid_unavailable(node);
906}
907
908static int ff_layout_read_prepare_common(struct rpc_task *task,
909 struct nfs_pgio_header *hdr)
910{
911 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
912 rpc_exit(task, -EIO);
913 return -EIO;
914 }
915 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
916 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
917 if (ff_layout_has_available_ds(hdr->lseg))
918 pnfs_read_resend_pnfs(hdr);
919 else
920 ff_layout_reset_read(hdr);
921 rpc_exit(task, 0);
922 return -EAGAIN;
923 }
924 hdr->pgio_done_cb = ff_layout_read_done_cb;
925
926 return 0;
927}
928
929/*
930 * Call ops for the async read/write cases
931 * In the case of dense layouts, the offset needs to be reset to its
932 * original value.
933 */
934static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
935{
936 struct nfs_pgio_header *hdr = data;
937
938 if (ff_layout_read_prepare_common(task, hdr))
939 return;
940
941 rpc_call_start(task);
942}
943
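/*
 * A DS may be NFSv4.0 (bare slot table) or NFSv4.1+ (full session);
 * choose the matching sequence setup.
 */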
944static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
945 struct nfs4_sequence_args *args,
946 struct nfs4_sequence_res *res,
947 struct rpc_task *task)
948{
949 if (ds_clp->cl_session)
950 return nfs41_setup_sequence(ds_clp->cl_session,
951 args,
952 res,
953 task);
954 return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
955 args,
956 res,
957 task);
958}
959
960static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
961{
962 struct nfs_pgio_header *hdr = data;
963
964 if (ff_layout_read_prepare_common(task, hdr))
965 return;
966
967 if (ff_layout_setup_sequence(hdr->ds_clp,
968 &hdr->args.seq_args,
969 &hdr->res.seq_res,
970 task))
971 return;
972
973 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
974 hdr->args.lock_context, FMODE_READ) == -EIO)
975 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
976}
977
978static void ff_layout_read_call_done(struct rpc_task *task, void *data)
979{
980 struct nfs_pgio_header *hdr = data;
981
982 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
983
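	/* I/O is being resent; only the session sequencing needs to finish */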
984 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
985 task->tk_status == 0) {
986 nfs4_sequence_done(task, &hdr->res.seq_res);
987 return;
988 }
989
990 /* Note this may cause RPC to be resent */
991 hdr->mds_ops->rpc_call_done(task, hdr);
992}
993
994static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
995{
996 struct nfs_pgio_header *hdr = data;
997
998 rpc_count_iostats_metrics(task,
999 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
1000}
1001
1002static int ff_layout_write_done_cb(struct rpc_task *task,
1003 struct nfs_pgio_header *hdr)
1004{
1005 struct inode *inode;
1006 int err;
1007
1008 trace_nfs4_pnfs_write(hdr, task->tk_status);
1009 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
1010 hdr->res.op_status = NFS4ERR_NXIO;
1011 if (task->tk_status < 0 && hdr->res.op_status)
1012 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1013 hdr->args.offset, hdr->args.count,
1014 hdr->res.op_status, OP_WRITE);
1015 err = ff_layout_async_handle_error(task, hdr->args.context->state,
1016 hdr->ds_clp, hdr->lseg,
1017 hdr->pgio_mirror_idx);
1018
1019 switch (err) {
1020 case -NFS4ERR_RESET_TO_PNFS:
1021 case -NFS4ERR_RESET_TO_MDS:
1022 inode = hdr->lseg->pls_layout->plh_inode;
1023 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
1024 if (err == -NFS4ERR_RESET_TO_PNFS) {
1025 pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
1026 ff_layout_reset_write(hdr, true);
1027 } else {
1028 pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
1029 ff_layout_reset_write(hdr, false);
1030 }
1031 return task->tk_status;
1032 case -EAGAIN:
1033 rpc_restart_call_prepare(task);
1034 return -EAGAIN;
1035 }
1036
1037 if (hdr->res.verf->committed == NFS_FILE_SYNC ||
1038 hdr->res.verf->committed == NFS_DATA_SYNC)
1039 ff_layout_set_layoutcommit(hdr);
1040
1041 return 0;
1042}
1043
1044static int ff_layout_commit_done_cb(struct rpc_task *task,
1045 struct nfs_commit_data *data)
1046{
1047 struct inode *inode;
1048 int err;
1049
1050 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
1051 if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
1052 data->res.op_status = NFS4ERR_NXIO;
1053 if (task->tk_status < 0 && data->res.op_status)
1054 ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
1055 data->args.offset, data->args.count,
1056 data->res.op_status, OP_COMMIT);
1057 err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
1058 data->lseg, data->ds_commit_index);
1059
1060 switch (err) {
1061 case -NFS4ERR_RESET_TO_PNFS:
1062 case -NFS4ERR_RESET_TO_MDS:
1063 inode = data->lseg->pls_layout->plh_inode;
1064 pnfs_error_mark_layout_for_return(inode, data->lseg);
1065 if (err == -NFS4ERR_RESET_TO_PNFS)
1066 pnfs_set_retry_layoutget(data->lseg->pls_layout);
1067 else
1068 pnfs_clear_retry_layoutget(data->lseg->pls_layout);
1069 pnfs_generic_prepare_to_resend_writes(data);
1070 return -EAGAIN;
1071 case -EAGAIN:
1072 rpc_restart_call_prepare(task);
1073 return -EAGAIN;
1074 }
1075
1076 if (data->verf.committed == NFS_UNSTABLE)
1077 pnfs_commit_set_layoutcommit(data);
1078
1079 return 0;
1080}
1081
1082static int ff_layout_write_prepare_common(struct rpc_task *task,
1083 struct nfs_pgio_header *hdr)
1084{
1085 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1086 rpc_exit(task, -EIO);
1087 return -EIO;
1088 }
1089
1090 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
1091 bool retry_pnfs;
1092
1093 retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
1094 dprintk("%s task %u reset io to %s\n", __func__,
1095 task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
1096 ff_layout_reset_write(hdr, retry_pnfs);
1097 rpc_exit(task, 0);
1098 return -EAGAIN;
1099 }
1100
1101 return 0;
1102}
1103
1104static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
1105{
1106 struct nfs_pgio_header *hdr = data;
1107
1108 if (ff_layout_write_prepare_common(task, hdr))
1109 return;
1110
1111 rpc_call_start(task);
1112}
1113
1114static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
1115{
1116 struct nfs_pgio_header *hdr = data;
1117
1118 if (ff_layout_write_prepare_common(task, hdr))
1119 return;
1120
1121 if (ff_layout_setup_sequence(hdr->ds_clp,
1122 &hdr->args.seq_args,
1123 &hdr->res.seq_res,
1124 task))
1125 return;
1126
1127 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
1128 hdr->args.lock_context, FMODE_WRITE) == -EIO)
1129 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
1130}
1131
1132static void ff_layout_write_call_done(struct rpc_task *task, void *data)
1133{
1134 struct nfs_pgio_header *hdr = data;
1135
1136 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
1137 task->tk_status == 0) {
1138 nfs4_sequence_done(task, &hdr->res.seq_res);
1139 return;
1140 }
1141
1142 /* Note this may cause RPC to be resent */
1143 hdr->mds_ops->rpc_call_done(task, hdr);
1144}
1145
1146static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
1147{
1148 struct nfs_pgio_header *hdr = data;
1149
1150 rpc_count_iostats_metrics(task,
1151 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
1152}
1153
1154static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
1155{
1156 rpc_call_start(task);
1157}
1158
1159static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
1160{
1161 struct nfs_commit_data *wdata = data;
1162
1163 ff_layout_setup_sequence(wdata->ds_clp,
1164 &wdata->args.seq_args,
1165 &wdata->res.seq_res,
1166 task);
1167}
1168
1169static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
1170{
1171 struct nfs_commit_data *cdata = data;
1172
1173 rpc_count_iostats_metrics(task,
1174 &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
1175}
1176
1177static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
1178 .rpc_call_prepare = ff_layout_read_prepare_v3,
1179 .rpc_call_done = ff_layout_read_call_done,
1180 .rpc_count_stats = ff_layout_read_count_stats,
1181 .rpc_release = pnfs_generic_rw_release,
1182};
1183
1184static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
1185 .rpc_call_prepare = ff_layout_read_prepare_v4,
1186 .rpc_call_done = ff_layout_read_call_done,
1187 .rpc_count_stats = ff_layout_read_count_stats,
1188 .rpc_release = pnfs_generic_rw_release,
1189};
1190
1191static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
1192 .rpc_call_prepare = ff_layout_write_prepare_v3,
1193 .rpc_call_done = ff_layout_write_call_done,
1194 .rpc_count_stats = ff_layout_write_count_stats,
1195 .rpc_release = pnfs_generic_rw_release,
1196};
1197
1198static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
1199 .rpc_call_prepare = ff_layout_write_prepare_v4,
1200 .rpc_call_done = ff_layout_write_call_done,
1201 .rpc_count_stats = ff_layout_write_count_stats,
1202 .rpc_release = pnfs_generic_rw_release,
1203};
1204
1205static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
1206 .rpc_call_prepare = ff_layout_commit_prepare_v3,
1207 .rpc_call_done = pnfs_generic_write_commit_done,
1208 .rpc_count_stats = ff_layout_commit_count_stats,
1209 .rpc_release = pnfs_generic_commit_release,
1210};
1211
1212static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
1213 .rpc_call_prepare = ff_layout_commit_prepare_v4,
1214 .rpc_call_done = pnfs_generic_write_commit_done,
1215 .rpc_count_stats = ff_layout_commit_count_stats,
1216 .rpc_release = pnfs_generic_commit_release,
1217};
1218
1219static enum pnfs_try_status
1220ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
1221{
1222 struct pnfs_layout_segment *lseg = hdr->lseg;
1223 struct nfs4_pnfs_ds *ds;
1224 struct rpc_clnt *ds_clnt;
1225 struct rpc_cred *ds_cred;
1226 loff_t offset = hdr->args.offset;
1227 u32 idx = hdr->pgio_mirror_idx;
1228 int vers;
1229 struct nfs_fh *fh;
1230
1231 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
1232 __func__, hdr->inode->i_ino,
1233 hdr->args.pgbase, (size_t)hdr->args.count, offset);
1234
1235 ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
1236 if (!ds)
1237 goto out_failed;
1238
1239 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1240 hdr->inode);
1241 if (IS_ERR(ds_clnt))
1242 goto out_failed;
1243
1244 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1245 if (IS_ERR(ds_cred))
1246 goto out_failed;
1247
1248 vers = nfs4_ff_layout_ds_version(lseg, idx);
1249
1250 dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
1251 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
1252
1253 atomic_inc(&ds->ds_clp->cl_count);
1254 hdr->ds_clp = ds->ds_clp;
1255 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1256 if (fh)
1257 hdr->args.fh = fh;
1258
1259 /*
1260 * Note that if we ever decide to split across DSes,
1261 * then we may need to handle dense-like offsets.
1262 */
1263 hdr->args.offset = offset;
1264 hdr->mds_offset = offset;
1265
1266 /* Perform an asynchronous read to ds */
1267 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1268 vers == 3 ? &ff_layout_read_call_ops_v3 :
1269 &ff_layout_read_call_ops_v4,
1270 0, RPC_TASK_SOFTCONN);
1271
1272 return PNFS_ATTEMPTED;
1273
1274out_failed:
1275 if (ff_layout_has_available_ds(lseg))
1276 return PNFS_TRY_AGAIN;
1277 return PNFS_NOT_ATTEMPTED;
1278}
1279
1280/* Perform async writes. */
1281static enum pnfs_try_status
1282ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
1283{
1284 struct pnfs_layout_segment *lseg = hdr->lseg;
1285 struct nfs4_pnfs_ds *ds;
1286 struct rpc_clnt *ds_clnt;
1287 struct rpc_cred *ds_cred;
1288 loff_t offset = hdr->args.offset;
1289 int vers;
1290 struct nfs_fh *fh;
1291 int idx = hdr->pgio_mirror_idx;
1292
1293 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1294 if (!ds)
1295 return PNFS_NOT_ATTEMPTED;
1296
1297 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1298 hdr->inode);
1299 if (IS_ERR(ds_clnt))
1300 return PNFS_NOT_ATTEMPTED;
1301
1302 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1303 if (IS_ERR(ds_cred))
1304 return PNFS_NOT_ATTEMPTED;
1305
1306 vers = nfs4_ff_layout_ds_version(lseg, idx);
1307
1308 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
1309 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
1310 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
1311 vers);
1312
1313 hdr->pgio_done_cb = ff_layout_write_done_cb;
1314 atomic_inc(&ds->ds_clp->cl_count);
1315 hdr->ds_clp = ds->ds_clp;
1316 hdr->ds_commit_idx = idx;
1317 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1318 if (fh)
1319 hdr->args.fh = fh;
1320
1321 /*
1322 * Note that if we ever decide to split across DSes,
1323 * then we may need to handle dense-like offsets.
1324 */
1325 hdr->args.offset = offset;
1326
1327 /* Perform an asynchronous write */
1328 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1329 vers == 3 ? &ff_layout_write_call_ops_v3 :
1330 &ff_layout_write_call_ops_v4,
1331 sync, RPC_TASK_SOFTCONN);
1332 return PNFS_ATTEMPTED;
1333}
1334
1335static void
1336ff_layout_mark_request_commit(struct nfs_page *req,
1337 struct pnfs_layout_segment *lseg,
1338 struct nfs_commit_info *cinfo,
1339 u32 ds_commit_idx)
1340{
1341 struct list_head *list;
1342 struct pnfs_commit_bucket *buckets;
1343
1344 spin_lock(cinfo->lock);
1345 buckets = cinfo->ds->buckets;
1346 list = &buckets[ds_commit_idx].written;
1347 if (list_empty(list)) {
1348 /* Non-empty buckets hold a reference on the lseg. That ref
1349 * is normally transferred to the COMMIT call and released
1350 * there. It could also be released if the last req is pulled
1351 * off due to a rewrite, in which case it will be done in
1352 * pnfs_common_clear_request_commit
1353 */
1354 WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
1355 buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
1356 }
1357 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1358 cinfo->ds->nwritten++;
1359
1360 /* Open-coded nfs_request_add_commit_list(): we need to add req to
1361 * the list without dropping the cinfo lock.
1362 */
1363 set_bit(PG_CLEAN, &(req)->wb_flags);
1364 nfs_list_add_request(req, list);
1365 cinfo->mds->ncommit++;
1366 spin_unlock(cinfo->lock);
1367 if (!cinfo->dreq) {
1368 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1369 inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
1370 BDI_RECLAIMABLE);
1371 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1372 I_DIRTY_DATASYNC);
1373 }
1374}
1375
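/* Commit buckets map 1:1 onto mirrors, so the bucket index is the DS index */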
1376static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1377{
1378 return i;
1379}
1380
1381static struct nfs_fh *
1382select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1383{
1384 struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
1385
1386 /* FIXME: Assume that there is only one NFS version available
1387 * for the DS.
1388 */
1389 return &flseg->mirror_array[i]->fh_versions[0];
1390}
1391
1392static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
1393{
1394 struct pnfs_layout_segment *lseg = data->lseg;
1395 struct nfs4_pnfs_ds *ds;
1396 struct rpc_clnt *ds_clnt;
1397 struct rpc_cred *ds_cred;
1398 u32 idx;
1399 int vers;
1400 struct nfs_fh *fh;
1401
1402 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1403 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1404 if (!ds)
1405 goto out_err;
1406
1407 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1408 data->inode);
1409 if (IS_ERR(ds_clnt))
1410 goto out_err;
1411
1412 ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
1413 if (IS_ERR(ds_cred))
1414 goto out_err;
1415
1416 vers = nfs4_ff_layout_ds_version(lseg, idx);
1417
1418 dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
1419 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
1420 vers);
1421 data->commit_done_cb = ff_layout_commit_done_cb;
1422 data->cred = ds_cred;
1423 atomic_inc(&ds->ds_clp->cl_count);
1424 data->ds_clp = ds->ds_clp;
1425 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1426 if (fh)
1427 data->args.fh = fh;
1428 return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
1429 vers == 3 ? &ff_layout_commit_call_ops_v3 :
1430 &ff_layout_commit_call_ops_v4,
1431 how, RPC_TASK_SOFTCONN);
1432out_err:
1433 pnfs_generic_prepare_to_resend_writes(data);
1434 pnfs_generic_commit_release(data);
1435 return -EAGAIN;
1436}
1437
1438static int
1439ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1440 int how, struct nfs_commit_info *cinfo)
1441{
1442 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1443 ff_layout_initiate_commit);
1444}
1445
1446static struct pnfs_ds_commit_info *
1447ff_layout_get_ds_info(struct inode *inode)
1448{
1449 struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1450
1451 if (layout == NULL)
1452 return NULL;
1453
1454 return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
1455}
1456
1457static void
1458ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
1459{
1460 nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
1461 id_node));
1462}
1463
1464static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
1465 struct xdr_stream *xdr,
1466 const struct nfs4_layoutreturn_args *args)
1467{
1468 struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
1469 __be32 *start;
1470 int count = 0, ret = 0;
1471
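	/* Reserve the 4-byte error-count slot now; backfill it after encoding */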
1472 start = xdr_reserve_space(xdr, 4);
1473 if (unlikely(!start))
1474 return -E2BIG;
1475
1476 /* This assumes we always return _ALL_ layouts */
1477 spin_lock(&hdr->plh_inode->i_lock);
1478 ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
1479 spin_unlock(&hdr->plh_inode->i_lock);
1480
1481 *start = cpu_to_be32(count);
1482
1483 return ret;
1484}
1485
1486/* report nothing for now */
1487static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
1488 struct xdr_stream *xdr,
1489 const struct nfs4_layoutreturn_args *args)
1490{
1491 __be32 *p;
1492
1493 p = xdr_reserve_space(xdr, 4);
1494 if (likely(p))
1495 *p = cpu_to_be32(0);
1496}
1497
1498static struct nfs4_deviceid_node *
1499ff_layout_alloc_deviceid_node(struct nfs_server *server,
1500 struct pnfs_device *pdev, gfp_t gfp_flags)
1501{
1502 struct nfs4_ff_layout_ds *dsaddr;
1503
1504 dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
1505 if (!dsaddr)
1506 return NULL;
1507 return &dsaddr->id_node;
1508}
1509
1510static void
1511ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
1512 struct xdr_stream *xdr,
1513 const struct nfs4_layoutreturn_args *args)
1514{
1515 struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
1516 __be32 *start;
1517
1518 dprintk("%s: Begin\n", __func__);
1519 start = xdr_reserve_space(xdr, 4);
1520 BUG_ON(!start);
1521
1522 if (ff_layout_encode_ioerr(flo, xdr, args))
1523 goto out;
1524
1525 ff_layout_encode_iostats(flo, xdr, args);
1526out:
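	/*
	 * Backfill the opaque body length: (xdr->p - start - 1) is the
	 * number of 32-bit words encoded after the reserved length slot.
	 */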
1527 *start = cpu_to_be32((xdr->p - start - 1) * 4);
1528 dprintk("%s: Return\n", __func__);
1529}
1530
1531static struct pnfs_layoutdriver_type flexfilelayout_type = {
1532 .id = LAYOUT_FLEX_FILES,
1533 .name = "LAYOUT_FLEX_FILES",
1534 .owner = THIS_MODULE,
1535 .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
1536 .free_layout_hdr = ff_layout_free_layout_hdr,
1537 .alloc_lseg = ff_layout_alloc_lseg,
1538 .free_lseg = ff_layout_free_lseg,
1539 .pg_read_ops = &ff_layout_pg_read_ops,
1540 .pg_write_ops = &ff_layout_pg_write_ops,
1541 .get_ds_info = ff_layout_get_ds_info,
1542 .free_deviceid_node = ff_layout_free_deviceid_node,
1543 .mark_request_commit = ff_layout_mark_request_commit,
1544 .clear_request_commit = pnfs_generic_clear_request_commit,
1545 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1546 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1547 .commit_pagelist = ff_layout_commit_pagelist,
1548 .read_pagelist = ff_layout_read_pagelist,
1549 .write_pagelist = ff_layout_write_pagelist,
1550 .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
1551 .encode_layoutreturn = ff_layout_encode_layoutreturn,
1552};
1553
1554static int __init nfs4flexfilelayout_init(void)
1555{
1556 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
1557 __func__);
1558 return pnfs_register_layoutdriver(&flexfilelayout_type);
1559}
1560
1561static void __exit nfs4flexfilelayout_exit(void)
1562{
1563 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
1564 __func__);
1565 pnfs_unregister_layoutdriver(&flexfilelayout_type);
1566}
1567
1568MODULE_ALIAS("nfs-layouttype4-4");
1569
1570MODULE_LICENSE("GPL");
1571MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
1572
1573module_init(nfs4flexfilelayout_init);
1574module_exit(nfs4flexfilelayout_exit);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000000..070f20445b2d
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,155 @@
1/*
2 * NFSv4 flexfile layout driver data structures.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
10#define FS_NFS_NFS4FLEXFILELAYOUT_H
11
12#include "../pnfs.h"
13
14/* XXX: Filter out an insanely large mirror count for now, to avoid OOM
15 * due to network errors etc. */
16#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
17
18struct nfs4_ff_ds_version {
19 u32 version;
20 u32 minor_version;
21 u32 rsize;
22 u32 wsize;
23 bool tightly_coupled;
24};
25
26/* chained in global deviceid hlist */
27struct nfs4_ff_layout_ds {
28 struct nfs4_deviceid_node id_node;
29 u32 ds_versions_cnt;
30 struct nfs4_ff_ds_version *ds_versions;
31 struct nfs4_pnfs_ds *ds;
32};
33
34struct nfs4_ff_layout_ds_err {
35 struct list_head list; /* linked in mirror error_list */
36 u64 offset;
37 u64 length;
38 int status;
39 enum nfs_opnum4 opnum;
40 nfs4_stateid stateid;
41 struct nfs4_deviceid deviceid;
42};
43
44struct nfs4_ff_layout_mirror {
45 u32 ds_count;
46 u32 efficiency;
47 struct nfs4_ff_layout_ds *mirror_ds;
48 u32 fh_versions_cnt;
49 struct nfs_fh *fh_versions;
50 nfs4_stateid stateid;
51 struct nfs4_string user_name;
52 struct nfs4_string group_name;
53 u32 uid;
54 u32 gid;
55 struct rpc_cred *cred;
56 spinlock_t lock;
57};
58
59struct nfs4_ff_layout_segment {
60 struct pnfs_layout_segment generic_hdr;
61 u64 stripe_unit;
62 u32 mirror_array_cnt;
63 struct nfs4_ff_layout_mirror **mirror_array;
64};
65
66struct nfs4_flexfile_layout {
67 struct pnfs_layout_hdr generic_hdr;
68 struct pnfs_ds_commit_info commit_info;
69 struct list_head error_list; /* nfs4_ff_layout_ds_err */
70};
71
72static inline struct nfs4_flexfile_layout *
73FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
74{
75 return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
76}
77
78static inline struct nfs4_ff_layout_segment *
79FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_ff_layout_segment,
83 generic_hdr);
84}
85
86static inline struct nfs4_deviceid_node *
87FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
88{
89 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
90 FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
91 FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
92 return NULL;
93 return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
94}
95
96static inline struct nfs4_ff_layout_ds *
97FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
98{
99 return container_of(node, struct nfs4_ff_layout_ds, id_node);
100}
101
102static inline struct nfs4_ff_layout_mirror *
103FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
104{
105 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
106 return NULL;
107 return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
108}
109
110static inline u32
111FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
112{
113 return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
114}
115
116static inline bool
117ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
118{
119 return nfs4_test_deviceid_unavailable(node);
120}
121
122static inline int
123nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
124{
125 return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
126}
127
128struct nfs4_ff_layout_ds *
129nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
130 gfp_t gfp_flags);
131void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
132void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
133int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
134 struct nfs4_ff_layout_mirror *mirror, u64 offset,
135 u64 length, int status, enum nfs_opnum4 opnum,
136 gfp_t gfp_flags);
137int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
138 struct xdr_stream *xdr, int *count,
139 const struct pnfs_layout_range *range);
140struct nfs_fh *
141nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
142
143struct nfs4_pnfs_ds *
144nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
145 bool fail_return);
146
147struct rpc_clnt *
148nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
149 u32 ds_idx,
150 struct nfs_client *ds_clp,
151 struct inode *inode);
152struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
153 u32 ds_idx, struct rpc_cred *mdscred);
154bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
155#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000000..e2c01f204a95
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,552 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/vmalloc.h>
11#include <linux/module.h>
12#include <linux/sunrpc/addr.h>
13
14#include "../internal.h"
15#include "../nfs4session.h"
16#include "flexfilelayout.h"
17
18#define NFSDBG_FACILITY NFSDBG_PNFS_LD
19
20static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
21static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
22
23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
24{
25 if (mirror_ds)
26 nfs4_put_deviceid_node(&mirror_ds->id_node);
27}
28
29void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
30{
31 nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
32 nfs4_pnfs_ds_put(mirror_ds->ds);
33 kfree(mirror_ds);
34}
35
36/* Decode opaque device data and construct new_ds using it */
37struct nfs4_ff_layout_ds *
38nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
39 gfp_t gfp_flags)
40{
41 struct xdr_stream stream;
42 struct xdr_buf buf;
43 struct page *scratch;
44 struct list_head dsaddrs;
45 struct nfs4_pnfs_ds_addr *da;
46 struct nfs4_ff_layout_ds *new_ds = NULL;
47 struct nfs4_ff_ds_version *ds_versions = NULL;
48 u32 mp_count;
49 u32 version_count;
50 __be32 *p;
51 int i, ret = -ENOMEM;
52
53 /* set up xdr stream */
54 scratch = alloc_page(gfp_flags);
55 if (!scratch)
56 goto out_err;
57
58 new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
59 if (!new_ds)
60 goto out_scratch;
61
62 nfs4_init_deviceid_node(&new_ds->id_node,
63 server,
64 &pdev->dev_id);
65 INIT_LIST_HEAD(&dsaddrs);
66
67 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
68 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
69
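	/*
	 * The device address blob starts with a multipath address list,
	 * followed by an array of the NFS versions the DS supports.
	 */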
70 /* multipath count */
71 p = xdr_inline_decode(&stream, 4);
72 if (unlikely(!p))
73 goto out_err_drain_dsaddrs;
74 mp_count = be32_to_cpup(p);
75 dprintk("%s: multipath ds count %d\n", __func__, mp_count);
76
77 for (i = 0; i < mp_count; i++) {
78 /* multipath ds */
79 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
80 &stream, gfp_flags);
81 if (da)
82 list_add_tail(&da->da_node, &dsaddrs);
83 }
84 if (list_empty(&dsaddrs)) {
85 dprintk("%s: no suitable DS addresses found\n",
86 __func__);
87 ret = -ENOMEDIUM;
88 goto out_err_drain_dsaddrs;
89 }
90
91 /* version count */
92 p = xdr_inline_decode(&stream, 4);
93 if (unlikely(!p))
94 goto out_err_drain_dsaddrs;
95 version_count = be32_to_cpup(p);
96 dprintk("%s: version count %d\n", __func__, version_count);
97
98 ds_versions = kcalloc(version_count, sizeof(struct nfs4_ff_ds_version),
99 gfp_flags);
100 if (!ds_versions)
101 goto out_err_drain_dsaddrs;
102
103 for (i = 0; i < version_count; i++) {
104 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
105 * tightly_coupled(4) */
106 p = xdr_inline_decode(&stream, 20);
107 if (unlikely(!p))
108 goto out_err_drain_dsaddrs;
109 ds_versions[i].version = be32_to_cpup(p++);
110 ds_versions[i].minor_version = be32_to_cpup(p++);
111 ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
112 ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
113 ds_versions[i].tightly_coupled = be32_to_cpup(p);
114
115 if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
116 ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
117 if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
118 ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
119
120 if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
121 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
122 i, ds_versions[i].version,
123 ds_versions[i].minor_version);
124 ret = -EPROTONOSUPPORT;
125 goto out_err_drain_dsaddrs;
126 }
127
128 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
129 __func__, i, ds_versions[i].version,
130 ds_versions[i].minor_version,
131 ds_versions[i].rsize,
132 ds_versions[i].wsize,
133 ds_versions[i].tightly_coupled);
134 }
135
136 new_ds->ds_versions = ds_versions;
137 new_ds->ds_versions_cnt = version_count;
138
139 new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
140 if (!new_ds->ds)
141 goto out_err_drain_dsaddrs;
142
143 /* If DS was already in cache, free ds addrs */
144 while (!list_empty(&dsaddrs)) {
145 da = list_first_entry(&dsaddrs,
146 struct nfs4_pnfs_ds_addr,
147 da_node);
148 list_del_init(&da->da_node);
149 kfree(da->da_remotestr);
150 kfree(da);
151 }
152
153 __free_page(scratch);
154 return new_ds;
155
156out_err_drain_dsaddrs:
157 while (!list_empty(&dsaddrs)) {
158 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
159 da_node);
160 list_del_init(&da->da_node);
161 kfree(da->da_remotestr);
162 kfree(da);
163 }
164
165 kfree(ds_versions);
166out_scratch:
167 __free_page(scratch);
168out_err:
169 kfree(new_ds);
170
171 dprintk("%s ERROR: returning %d\n", __func__, ret);
172 return NULL;
173}
174
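For reference, the opaque body decoded above is the flexfiles ff_device_addr4: a counted array of netaddr4 entries (netid/address string pairs, per RFC 8435), then a counted array of (version, minor, rsize, wsize, tightly_coupled) tuples. Below is a minimal userspace sketch of that wire format; the helper names are hypothetical and the address entries are merely skipped rather than parsed into sockaddrs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl() */

struct cursor { const uint8_t *p, *end; };

static int get_u32(struct cursor *c, uint32_t *v)
{
	if (c->end - c->p < 4)
		return -1;
	memcpy(v, c->p, 4);
	*v = ntohl(*v);
	c->p += 4;
	return 0;
}

/* XDR opaque/string: length word, payload, pad to a 4-byte boundary */
static int skip_opaque(struct cursor *c)
{
	uint32_t len;

	if (get_u32(c, &len))
		return -1;
	len = (len + 3) & ~3u;
	if ((size_t)(c->end - c->p) < len)
		return -1;
	c->p += len;
	return 0;
}

static int decode_ff_device(const uint8_t *buf, size_t buflen)
{
	struct cursor c = { buf, buf + buflen };
	uint32_t naddrs, nvers, i;

	if (get_u32(&c, &naddrs))
		return -1;
	for (i = 0; i < naddrs; i++)	/* netid string + universal address string */
		if (skip_opaque(&c) || skip_opaque(&c))
			return -1;
	if (get_u32(&c, &nvers))
		return -1;
	for (i = 0; i < nvers; i++) {
		uint32_t vers, minor, rsize, wsize, coupled;

		if (get_u32(&c, &vers) || get_u32(&c, &minor) ||
		    get_u32(&c, &rsize) || get_u32(&c, &wsize) ||
		    get_u32(&c, &coupled))
			return -1;
		printf("v%u.%u rsize=%u wsize=%u coupled=%u\n",
		       vers, minor, rsize, wsize, coupled);
	}
	return 0;
}

int main(void)
{
	/* one address entry ("tcp", "10.0.0.1.8.1"), one v3.0 version tuple */
	static const uint8_t buf[] = {
		0,0,0,1,			/* naddrs = 1 */
		0,0,0,3, 't','c','p',0,		/* netid "tcp" + pad */
		0,0,0,12, '1','0','.','0','.','0','.','1','.','8','.','1',
		0,0,0,1,			/* nvers = 1 */
		0,0,0,3, 0,0,0,0,		/* version 3.0 */
		0,0,1,0, 0,0,1,0,		/* rsize = wsize = 256 */
		0,0,0,0,			/* tightly_coupled = false */
	};
	return decode_ff_device(buf, sizeof(buf)) ? 1 : 0;
}
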
static u64
end_offset(u64 start, u64 len)
{
	u64 end;

	end = start + len;
	return end >= start ? end : NFS4_MAX_UINT64;
}

static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
			    u64 offset, u64 length)
{
	u64 end;

	end = max_t(u64, end_offset(err->offset, err->length),
		    end_offset(offset, length));
	err->offset = min_t(u64, err->offset, offset);
	err->length = end - err->offset;
}

static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
			       u64 length, int status, enum nfs_opnum4 opnum,
			       nfs4_stateid *stateid,
			       struct nfs4_deviceid *deviceid)
{
	return err->status == status && err->opnum == opnum &&
	       nfs4_stateid_match(&err->stateid, stateid) &&
	       !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
	       end_offset(err->offset, err->length) >= offset &&
	       err->offset <= end_offset(offset, length);
}

static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
			   struct nfs4_ff_layout_ds_err *new)
{
	if (!ds_error_can_merge(old, new->offset, new->length, new->status,
				new->opnum, &new->stateid, &new->deviceid))
		return false;

	extend_ds_error(old, new->offset, new->length);
	return true;
}

static bool
ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
			      struct nfs4_ff_layout_ds_err *dserr)
{
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (merge_ds_error(err, dserr)) {
			return true;
		}
	}

	list_add(&dserr->list, &flo->error_list);
	return false;
}

static bool
ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
			  u64 length, int status, enum nfs_opnum4 opnum,
			  nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
{
	bool found = false;
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (ds_error_can_merge(err, offset, length, status, opnum,
				       stateid, deviceid)) {
			found = true;
			extend_ds_error(err, offset, length);
			break;
		}
	}

	return found;
}

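The helpers above fold overlapping or touching error ranges into one record, and end_offset() clamps a sum that would wrap past 2^64 to NFS4_MAX_UINT64. A standalone illustration of those two rules (hypothetical test code, not part of the driver):

#include <assert.h>
#include <stdint.h>

static uint64_t end_off(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;

	/* a wrapped sum means the range effectively runs to "infinity" */
	return end >= start ? end : UINT64_MAX;
}

int main(void)
{
	/* touching ranges merge: [0,10) and [10,5) extend to cover [0,15) */
	uint64_t off = 0, len = 10;
	uint64_t end = end_off(10, 5) > end_off(off, len) ?
			end_off(10, 5) : end_off(off, len);
	assert(end - off == 15);

	/* a length that would overflow clamps to UINT64_MAX */
	assert(end_off(UINT64_MAX - 4, 100) == UINT64_MAX);
	return 0;
}
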
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
			     u64 length, int status, enum nfs_opnum4 opnum,
			     gfp_t gfp_flags)
{
	struct nfs4_ff_layout_ds_err *dserr;
	bool needfree;

	if (status == 0)
		return 0;

	if (mirror->mirror_ds == NULL)
		return -EINVAL;

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
				      &mirror->stateid,
				      &mirror->mirror_ds->id_node.deviceid)) {
		spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
		return 0;
	}
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	dserr = kmalloc(sizeof(*dserr), gfp_flags);
	if (!dserr)
		return -ENOMEM;

	INIT_LIST_HEAD(&dserr->list);
	dserr->offset = offset;
	dserr->length = length;
	dserr->status = status;
	dserr->opnum = opnum;
	nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
	memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
	       NFS4_DEVICEID4_SIZE);

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	needfree = ff_layout_add_ds_error_locked(flo, dserr);
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	if (needfree)
		kfree(dserr);

	return 0;
}

/* currently we only support AUTH_NONE and AUTH_SYS */
static rpc_authflavor_t
nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
{
	if (mirror->uid == (u32)-1)
		return RPC_AUTH_NULL;
	return RPC_AUTH_UNIX;
}

/* fetch cred for NFSv3 DS */
static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
					struct nfs4_pnfs_ds *ds)
{
	if (ds->ds_clp && !mirror->cred &&
	    mirror->mirror_ds->ds_versions[0].version == 3) {
		struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
		struct rpc_cred *cred;
		struct auth_cred acred = {
			.uid = make_kuid(&init_user_ns, mirror->uid),
			.gid = make_kgid(&init_user_ns, mirror->gid),
		};

		/* AUTH_NULL ignores acred */
		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
		if (IS_ERR(cred)) {
			dprintk("%s: lookup_cred failed with %ld\n",
				__func__, PTR_ERR(cred));
			return PTR_ERR(cred);
		} else {
			mirror->cred = cred;
		}
	}
	return 0;
}

struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
	struct nfs_fh *fh = NULL;
	struct nfs4_deviceid_node *devid;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
		       __func__, mirror_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	/* FIXME: For now assume there is only 1 version available for the DS */
	fh = &mirror->fh_versions[0];
out:
	return fh;
}

/* Upon return, either ds is connected, or ds is NULL */
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
			  bool fail_return)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct nfs4_pnfs_ds *ds = NULL;
	struct nfs4_deviceid_node *devid;
	struct inode *ino = lseg->pls_layout->plh_inode;
	struct nfs_server *s = NFS_SERVER(ino);
	unsigned int max_payload;
	rpc_authflavor_t flavor;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
		       __func__, ds_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	devid = &mirror->mirror_ds->id_node;
	if (ff_layout_test_devid_unavailable(devid))
		goto out;

	ds = mirror->mirror_ds->ds;
	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
	smp_rmb();
	if (ds->ds_clp)
		goto out;

	flavor = nfs4_ff_layout_choose_authflavor(mirror);

	/* FIXME: For now we assume the server sent only one version of NFS
	 * to use for the DS.
	 */
	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
			     dataserver_retrans,
			     mirror->mirror_ds->ds_versions[0].version,
			     mirror->mirror_ds->ds_versions[0].minor_version,
			     flavor);

	/* connect success, check rsize/wsize limit */
	if (ds->ds_clp) {
		max_payload =
			nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
				       NULL);
		if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
			mirror->mirror_ds->ds_versions[0].rsize = max_payload;
		if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
	} else {
		ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
					 mirror, lseg->pls_range.offset,
					 lseg->pls_range.length, NFS4ERR_NXIO,
					 OP_ILLEGAL, GFP_NOIO);
		if (fail_return) {
			pnfs_error_mark_layout_for_return(ino, lseg);
			if (ff_layout_has_available_ds(lseg))
				pnfs_set_retry_layoutget(lseg->pls_layout);
			else
				pnfs_clear_retry_layoutget(lseg->pls_layout);

		} else {
			if (ff_layout_has_available_ds(lseg))
				set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
					&lseg->pls_layout->plh_flags);
			else {
				pnfs_error_mark_layout_for_return(ino, lseg);
				pnfs_clear_retry_layoutget(lseg->pls_layout);
			}
		}
	}

	if (ff_layout_update_mirror_cred(mirror, ds))
		ds = NULL;
out:
	return ds;
}

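The unlocked ds->ds_clp test above is only safe because the connect path publishes the client pointer with smp_wmb() after fully initialising it, and the smp_rmb() here pairs with that. A self-contained C11 rendering of the same publish/consume pattern, with release/acquire standing in for the kernel barriers (illustrative names, not the actual connect code):

#include <stdatomic.h>
#include <stddef.h>

struct payload { int ready_data; };

struct ds {
	struct payload data;		/* initialised before publish */
	_Atomic(struct payload *) clp;	/* NULL until fully set up */
};

static void publish(struct ds *ds, int v)
{
	ds->data.ready_data = v;
	/* release: all writes above are visible before clp reads non-NULL */
	atomic_store_explicit(&ds->clp, &ds->data, memory_order_release);
}

static int consume(struct ds *ds, int *out)
{
	struct payload *p =
		atomic_load_explicit(&ds->clp, memory_order_acquire);

	if (!p)
		return 0;	/* not connected yet; caller must (re)connect */
	*out = p->ready_data;	/* safe: acquire pairs with the release above */
	return 1;
}
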
struct rpc_cred *
ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
		      struct rpc_cred *mdscred)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct rpc_cred *cred = ERR_PTR(-EINVAL);

	if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
		goto out;

	if (mirror && mirror->cred)
		cred = mirror->cred;
	else
		cred = mdscred;
out:
	return cred;
}

/*
 * Find or create a DS rpc client with the MDS server rpc client auth flavor
 * in the nfs_client cl_ds_clients list.
 */
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
				 struct nfs_client *ds_clp, struct inode *inode)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);

	switch (mirror->mirror_ds->ds_versions[0].version) {
	case 3:
		/* For NFSv3 DS, flavor is set when creating DS connections */
		return ds_clp->cl_rpcclient;
	case 4:
		return nfs4_find_or_create_ds_client(ds_clp, inode);
	default:
		BUG();
	}
}

static bool is_range_intersecting(u64 offset1, u64 length1,
				  u64 offset2, u64 length2)
{
	u64 end1 = end_offset(offset1, length1);
	u64 end2 = end_offset(offset2, length2);

	return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
	       (end2 == NFS4_MAX_UINT64 || end2 > offset1);
}

/* called with inode i_lock held */
int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
			      struct xdr_stream *xdr, int *count,
			      const struct pnfs_layout_range *range)
{
	struct nfs4_ff_layout_ds_err *err, *n;
	__be32 *p;

	list_for_each_entry_safe(err, n, &flo->error_list, list) {
		if (!is_range_intersecting(err->offset, err->length,
					   range->offset, range->length))
			continue;
		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
		 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
		 */
		p = xdr_reserve_space(xdr,
				24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
		if (unlikely(!p))
			return -ENOBUFS;
		p = xdr_encode_hyper(p, err->offset);
		p = xdr_encode_hyper(p, err->length);
		p = xdr_encode_opaque_fixed(p, &err->stateid,
					    NFS4_STATEID_SIZE);
		p = xdr_encode_opaque_fixed(p, &err->deviceid,
					    NFS4_DEVICEID4_SIZE);
		*p++ = cpu_to_be32(err->status);
		*p++ = cpu_to_be32(err->opnum);
		*count += 1;
		list_del(&err->list);
		dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
			__func__, err->offset, err->length, err->status,
			err->opnum, *count);
		kfree(err);
	}

	return 0;
}

bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_deviceid_node *devid;
	int idx;

	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
		mirror = FF_LAYOUT_COMP(lseg, idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			if (!ff_layout_test_devid_unavailable(devid))
				return true;
		}
	}

	return false;
}

module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
		 "retries a request before it attempts further "
		 "recovery action.");
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
		 "NFSv4.1 client waits for a response from a "
		 "data server before it retries an NFS request.");
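Both knobs are declared with mode 0644, so they are tunable without a rebuild. Assuming the driver is built as the nfs_layout_flexfiles module (inferred from the flexfiles directory, so treat the name as an assumption), they can be set at load time, e.g. modprobe nfs_layout_flexfiles dataserver_timeo=600 dataserver_retrans=5, or adjusted afterwards under /sys/module/nfs_layout_flexfiles/parameters/.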
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2f5db844c172..857e2a99acc8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f
 	nfs_fattr_free_group_name(fattr);
 }
 
-static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
 {
 	unsigned long val;
 	char buf[16];
@@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
 	*res = val;
 	return 1;
 }
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
 
 static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2211f6ba8736..e4f0dcef8f54 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -388,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
 		inode->i_data.a_ops = &nfs_file_aops;
-		inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 		inode->i_fop = &nfs_dir_operations;
@@ -507,10 +506,15 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 
 	if (attr->ia_valid & ATTR_SIZE) {
+		loff_t i_size;
+
 		BUG_ON(!S_ISREG(inode->i_mode));
 
-		if (attr->ia_size == i_size_read(inode))
+		i_size = i_size_read(inode);
+		if (attr->ia_size == i_size)
 			attr->ia_valid &= ~ATTR_SIZE;
+		else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
+			return -ETXTBSY;
 	}
 
 	/* Optimization: if the end result is no change, don't RPC */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b6f34bfa6fe8..212b8c883d22 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -6,6 +6,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/crc32.h>
+#include <linux/nfs_page.h>
 
 #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
 
@@ -187,9 +188,15 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 			const struct sockaddr *ds_addr,
 			int ds_addrlen, int ds_proto,
 			unsigned int ds_timeo,
-			unsigned int ds_retrans);
+			unsigned int ds_retrans,
+			u32 minor_version,
+			rpc_authflavor_t au_flavor);
 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
 			struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+			const struct sockaddr *ds_addr, int ds_addrlen,
+			int ds_proto, unsigned int ds_timeo,
+			unsigned int ds_retrans, rpc_authflavor_t au_flavor);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -242,9 +249,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
 void nfs_pgio_header_free(struct nfs_pgio_header *);
 void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
-		      const struct rpc_call_ops *, int, int);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct rpc_call_ops *call_ops, int how, int flags);
 void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
 
 static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 {
@@ -252,6 +262,12 @@ static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 	atomic_set(&c->io_count, 0);
 }
 
+static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
+{
+	WARN_ON_ONCE(desc->pg_mirror_count < 1);
+	return desc->pg_mirror_count > 1;
+}
+
 /* nfs2xdr.c */
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -375,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct super_block *sb);
+extern bool nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
@@ -414,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *);
 int nfs_show_devname(struct seq_file *, struct dentry *);
 int nfs_show_path(struct seq_file *, struct dentry *);
 int nfs_show_stats(struct seq_file *, struct dentry *);
-void nfs_put_super(struct super_block *);
 int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
 /* write.c */
@@ -427,6 +442,7 @@ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_initiate_commit(struct rpc_clnt *clnt,
 			       struct nfs_commit_data *data,
+			       const struct nfs_rpc_ops *nfs_ops,
 			       const struct rpc_call_ops *call_ops,
 			       int how, int flags);
 extern void nfs_init_commit(struct nfs_commit_data *data,
@@ -440,13 +456,15 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
 		    struct nfs_commit_info *cinfo);
 void nfs_mark_request_commit(struct nfs_page *req,
 			     struct pnfs_layout_segment *lseg,
-			     struct nfs_commit_info *cinfo);
+			     struct nfs_commit_info *cinfo,
+			     u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
-		      struct nfs_commit_info *cinfo);
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx);
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 				 struct nfs_commit_info *cinfo);
@@ -457,6 +475,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
 		    struct nfs_direct_req *dreq);
 int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
 bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
 
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
@@ -480,6 +499,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 	inode_dio_wait(inode);
 }
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -493,6 +513,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
 				  struct nfs_client **result,
 				  struct rpc_cred *cred);
 
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+	inode = igrab(inode);
+	if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+	if (inode != NULL) {
+		struct super_block *sb = inode->i_sb;
+
+		iput(inode);
+		nfs_sb_deactive(sb);
+	}
+}
+
 /*
  * Determine the device name as a string
  */
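nfs_igrab_and_active() and nfs_iput_and_deactive() bundle an inode reference with a superblock active count, so an asynchronous operation can outlive the last user-visible reference without racing an unmount. A sketch of the intended pairing (hypothetical caller and struct; the delegreturn path later in this series is the real user):

struct my_work {
	struct inode *inode;	/* pinned inode, or NULL */
};

static int my_work_start(struct my_work *w, struct inode *inode)
{
	w->inode = nfs_igrab_and_active(inode);
	if (w->inode == NULL)
		return -EAGAIN;	/* superblock is already being torn down */
	/* w->inode may now be dereferenced safely from async context */
	return 0;
}

static void my_work_finish(struct my_work *w)
{
	/* drops the inode first, then the superblock active reference */
	nfs_iput_and_deactive(w->inode);
}
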
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5f61b83f4a1c..b4e03ed8599d 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -481,7 +481,8 @@ out_overflow:
  *		void;
  *	};
  */
-static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+			   __u32 *op_status)
 {
 	enum nfs_stat status;
 	int error;
@@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	if (op_status)
+		*op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result);
@@ -808,7 +811,7 @@ out_default:
 static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 struct nfs_fattr *result)
 {
-	return decode_attrstat(xdr, result);
+	return decode_attrstat(xdr, result, NULL);
 }
 
 static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result->fattr);
@@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
 {
 	/* All NFSv2 writes are "file sync" writes */
 	result->verf->committed = NFS_FILE_SYNC;
-	return decode_attrstat(xdr, result->fattr);
+	return decode_attrstat(xdr, result->fattr, &result->op_status);
 }
 
 /**
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 333ae4068506..e134d6548ab7 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver
 struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
 				     struct nfs_fattr *, rpc_authflavor_t);
 
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
 
 #endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 8c1b437c5403..9e9fa347a948 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,5 +1,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
 #include "internal.h"
 #include "nfs3_fs.h"
 
@@ -64,3 +65,43 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
 	nfs_init_server_aclclient(server);
 	return server;
 }
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		rpc_authflavor_t au_flavor)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.nfs_mod = &nfs_v3,
+		.proto = ds_proto,
+		.net = mds_clp->cl_net,
+	};
+	struct rpc_timeout ds_timeout;
+	struct nfs_client *clp;
+	char buf[INET6_ADDRSTRLEN + 1];
+
+	/* fake a hostname because lockd wants it */
+	if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = buf;
+
+	/* Use the MDS nfs_client cl_ipaddr. */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     au_flavor);
+
+	return clp;
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 524f9f837408..78e557c3ab87 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -800,6 +800,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 
@@ -825,6 +828,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
@@ -845,6 +851,9 @@ static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commi
 
 static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
+	if (data->commit_done_cb != NULL)
+		return data->commit_done_cb(task, data);
+
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
 	nfs_refresh_inode(data->inode, data->res.fattr);
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 6af29c2da352..5c4394e4656b 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -7,7 +7,7 @@
 #include "nfs3_fs.h"
 #include "nfs.h"
 
-static struct nfs_subversion nfs_v3 = {
+struct nfs_subversion nfs_v3 = {
 	.owner = THIS_MODULE,
 	.nfs_fs = &nfs_fs_type,
 	.rpc_vers = &nfs_version3,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 8f4cbe7f4aa8..2a932fdc57cb 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_post_op_attr(xdr, result->fattr);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
 	error = decode_read3resok(xdr, result);
@@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_wcc_data(xdr, result->fattr);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
 	error = decode_write3resok(xdr, result);
@@ -2323,6 +2325,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
 	error = decode_wcc_data(xdr, result->fattr);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
 	error = decode_writeverf3(xdr, &result->verf->verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a08178764cf9..fdef424b0cd3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
 #define NFS4_RENEW_TIMEOUT		0x01
 #define NFS4_RENEW_DELEGATION_CB	0x02
 
+struct nfs_seqid_counter;
 struct nfs4_minor_version_ops {
 	u32	minor_version;
 	unsigned init_caps;
@@ -56,6 +57,8 @@ struct nfs4_minor_version_ops {
 			struct nfs_fsinfo *);
 	void	(*free_lock_state)(struct nfs_server *,
 			struct nfs4_lock_state *);
+	struct nfs_seqid *
+		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 	const struct rpc_call_ops *call_sync_ops;
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -443,6 +446,12 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_release_seqid(struct nfs_seqid *seqid);
 extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+				struct nfs4_sequence_args *args,
+				struct nfs4_sequence_res *res,
+				struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+			      struct nfs4_sequence_res *res);
 
 extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
 
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 706ad10b8186..8646af9b11d2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -849,14 +849,15 @@ error:
  */
 struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 		const struct sockaddr *ds_addr, int ds_addrlen,
-		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		u32 minor_version, rpc_authflavor_t au_flavor)
 {
 	struct nfs_client_initdata cl_init = {
 		.addr = ds_addr,
 		.addrlen = ds_addrlen,
 		.nfs_mod = &nfs_v4,
 		.proto = ds_proto,
-		.minorversion = mds_clp->cl_minorversion,
+		.minorversion = minor_version,
 		.net = mds_clp->cl_net,
 	};
 	struct rpc_timeout ds_timeout;
@@ -874,7 +875,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	 */
 	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
 	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-			     mds_clp->cl_rpcclient->cl_auth->au_flavor);
+			     au_flavor);
 
 	dprintk("<-- %s %p\n", __func__, clp);
 	return clp;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c347705b0161..2e7c9f7a6f7c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -495,12 +495,11 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
 	args->sa_privileged = 1;
 }
 
-static int nfs40_setup_sequence(const struct nfs_server *server,
+int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
 				struct rpc_task *task)
 {
-	struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
 	struct nfs4_slot *slot;
 
 	/* slot already allocated? */
@@ -535,6 +534,7 @@ out_sleep:
 	spin_unlock(&tbl->slot_tbl_lock);
 	return -EAGAIN;
 }
+EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
 
 static int nfs40_sequence_done(struct rpc_task *task,
 			       struct nfs4_sequence_res *res)
@@ -694,8 +694,7 @@ out_retry:
 }
 EXPORT_SYMBOL_GPL(nfs41_sequence_done);
 
-static int nfs4_sequence_done(struct rpc_task *task,
-			      struct nfs4_sequence_res *res)
+int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
 	if (res->sr_slot == NULL)
 		return 1;
@@ -703,6 +702,7 @@ static int nfs4_sequence_done(struct rpc_task *task,
 		return nfs40_sequence_done(task, res);
 	return nfs41_sequence_done(task, res);
 }
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
 
 int nfs41_setup_sequence(struct nfs4_session *session,
 			 struct nfs4_sequence_args *args,
@@ -777,7 +777,8 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
 	int ret = 0;
 
 	if (!session)
-		return nfs40_setup_sequence(server, args, res, task);
+		return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+					    args, res, task);
 
 	dprintk("--> %s clp %p session %p sr_slot %u\n",
 		__func__, session->clp, session, res->sr_slot ?
@@ -818,14 +819,16 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
 				struct nfs4_sequence_res *res,
 				struct rpc_task *task)
 {
-	return nfs40_setup_sequence(server, args, res, task);
+	return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+				    args, res, task);
 }
 
-static int nfs4_sequence_done(struct rpc_task *task,
+int nfs4_sequence_done(struct rpc_task *task,
 			      struct nfs4_sequence_res *res)
 {
 	return nfs40_sequence_done(task, res);
 }
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
 
 #endif /* !CONFIG_NFS_V4_1 */
 
@@ -937,6 +940,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
 	return true;
 }
 
+static u32
+nfs4_map_atomic_open_share(struct nfs_server *server,
+		fmode_t fmode, int openflags)
+{
+	u32 res = 0;
+
+	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+	case FMODE_READ:
+		res = NFS4_SHARE_ACCESS_READ;
+		break;
+	case FMODE_WRITE:
+		res = NFS4_SHARE_ACCESS_WRITE;
+		break;
+	case FMODE_READ|FMODE_WRITE:
+		res = NFS4_SHARE_ACCESS_BOTH;
+	}
+	if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+		goto out;
+	/* Want no delegation if we're using O_DIRECT */
+	if (openflags & O_DIRECT)
+		res |= NFS4_SHARE_WANT_NO_DELEG;
+out:
+	return res;
+}
+
 static enum open_claim_type4
 nfs4_map_atomic_open_claim(struct nfs_server *server,
 			   enum open_claim_type4 claim)
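The new helper maps the VFS fmode to OPEN share_access bits and, for servers that advertise atomic-open v1 support, adds a no-delegation hint for O_DIRECT opens. A standalone check of that mapping; the constant values below are assumed from RFC 5661 (READ=1, WRITE=2, BOTH=3, WANT_NO_DELEG=0x0400) and are not taken from this patch:

#define _GNU_SOURCE	/* for O_DIRECT */
#include <assert.h>
#include <fcntl.h>
#include <stdint.h>

#define SHARE_ACCESS_READ	0x0001
#define SHARE_ACCESS_WRITE	0x0002
#define SHARE_ACCESS_BOTH	0x0003
#define SHARE_WANT_NO_DELEG	0x0400

#define FMODE_READ	0x1u
#define FMODE_WRITE	0x2u

static uint32_t map_share(unsigned fmode, int openflags, int atomic_open_v1)
{
	uint32_t res = 0;

	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
		res = SHARE_ACCESS_READ;
		break;
	case FMODE_WRITE:
		res = SHARE_ACCESS_WRITE;
		break;
	case FMODE_READ | FMODE_WRITE:
		res = SHARE_ACCESS_BOTH;
	}
	if (atomic_open_v1 && (openflags & O_DIRECT))
		res |= SHARE_WANT_NO_DELEG;
	return res;
}

int main(void)
{
	assert(map_share(FMODE_READ, 0, 1) == SHARE_ACCESS_READ);
	assert(map_share(FMODE_READ | FMODE_WRITE, O_DIRECT, 1) ==
	       (SHARE_ACCESS_BOTH | SHARE_WANT_NO_DELEG));
	/* without atomic-open v1 the hint is never added */
	assert(map_share(FMODE_WRITE, O_DIRECT, 0) == SHARE_ACCESS_WRITE);
	return 0;
}
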
@@ -977,6 +1005,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	struct dentry *parent = dget_parent(dentry);
 	struct inode *dir = parent->d_inode;
 	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 	struct nfs4_opendata *p;
 
 	p = kzalloc(sizeof(*p), gfp_mask);
@@ -987,8 +1016,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	if (IS_ERR(p->f_label))
 		goto err_free_p;
 
-	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
-	if (p->o_arg.seqid == NULL)
+	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+	p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
+	if (IS_ERR(p->o_arg.seqid))
 		goto err_free_label;
 	nfs_sb_active(dentry->d_sb);
 	p->dentry = dget(dentry);
@@ -997,6 +1027,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	atomic_inc(&sp->so_count);
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+	p->o_arg.share_access = nfs4_map_atomic_open_share(server,
+							fmode, flags);
 	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
 	 * will return permission denied for all bits until close */
 	if (!(flags & O_EXCL)) {
@@ -1167,6 +1199,16 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state,
 	return false;
 }
 
+static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
+{
+	if (state->n_wronly)
+		set_bit(NFS_O_WRONLY_STATE, &state->flags);
+	if (state->n_rdonly)
+		set_bit(NFS_O_RDONLY_STATE, &state->flags);
+	if (state->n_rdwr)
+		set_bit(NFS_O_RDWR_STATE, &state->flags);
+}
+
 static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 		nfs4_stateid *stateid, fmode_t fmode)
 {
@@ -1185,8 +1227,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 	}
 	if (stateid == NULL)
 		return;
-	if (!nfs_need_update_open_stateid(state, stateid))
+	/* Handle races with OPEN */
+	if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
+	    !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+		nfs_resync_open_stateid_locked(state);
 		return;
+	}
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		nfs4_stateid_copy(&state->stateid, stateid);
 	nfs4_stateid_copy(&state->open_stateid, stateid);
@@ -1281,6 +1327,23 @@ no_delegation:
 	return ret;
 }
 
+static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
+		const nfs4_stateid *stateid)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool ret = false;
+
+	spin_lock(&state->state_lock);
+	if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
+		goto out_noupdate;
+	if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
+		goto out_noupdate;
+	nfs4_stateid_copy(&lsp->ls_stateid, stateid);
+	ret = true;
+out_noupdate:
+	spin_unlock(&state->state_lock);
+	return ret;
+}
 
 static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
 {
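nfs4_update_lock_stateid() only accepts a stateid whose seqid is strictly newer, so replies that race with one another cannot roll the lock stateid backwards. The newer-than test is assumed to be the usual serial-number comparison modulo 2^32 (nfs4_stateid_is_newer() itself is not shown in this hunk); a standalone rendering:

#include <assert.h>
#include <stdint.h>

static int seqid_is_newer(uint32_t s1, uint32_t s2)
{
	/* signed difference handles wraparound correctly */
	return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
	assert(seqid_is_newer(2, 1));		/* plain increment */
	assert(!seqid_is_newer(1, 2));
	assert(seqid_is_newer(0, UINT32_MAX));	/* survives wraparound */
	return 0;
}
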
@@ -1679,8 +1742,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1679{ 1742{
1680 struct nfs4_opendata *data = calldata; 1743 struct nfs4_opendata *data = calldata;
1681 1744
1682 nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, 1745 nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl,
1683 &data->c_res.seq_res, task); 1746 &data->c_arg.seq_args, &data->c_res.seq_res, task);
1684} 1747}
1685 1748
1686static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1749static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -2587,6 +2650,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2587 case -NFS4ERR_OLD_STATEID: 2650 case -NFS4ERR_OLD_STATEID:
2588 case -NFS4ERR_BAD_STATEID: 2651 case -NFS4ERR_BAD_STATEID:
2589 case -NFS4ERR_EXPIRED: 2652 case -NFS4ERR_EXPIRED:
2653 if (!nfs4_stateid_match(&calldata->arg.stateid,
2654 &state->stateid)) {
2655 rpc_restart_call_prepare(task);
2656 goto out_release;
2657 }
2590 if (calldata->arg.fmode == 0) 2658 if (calldata->arg.fmode == 0)
2591 break; 2659 break;
2592 default: 2660 default:
@@ -2619,6 +2687,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2619 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); 2687 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2620 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); 2688 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2621 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); 2689 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2690 nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
2622 /* Calculate the change in open mode */ 2691 /* Calculate the change in open mode */
2623 calldata->arg.fmode = 0; 2692 calldata->arg.fmode = 0;
2624 if (state->n_rdwr == 0) { 2693 if (state->n_rdwr == 0) {
@@ -2653,6 +2722,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2653 goto out_wait; 2722 goto out_wait;
2654 } 2723 }
2655 } 2724 }
2725 calldata->arg.share_access =
2726 nfs4_map_atomic_open_share(NFS_SERVER(inode),
2727 calldata->arg.fmode, 0);
2656 2728
2657 nfs_fattr_init(calldata->res.fattr); 2729 nfs_fattr_init(calldata->res.fattr);
2658 calldata->timestamp = jiffies; 2730 calldata->timestamp = jiffies;
@@ -2675,45 +2747,10 @@ static const struct rpc_call_ops nfs4_close_ops = {
2675 .rpc_release = nfs4_free_closedata, 2747 .rpc_release = nfs4_free_closedata,
2676}; 2748};
2677 2749
2678static bool nfs4_state_has_opener(struct nfs4_state *state)
2679{
2680 /* first check existing openers */
2681 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2682 state->n_rdonly != 0)
2683 return true;
2684
2685 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2686 state->n_wronly != 0)
2687 return true;
2688
2689 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2690 state->n_rdwr != 0)
2691 return true;
2692
2693 return false;
2694}
2695
2696static bool nfs4_roc(struct inode *inode) 2750static bool nfs4_roc(struct inode *inode)
2697{ 2751{
2698 struct nfs_inode *nfsi = NFS_I(inode); 2752 if (!nfs_have_layout(inode))
2699 struct nfs_open_context *ctx;
2700 struct nfs4_state *state;
2701
2702 spin_lock(&inode->i_lock);
2703 list_for_each_entry(ctx, &nfsi->open_files, list) {
2704 state = ctx->state;
2705 if (state == NULL)
2706 continue;
2707 if (nfs4_state_has_opener(state)) {
2708 spin_unlock(&inode->i_lock);
2709 return false;
2710 }
2711 }
2712 spin_unlock(&inode->i_lock);
2713
2714 if (nfs4_check_delegation(inode, FMODE_READ))
2715 return false; 2753 return false;
2716
2717 return pnfs_roc(inode); 2754 return pnfs_roc(inode);
2718} 2755}
2719 2756
@@ -2731,6 +2768,7 @@ static bool nfs4_roc(struct inode *inode)
2731int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) 2768int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2732{ 2769{
2733 struct nfs_server *server = NFS_SERVER(state->inode); 2770 struct nfs_server *server = NFS_SERVER(state->inode);
2771 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
2734 struct nfs4_closedata *calldata; 2772 struct nfs4_closedata *calldata;
2735 struct nfs4_state_owner *sp = state->owner; 2773 struct nfs4_state_owner *sp = state->owner;
2736 struct rpc_task *task; 2774 struct rpc_task *task;
@@ -2757,10 +2795,10 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2757 calldata->inode = state->inode; 2795 calldata->inode = state->inode;
2758 calldata->state = state; 2796 calldata->state = state;
2759 calldata->arg.fh = NFS_FH(state->inode); 2797 calldata->arg.fh = NFS_FH(state->inode);
2760 calldata->arg.stateid = &state->open_stateid;
2761 /* Serialization for the sequence id */ 2798 /* Serialization for the sequence id */
2762 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); 2799 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
2763 if (calldata->arg.seqid == NULL) 2800 calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
2801 if (IS_ERR(calldata->arg.seqid))
2764 goto out_free_calldata; 2802 goto out_free_calldata;
2765 calldata->arg.fmode = 0; 2803 calldata->arg.fmode = 0;
2766 calldata->arg.bitmask = server->cache_consistency_bitmask; 2804 calldata->arg.bitmask = server->cache_consistency_bitmask;
@@ -5137,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5137static void nfs4_delegreturn_release(void *calldata) 5175static void nfs4_delegreturn_release(void *calldata)
5138{ 5176{
5139 struct nfs4_delegreturndata *data = calldata; 5177 struct nfs4_delegreturndata *data = calldata;
5178 struct inode *inode = data->inode;
5140 5179
5141 if (data->roc) 5180 if (inode) {
5142 pnfs_roc_release(data->inode); 5181 if (data->roc)
5182 pnfs_roc_release(inode);
5183 nfs_iput_and_deactive(inode);
5184 }
5143 kfree(calldata); 5185 kfree(calldata);
5144} 5186}
5145 5187
@@ -5196,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5196 nfs_fattr_init(data->res.fattr); 5238 nfs_fattr_init(data->res.fattr);
5197 data->timestamp = jiffies; 5239 data->timestamp = jiffies;
5198 data->rpc_status = 0; 5240 data->rpc_status = 0;
5199 data->inode = inode; 5241 data->inode = nfs_igrab_and_active(inode);
5200 data->roc = list_empty(&NFS_I(inode)->open_files) ? 5242 if (data->inode)
5201 pnfs_roc(inode) : false; 5243 data->roc = nfs4_roc(inode);
5202 5244
5203 task_setup_data.callback_data = data; 5245 task_setup_data.callback_data = data;
5204 msg.rpc_argp = &data->args; 5246 msg.rpc_argp = &data->args;
@@ -5353,7 +5395,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
5353 p->arg.fl = &p->fl; 5395 p->arg.fl = &p->fl;
5354 p->arg.seqid = seqid; 5396 p->arg.seqid = seqid;
5355 p->res.seqid = seqid; 5397 p->res.seqid = seqid;
5356 p->arg.stateid = &lsp->ls_stateid;
5357 p->lsp = lsp; 5398 p->lsp = lsp;
5358 atomic_inc(&lsp->ls_count); 5399 atomic_inc(&lsp->ls_count);
5359 /* Ensure we don't close file until we're done freeing locks! */ 5400 /* Ensure we don't close file until we're done freeing locks! */
@@ -5380,14 +5421,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5380 return; 5421 return;
5381 switch (task->tk_status) { 5422 switch (task->tk_status) {
5382 case 0: 5423 case 0:
5383 nfs4_stateid_copy(&calldata->lsp->ls_stateid,
5384 &calldata->res.stateid);
5385 renew_lease(calldata->server, calldata->timestamp); 5424 renew_lease(calldata->server, calldata->timestamp);
5386 break; 5425 do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
5426 if (nfs4_update_lock_stateid(calldata->lsp,
5427 &calldata->res.stateid))
5428 break;
5387 case -NFS4ERR_BAD_STATEID: 5429 case -NFS4ERR_BAD_STATEID:
5388 case -NFS4ERR_OLD_STATEID: 5430 case -NFS4ERR_OLD_STATEID:
5389 case -NFS4ERR_STALE_STATEID: 5431 case -NFS4ERR_STALE_STATEID:
5390 case -NFS4ERR_EXPIRED: 5432 case -NFS4ERR_EXPIRED:
5433 if (!nfs4_stateid_match(&calldata->arg.stateid,
5434 &calldata->lsp->ls_stateid))
5435 rpc_restart_call_prepare(task);
5391 break; 5436 break;
5392 default: 5437 default:
5393 if (nfs4_async_handle_error(task, calldata->server, 5438 if (nfs4_async_handle_error(task, calldata->server,
@@ -5403,6 +5448,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
5403 5448
5404 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 5449 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
5405 goto out_wait; 5450 goto out_wait;
5451 nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
5406 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { 5452 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
5407 /* Note: exit _without_ running nfs4_locku_done */ 5453 /* Note: exit _without_ running nfs4_locku_done */
5408 goto out_no_action; 5454 goto out_no_action;
@@ -5473,6 +5519,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
5473 struct nfs_seqid *seqid; 5519 struct nfs_seqid *seqid;
5474 struct nfs4_lock_state *lsp; 5520 struct nfs4_lock_state *lsp;
5475 struct rpc_task *task; 5521 struct rpc_task *task;
5522 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
5476 int status = 0; 5523 int status = 0;
5477 unsigned char fl_flags = request->fl_flags; 5524 unsigned char fl_flags = request->fl_flags;
5478 5525
@@ -5496,9 +5543,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
5496 lsp = request->fl_u.nfs4_fl.owner; 5543 lsp = request->fl_u.nfs4_fl.owner;
5497 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) 5544 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
5498 goto out; 5545 goto out;
5499 seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); 5546 alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
5547 seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
5500 status = -ENOMEM; 5548 status = -ENOMEM;
5501 if (seqid == NULL) 5549 if (IS_ERR(seqid))
5502 goto out; 5550 goto out;
5503 task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); 5551 task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
5504 status = PTR_ERR(task); 5552 status = PTR_ERR(task);
@@ -5531,6 +5579,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
5531 struct nfs4_lockdata *p; 5579 struct nfs4_lockdata *p;
5532 struct inode *inode = lsp->ls_state->inode; 5580 struct inode *inode = lsp->ls_state->inode;
5533 struct nfs_server *server = NFS_SERVER(inode); 5581 struct nfs_server *server = NFS_SERVER(inode);
5582 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
5534 5583
5535 p = kzalloc(sizeof(*p), gfp_mask); 5584 p = kzalloc(sizeof(*p), gfp_mask);
5536 if (p == NULL) 5585 if (p == NULL)
@@ -5539,12 +5588,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
5539 p->arg.fh = NFS_FH(inode); 5588 p->arg.fh = NFS_FH(inode);
5540 p->arg.fl = &p->fl; 5589 p->arg.fl = &p->fl;
5541 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); 5590 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
5542 if (p->arg.open_seqid == NULL) 5591 if (IS_ERR(p->arg.open_seqid))
5543 goto out_free; 5592 goto out_free;
5544 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); 5593 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
5545 if (p->arg.lock_seqid == NULL) 5594 p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
5595 if (IS_ERR(p->arg.lock_seqid))
5546 goto out_free_seqid; 5596 goto out_free_seqid;
5547 p->arg.lock_stateid = &lsp->ls_stateid;
5548 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 5597 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
5549 p->arg.lock_owner.id = lsp->ls_seqid.owner_id; 5598 p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
5550 p->arg.lock_owner.s_dev = server->s_dev; 5599 p->arg.lock_owner.s_dev = server->s_dev;
@@ -5571,15 +5620,19 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
5571 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) 5620 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
5572 goto out_wait; 5621 goto out_wait;
5573 /* Do we need to do an open_to_lock_owner? */ 5622 /* Do we need to do an open_to_lock_owner? */
5574 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { 5623 if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
5575 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { 5624 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
5576 goto out_release_lock_seqid; 5625 goto out_release_lock_seqid;
5577 } 5626 }
5578 data->arg.open_stateid = &state->open_stateid; 5627 nfs4_stateid_copy(&data->arg.open_stateid,
5628 &state->open_stateid);
5579 data->arg.new_lock_owner = 1; 5629 data->arg.new_lock_owner = 1;
5580 data->res.open_seqid = data->arg.open_seqid; 5630 data->res.open_seqid = data->arg.open_seqid;
5581 } else 5631 } else {
5582 data->arg.new_lock_owner = 0; 5632 data->arg.new_lock_owner = 0;
5633 nfs4_stateid_copy(&data->arg.lock_stateid,
5634 &data->lsp->ls_stateid);
5635 }
5583 if (!nfs4_valid_open_stateid(state)) { 5636 if (!nfs4_valid_open_stateid(state)) {
5584 data->rpc_status = -EBADF; 5637 data->rpc_status = -EBADF;
5585 task->tk_action = NULL; 5638 task->tk_action = NULL;
@@ -5603,6 +5656,7 @@ out_wait:
5603static void nfs4_lock_done(struct rpc_task *task, void *calldata) 5656static void nfs4_lock_done(struct rpc_task *task, void *calldata)
5604{ 5657{
5605 struct nfs4_lockdata *data = calldata; 5658 struct nfs4_lockdata *data = calldata;
5659 struct nfs4_lock_state *lsp = data->lsp;
5606 5660
5607 dprintk("%s: begin!\n", __func__); 5661 dprintk("%s: begin!\n", __func__);
5608 5662
@@ -5610,18 +5664,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
5610 return; 5664 return;
5611 5665
5612 data->rpc_status = task->tk_status; 5666 data->rpc_status = task->tk_status;
5613 if (data->arg.new_lock_owner != 0) { 5667 switch (task->tk_status) {
5614 if (data->rpc_status == 0) 5668 case 0:
5615 nfs_confirm_seqid(&data->lsp->ls_seqid, 0); 5669 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
5616 else 5670 data->timestamp);
5617 goto out; 5671 if (data->arg.new_lock) {
5618 } 5672 data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
5619 if (data->rpc_status == 0) { 5673 if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
5620 nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); 5674 rpc_restart_call_prepare(task);
5621 set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); 5675 break;
5622 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); 5676 }
5677 }
5678 if (data->arg.new_lock_owner != 0) {
5679 nfs_confirm_seqid(&lsp->ls_seqid, 0);
5680 nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
5681 set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
5682 } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
5683 rpc_restart_call_prepare(task);
5684 break;
5685 case -NFS4ERR_BAD_STATEID:
5686 case -NFS4ERR_OLD_STATEID:
5687 case -NFS4ERR_STALE_STATEID:
5688 case -NFS4ERR_EXPIRED:
5689 if (data->arg.new_lock_owner != 0) {
5690 if (!nfs4_stateid_match(&data->arg.open_stateid,
5691 &lsp->ls_state->open_stateid))
5692 rpc_restart_call_prepare(task);
5693 } else if (!nfs4_stateid_match(&data->arg.lock_stateid,
5694 &lsp->ls_stateid))
5695 rpc_restart_call_prepare(task);
5623 } 5696 }
5624out:
5625 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); 5697 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
5626} 5698}
5627 5699
@@ -5702,7 +5774,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
5702 if (recovery_type == NFS_LOCK_RECLAIM) 5774 if (recovery_type == NFS_LOCK_RECLAIM)
5703 data->arg.reclaim = NFS_LOCK_RECLAIM; 5775 data->arg.reclaim = NFS_LOCK_RECLAIM;
5704 nfs4_set_sequence_privileged(&data->arg.seq_args); 5776 nfs4_set_sequence_privileged(&data->arg.seq_args);
5705 } 5777 } else
5778 data->arg.new_lock = 1;
5706 task = rpc_run_task(&task_setup_data); 5779 task = rpc_run_task(&task_setup_data);
5707 if (IS_ERR(task)) 5780 if (IS_ERR(task))
5708 return PTR_ERR(task); 5781 return PTR_ERR(task);
@@ -5826,10 +5899,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
5826 5899
5827static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 5900static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
5828{ 5901{
5829 struct nfs4_state_owner *sp = state->owner;
5830 struct nfs_inode *nfsi = NFS_I(state->inode); 5902 struct nfs_inode *nfsi = NFS_I(state->inode);
5831 unsigned char fl_flags = request->fl_flags; 5903 unsigned char fl_flags = request->fl_flags;
5832 unsigned int seq;
5833 int status = -ENOLCK; 5904 int status = -ENOLCK;
5834 5905
5835 if ((fl_flags & FL_POSIX) && 5906 if ((fl_flags & FL_POSIX) &&
@@ -5849,25 +5920,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
5849 /* ...but avoid races with delegation recall... */ 5920 /* ...but avoid races with delegation recall... */
5850 request->fl_flags = fl_flags & ~FL_SLEEP; 5921 request->fl_flags = fl_flags & ~FL_SLEEP;
5851 status = do_vfs_lock(request->fl_file, request); 5922 status = do_vfs_lock(request->fl_file, request);
5852 goto out_unlock; 5923 up_read(&nfsi->rwsem);
5853 }
5854 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5855 up_read(&nfsi->rwsem);
5856 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
5857 if (status != 0)
5858 goto out; 5924 goto out;
5859 down_read(&nfsi->rwsem);
5860 if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
5861 status = -NFS4ERR_DELAY;
5862 goto out_unlock;
5863 } 5925 }
5864 /* Note: we always want to sleep here! */
5865 request->fl_flags = fl_flags | FL_SLEEP;
5866 if (do_vfs_lock(request->fl_file, request) < 0)
5867 printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
5868 "manager!\n", __func__);
5869out_unlock:
5870 up_read(&nfsi->rwsem); 5926 up_read(&nfsi->rwsem);
5927 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
5871out: 5928out:
5872 request->fl_flags = fl_flags; 5929 request->fl_flags = fl_flags;
5873 return status; 5930 return status;
@@ -5974,8 +6031,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
5974{ 6031{
5975 struct nfs_release_lockowner_data *data = calldata; 6032 struct nfs_release_lockowner_data *data = calldata;
5976 struct nfs_server *server = data->server; 6033 struct nfs_server *server = data->server;
5977 nfs40_setup_sequence(server, &data->args.seq_args, 6034 nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
5978 &data->res.seq_res, task); 6035 &data->args.seq_args, &data->res.seq_res, task);
5979 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 6036 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
5980 data->timestamp = jiffies; 6037 data->timestamp = jiffies;
5981} 6038}
@@ -7537,6 +7594,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
7537 return; 7594 return;
7538 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, 7595 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
7539 NFS_I(lgp->args.inode)->layout, 7596 NFS_I(lgp->args.inode)->layout,
7597 &lgp->args.range,
7540 lgp->args.ctx->state)) { 7598 lgp->args.ctx->state)) {
7541 rpc_exit(task, NFS4_OK); 7599 rpc_exit(task, NFS4_OK);
7542 } 7600 }
@@ -7792,9 +7850,13 @@ static void nfs4_layoutreturn_release(void *calldata)
7792 spin_lock(&lo->plh_inode->i_lock); 7850 spin_lock(&lo->plh_inode->i_lock);
7793 if (lrp->res.lrs_present) 7851 if (lrp->res.lrs_present)
7794 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); 7852 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
7853 pnfs_clear_layoutreturn_waitbit(lo);
7854 clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
7855 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
7795 lo->plh_block_lgets--; 7856 lo->plh_block_lgets--;
7796 spin_unlock(&lo->plh_inode->i_lock); 7857 spin_unlock(&lo->plh_inode->i_lock);
7797 pnfs_put_layout_hdr(lrp->args.layout); 7858 pnfs_put_layout_hdr(lrp->args.layout);
7859 nfs_iput_and_deactive(lrp->inode);
7798 kfree(calldata); 7860 kfree(calldata);
7799 dprintk("<-- %s\n", __func__); 7861 dprintk("<-- %s\n", __func__);
7800} 7862}
@@ -7805,7 +7867,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
7805 .rpc_release = nfs4_layoutreturn_release, 7867 .rpc_release = nfs4_layoutreturn_release,
7806}; 7868};
7807 7869
7808int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) 7870int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
7809{ 7871{
7810 struct rpc_task *task; 7872 struct rpc_task *task;
7811 struct rpc_message msg = { 7873 struct rpc_message msg = {
@@ -7820,14 +7882,23 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
7820 .callback_ops = &nfs4_layoutreturn_call_ops, 7882 .callback_ops = &nfs4_layoutreturn_call_ops,
7821 .callback_data = lrp, 7883 .callback_data = lrp,
7822 }; 7884 };
7823 int status; 7885 int status = 0;
7824 7886
7825 dprintk("--> %s\n", __func__); 7887 dprintk("--> %s\n", __func__);
7888 if (!sync) {
7889 lrp->inode = nfs_igrab_and_active(lrp->args.inode);
7890 if (!lrp->inode) {
7891 nfs4_layoutreturn_release(lrp);
7892 return -EAGAIN;
7893 }
7894 task_setup_data.flags |= RPC_TASK_ASYNC;
7895 }
7826 nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); 7896 nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
7827 task = rpc_run_task(&task_setup_data); 7897 task = rpc_run_task(&task_setup_data);
7828 if (IS_ERR(task)) 7898 if (IS_ERR(task))
7829 return PTR_ERR(task); 7899 return PTR_ERR(task);
7830 status = task->tk_status; 7900 if (sync)
7901 status = task->tk_status;
7831 trace_nfs4_layoutreturn(lrp->args.inode, status); 7902 trace_nfs4_layoutreturn(lrp->args.inode, status);
7832 dprintk("<-- %s status=%d\n", __func__, status); 7903 dprintk("<-- %s status=%d\n", __func__, status);
7833 rpc_put_task(task); 7904 rpc_put_task(task);
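
Two details in the hunk above are easy to miss: status must now start at 0 because the asynchronous path never assigns it, and an async submission first pins the inode with nfs_igrab_and_active() so that nfs4_layoutreturn_release(), which now calls nfs_iput_and_deactive(), can run safely after the submitter has returned. A stripped-down sketch of the status half (run_op() and task_status are stand-ins, not kernel API):

#include <stdio.h>

/* Only a synchronous caller may read the task's final status; an
 * asynchronous caller returns 0 and the completion callback finishes
 * the work later. */
static int run_op(int sync, int task_status)
{
	int status = 0;	/* the async path falls through with this value */

	if (sync)
		status = task_status;
	return status;
}

int main(void)
{
	printf("sync: %d, async: %d\n", run_op(1, -11), run_op(0, -11));
	return 0;
}
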
@@ -7921,6 +7992,7 @@ static void nfs4_layoutcommit_release(void *calldata)
7921 nfs_post_op_update_inode_force_wcc(data->args.inode, 7992 nfs_post_op_update_inode_force_wcc(data->args.inode,
7922 data->res.fattr); 7993 data->res.fattr);
7923 put_rpccred(data->cred); 7994 put_rpccred(data->cred);
7995 nfs_iput_and_deactive(data->inode);
7924 kfree(data); 7996 kfree(data);
7925} 7997}
7926 7998
@@ -7945,7 +8017,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7945 .rpc_message = &msg, 8017 .rpc_message = &msg,
7946 .callback_ops = &nfs4_layoutcommit_ops, 8018 .callback_ops = &nfs4_layoutcommit_ops,
7947 .callback_data = data, 8019 .callback_data = data,
7948 .flags = RPC_TASK_ASYNC,
7949 }; 8020 };
7950 struct rpc_task *task; 8021 struct rpc_task *task;
7951 int status = 0; 8022 int status = 0;
@@ -7956,18 +8027,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7956 data->args.lastbytewritten, 8027 data->args.lastbytewritten,
7957 data->args.inode->i_ino); 8028 data->args.inode->i_ino);
7958 8029
8030 if (!sync) {
8031 data->inode = nfs_igrab_and_active(data->args.inode);
8032 if (data->inode == NULL) {
8033 nfs4_layoutcommit_release(data);
8034 return -EAGAIN;
8035 }
8036 task_setup_data.flags = RPC_TASK_ASYNC;
8037 }
7959 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 8038 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
7960 task = rpc_run_task(&task_setup_data); 8039 task = rpc_run_task(&task_setup_data);
7961 if (IS_ERR(task)) 8040 if (IS_ERR(task))
7962 return PTR_ERR(task); 8041 return PTR_ERR(task);
7963 if (sync == false) 8042 if (sync)
7964 goto out; 8043 status = task->tk_status;
7965 status = nfs4_wait_for_completion_rpc_task(task);
7966 if (status != 0)
7967 goto out;
7968 status = task->tk_status;
7969 trace_nfs4_layoutcommit(data->args.inode, status); 8044 trace_nfs4_layoutcommit(data->args.inode, status);
7970out:
7971 dprintk("%s: status %d\n", __func__, status); 8045 dprintk("%s: status %d\n", __func__, status);
7972 rpc_put_task(task); 8046 rpc_put_task(task);
7973 return status; 8047 return status;
@@ -8395,6 +8469,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
8395 .match_stateid = nfs4_match_stateid, 8469 .match_stateid = nfs4_match_stateid,
8396 .find_root_sec = nfs4_find_root_sec, 8470 .find_root_sec = nfs4_find_root_sec,
8397 .free_lock_state = nfs4_release_lockowner, 8471 .free_lock_state = nfs4_release_lockowner,
8472 .alloc_seqid = nfs_alloc_seqid,
8398 .call_sync_ops = &nfs40_call_sync_ops, 8473 .call_sync_ops = &nfs40_call_sync_ops,
8399 .reboot_recovery_ops = &nfs40_reboot_recovery_ops, 8474 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
8400 .nograce_recovery_ops = &nfs40_nograce_recovery_ops, 8475 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -8403,6 +8478,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
8403}; 8478};
8404 8479
8405#if defined(CONFIG_NFS_V4_1) 8480#if defined(CONFIG_NFS_V4_1)
8481static struct nfs_seqid *
8482nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
8483{
8484 return NULL;
8485}
8486
8406static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { 8487static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
8407 .minor_version = 1, 8488 .minor_version = 1,
8408 .init_caps = NFS_CAP_READDIRPLUS 8489 .init_caps = NFS_CAP_READDIRPLUS
@@ -8416,6 +8497,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
8416 .match_stateid = nfs41_match_stateid, 8497 .match_stateid = nfs41_match_stateid,
8417 .find_root_sec = nfs41_find_root_sec, 8498 .find_root_sec = nfs41_find_root_sec,
8418 .free_lock_state = nfs41_free_lock_state, 8499 .free_lock_state = nfs41_free_lock_state,
8500 .alloc_seqid = nfs_alloc_no_seqid,
8419 .call_sync_ops = &nfs41_call_sync_ops, 8501 .call_sync_ops = &nfs41_call_sync_ops,
8420 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 8502 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
8421 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 8503 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8442,6 +8524,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
8442 .find_root_sec = nfs41_find_root_sec, 8524 .find_root_sec = nfs41_find_root_sec,
8443 .free_lock_state = nfs41_free_lock_state, 8525 .free_lock_state = nfs41_free_lock_state,
8444 .call_sync_ops = &nfs41_call_sync_ops, 8526 .call_sync_ops = &nfs41_call_sync_ops,
8527 .alloc_seqid = nfs_alloc_no_seqid,
8445 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 8528 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
8446 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 8529 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
8447 .state_renewal_ops = &nfs41_state_renewal_ops, 8530 .state_renewal_ops = &nfs41_state_renewal_ops,
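
All three minor-version tables now carry an .alloc_seqid hook: v4.0 wires in the real nfs_alloc_seqid(), while v4.1 and v4.2 use nfs_alloc_no_seqid(), whose NULL return means "sessions are in use, no seqid is needed". A user-space sketch of that function-pointer dispatch (struct and function names here are illustrative, not the kernel's):

#include <stdio.h>

struct seqid { int counter; };

struct minor_ops {
	struct seqid *(*alloc_seqid)(void);
};

/* v4.0: a real seqid object is required for open/lock serialization */
static struct seqid *alloc_real_seqid(void)
{
	static struct seqid s;
	return &s;
}

/* v4.1+: sessions order operations, so no seqid is allocated at all */
static struct seqid *alloc_no_seqid(void)
{
	return NULL;
}

static const struct minor_ops v40_ops = { .alloc_seqid = alloc_real_seqid };
static const struct minor_ops v41_ops = { .alloc_seqid = alloc_no_seqid };

int main(void)
{
	const struct minor_ops *ops = &v41_ops;	/* chosen at mount time */
	struct seqid *s = ops->alloc_seqid();

	printf("seqid %s\n", s ? "allocated" : "skipped (sessions)");
	return 0;
}
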
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5194933ed419..5ad908e9ce9c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1003,11 +1003,11 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
1003 struct nfs_seqid *new; 1003 struct nfs_seqid *new;
1004 1004
1005 new = kmalloc(sizeof(*new), gfp_mask); 1005 new = kmalloc(sizeof(*new), gfp_mask);
1006 if (new != NULL) { 1006 if (new == NULL)
1007 new->sequence = counter; 1007 return ERR_PTR(-ENOMEM);
1008 INIT_LIST_HEAD(&new->list); 1008 new->sequence = counter;
1009 new->task = NULL; 1009 INIT_LIST_HEAD(&new->list);
1010 } 1010 new->task = NULL;
1011 return new; 1011 return new;
1012} 1012}
1013 1013
@@ -1015,7 +1015,7 @@ void nfs_release_seqid(struct nfs_seqid *seqid)
1015{ 1015{
1016 struct nfs_seqid_counter *sequence; 1016 struct nfs_seqid_counter *sequence;
1017 1017
1018 if (list_empty(&seqid->list)) 1018 if (seqid == NULL || list_empty(&seqid->list))
1019 return; 1019 return;
1020 sequence = seqid->sequence; 1020 sequence = seqid->sequence;
1021 spin_lock(&sequence->lock); 1021 spin_lock(&sequence->lock);
@@ -1071,13 +1071,15 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
1071 1071
1072void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) 1072void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
1073{ 1073{
1074 struct nfs4_state_owner *sp = container_of(seqid->sequence, 1074 struct nfs4_state_owner *sp;
1075 struct nfs4_state_owner, so_seqid); 1075
1076 struct nfs_server *server = sp->so_server; 1076 if (seqid == NULL)
1077 return;
1077 1078
1079 sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
1078 if (status == -NFS4ERR_BAD_SEQID) 1080 if (status == -NFS4ERR_BAD_SEQID)
1079 nfs4_drop_state_owner(sp); 1081 nfs4_drop_state_owner(sp);
1080 if (!nfs4_has_session(server->nfs_client)) 1082 if (!nfs4_has_session(sp->so_server->nfs_client))
1081 nfs_increment_seqid(status, seqid); 1083 nfs_increment_seqid(status, seqid);
1082} 1084}
1083 1085
@@ -1088,14 +1090,18 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
1088 */ 1090 */
1089void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) 1091void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
1090{ 1092{
1091 nfs_increment_seqid(status, seqid); 1093 if (seqid != NULL)
1094 nfs_increment_seqid(status, seqid);
1092} 1095}
1093 1096
1094int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) 1097int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1095{ 1098{
1096 struct nfs_seqid_counter *sequence = seqid->sequence; 1099 struct nfs_seqid_counter *sequence;
1097 int status = 0; 1100 int status = 0;
1098 1101
1102 if (seqid == NULL)
1103 goto out;
1104 sequence = seqid->sequence;
1099 spin_lock(&sequence->lock); 1105 spin_lock(&sequence->lock);
1100 seqid->task = task; 1106 seqid->task = task;
1101 if (list_empty(&seqid->list)) 1107 if (list_empty(&seqid->list))
@@ -1106,6 +1112,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1106 status = -EAGAIN; 1112 status = -EAGAIN;
1107unlock: 1113unlock:
1108 spin_unlock(&sequence->lock); 1114 spin_unlock(&sequence->lock);
1115out:
1109 return status; 1116 return status;
1110} 1117}
1111 1118
@@ -1366,49 +1373,55 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1366 struct nfs_inode *nfsi = NFS_I(inode); 1373 struct nfs_inode *nfsi = NFS_I(inode);
1367 struct file_lock *fl; 1374 struct file_lock *fl;
1368 int status = 0; 1375 int status = 0;
1376 struct file_lock_context *flctx = inode->i_flctx;
1377 struct list_head *list;
1369 1378
1370 if (inode->i_flock == NULL) 1379 if (flctx == NULL)
1371 return 0; 1380 return 0;
1372 1381
1382 list = &flctx->flc_posix;
1383
1373 /* Guard against delegation returns and new lock/unlock calls */ 1384 /* Guard against delegation returns and new lock/unlock calls */
1374 down_write(&nfsi->rwsem); 1385 down_write(&nfsi->rwsem);
1375 /* Protect inode->i_flock using the BKL */ 1386 spin_lock(&flctx->flc_lock);
1376 spin_lock(&inode->i_lock); 1387restart:
1377 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1388 list_for_each_entry(fl, list, fl_list) {
1378 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
1379 continue;
1380 if (nfs_file_open_context(fl->fl_file)->state != state) 1389 if (nfs_file_open_context(fl->fl_file)->state != state)
1381 continue; 1390 continue;
1382 spin_unlock(&inode->i_lock); 1391 spin_unlock(&flctx->flc_lock);
1383 status = ops->recover_lock(state, fl); 1392 status = ops->recover_lock(state, fl);
1384 switch (status) { 1393 switch (status) {
1385 case 0: 1394 case 0:
1386 break; 1395 break;
1387 case -ESTALE: 1396 case -ESTALE:
1388 case -NFS4ERR_ADMIN_REVOKED: 1397 case -NFS4ERR_ADMIN_REVOKED:
1389 case -NFS4ERR_STALE_STATEID: 1398 case -NFS4ERR_STALE_STATEID:
1390 case -NFS4ERR_BAD_STATEID: 1399 case -NFS4ERR_BAD_STATEID:
1391 case -NFS4ERR_EXPIRED: 1400 case -NFS4ERR_EXPIRED:
1392 case -NFS4ERR_NO_GRACE: 1401 case -NFS4ERR_NO_GRACE:
1393 case -NFS4ERR_STALE_CLIENTID: 1402 case -NFS4ERR_STALE_CLIENTID:
1394 case -NFS4ERR_BADSESSION: 1403 case -NFS4ERR_BADSESSION:
1395 case -NFS4ERR_BADSLOT: 1404 case -NFS4ERR_BADSLOT:
1396 case -NFS4ERR_BAD_HIGH_SLOT: 1405 case -NFS4ERR_BAD_HIGH_SLOT:
1397 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1406 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1398 goto out; 1407 goto out;
1399 default: 1408 default:
1400 printk(KERN_ERR "NFS: %s: unhandled error %d\n", 1409 pr_err("NFS: %s: unhandled error %d\n",
1401 __func__, status); 1410 __func__, status);
1402 case -ENOMEM: 1411 case -ENOMEM:
1403 case -NFS4ERR_DENIED: 1412 case -NFS4ERR_DENIED:
1404 case -NFS4ERR_RECLAIM_BAD: 1413 case -NFS4ERR_RECLAIM_BAD:
1405 case -NFS4ERR_RECLAIM_CONFLICT: 1414 case -NFS4ERR_RECLAIM_CONFLICT:
1406 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 1415 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
1407 status = 0; 1416 status = 0;
1408 } 1417 }
1409 spin_lock(&inode->i_lock); 1418 spin_lock(&flctx->flc_lock);
1410 } 1419 }
1411 spin_unlock(&inode->i_lock); 1420 if (list == &flctx->flc_posix) {
1421 list = &flctx->flc_flock;
1422 goto restart;
1423 }
1424 spin_unlock(&flctx->flc_lock);
1412out: 1425out:
1413 up_write(&nfsi->rwsem); 1426 up_write(&nfsi->rwsem);
1414 return status; 1427 return status;
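
The reclaim walk above replaces the old single i_flock chain with the two lists hanging off struct file_lock_context, and covers both by restarting the same loop body: POSIX locks first, flock locks second, all under flc_lock (dropped and retaken around each recover_lock() call). A compact sketch of that restart idiom (plain arrays stand in for the locked list_heads):

#include <stdio.h>

int main(void)
{
	const char *posix[] = { "P1", "P2", NULL };
	const char *flock[] = { "F1", NULL };
	const char **list = posix;

restart:
	for (const char **fl = list; *fl != NULL; fl++)
		printf("recover %s\n", *fl);
	if (list == posix) {	/* finished POSIX: rerun over flock */
		list = flock;
		goto restart;
	}
	return 0;
}
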
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 6f340f02f2ba..75090feeafad 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = {
53 .destroy_inode = nfs_destroy_inode, 53 .destroy_inode = nfs_destroy_inode,
54 .write_inode = nfs4_write_inode, 54 .write_inode = nfs4_write_inode,
55 .drop_inode = nfs_drop_inode, 55 .drop_inode = nfs_drop_inode,
56 .put_super = nfs_put_super,
57 .statfs = nfs_statfs, 56 .statfs = nfs_statfs,
58 .evict_inode = nfs4_evict_inode, 57 .evict_inode = nfs4_evict_inode,
59 .umount_begin = nfs_umount_begin, 58 .umount_begin = nfs_umount_begin,
@@ -346,6 +345,9 @@ out:
346 345
347static void __exit exit_nfs_v4(void) 346static void __exit exit_nfs_v4(void)
348{ 347{
348 /* Not called in the _init(), conditionally loaded */
349 nfs4_pnfs_v3_ds_connect_unload();
350
349 unregister_nfs_version(&nfs_v4); 351 unregister_nfs_version(&nfs_v4);
350 nfs4_unregister_sysctl(); 352 nfs4_unregister_sysctl();
351 nfs_idmap_quit(); 353 nfs_idmap_quit();
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cb4376b78ed9..e23a0a664e12 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -946,7 +946,10 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n)
946static void encode_nfs4_seqid(struct xdr_stream *xdr, 946static void encode_nfs4_seqid(struct xdr_stream *xdr,
947 const struct nfs_seqid *seqid) 947 const struct nfs_seqid *seqid)
948{ 948{
949 encode_uint32(xdr, seqid->sequence->counter); 949 if (seqid != NULL)
950 encode_uint32(xdr, seqid->sequence->counter);
951 else
952 encode_uint32(xdr, 0);
950} 953}
951 954
952static void encode_compound_hdr(struct xdr_stream *xdr, 955static void encode_compound_hdr(struct xdr_stream *xdr,
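
Making encode_nfs4_seqid() NULL-tolerant is the XDR half of the nfs_alloc_no_seqid() design introduced earlier in this patch: when NFSv4.1 passes no seqid, a zero goes on the wire, which a session-based server ignores anyway. A sketch of the degrade-to-zero encode (encode_seqid() is a toy, not the kernel function):

#include <stdint.h>
#include <stdio.h>

struct seqid { uint32_t counter; };

/* NULL means "v4.1, sessions in use": encode 0, the server ignores it */
static void encode_seqid(const struct seqid *seqid)
{
	uint32_t wire = (seqid != NULL) ? seqid->counter : 0;

	printf("encoded %u\n", wire);
}

int main(void)
{
	struct seqid s = { .counter = 7 };

	encode_seqid(&s);	/* v4.0 path: real counter */
	encode_seqid(NULL);	/* v4.1 path: zero placeholder */
	return 0;
}
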
@@ -1125,7 +1128,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
1125{ 1128{
1126 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); 1129 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
1127 encode_nfs4_seqid(xdr, arg->seqid); 1130 encode_nfs4_seqid(xdr, arg->seqid);
1128 encode_nfs4_stateid(xdr, arg->stateid); 1131 encode_nfs4_stateid(xdr, &arg->stateid);
1129} 1132}
1130 1133
1131static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr) 1134static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
@@ -1301,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1301 *p = cpu_to_be32(args->new_lock_owner); 1304 *p = cpu_to_be32(args->new_lock_owner);
1302 if (args->new_lock_owner){ 1305 if (args->new_lock_owner){
1303 encode_nfs4_seqid(xdr, args->open_seqid); 1306 encode_nfs4_seqid(xdr, args->open_seqid);
1304 encode_nfs4_stateid(xdr, args->open_stateid); 1307 encode_nfs4_stateid(xdr, &args->open_stateid);
1305 encode_nfs4_seqid(xdr, args->lock_seqid); 1308 encode_nfs4_seqid(xdr, args->lock_seqid);
1306 encode_lockowner(xdr, &args->lock_owner); 1309 encode_lockowner(xdr, &args->lock_owner);
1307 } 1310 }
1308 else { 1311 else {
1309 encode_nfs4_stateid(xdr, args->lock_stateid); 1312 encode_nfs4_stateid(xdr, &args->lock_stateid);
1310 encode_nfs4_seqid(xdr, args->lock_seqid); 1313 encode_nfs4_seqid(xdr, args->lock_seqid);
1311 } 1314 }
1312} 1315}
@@ -1330,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1330 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); 1333 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
1331 encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); 1334 encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
1332 encode_nfs4_seqid(xdr, args->seqid); 1335 encode_nfs4_seqid(xdr, args->seqid);
1333 encode_nfs4_stateid(xdr, args->stateid); 1336 encode_nfs4_stateid(xdr, &args->stateid);
1334 p = reserve_space(xdr, 16); 1337 p = reserve_space(xdr, 16);
1335 p = xdr_encode_hyper(p, args->fl->fl_start); 1338 p = xdr_encode_hyper(p, args->fl->fl_start);
1336 xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1339 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
@@ -1348,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
1348 encode_string(xdr, name->len, name->name); 1351 encode_string(xdr, name->len, name->name);
1349} 1352}
1350 1353
1351static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) 1354static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
1352{ 1355{
1353 __be32 *p; 1356 __be32 *p;
1354 1357
1355 p = reserve_space(xdr, 8); 1358 p = reserve_space(xdr, 8);
1356 switch (fmode & (FMODE_READ|FMODE_WRITE)) { 1359 *p++ = cpu_to_be32(share_access);
1357 case FMODE_READ:
1358 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
1359 break;
1360 case FMODE_WRITE:
1361 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
1362 break;
1363 case FMODE_READ|FMODE_WRITE:
1364 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
1365 break;
1366 default:
1367 *p++ = cpu_to_be32(0);
1368 }
1369 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ 1360 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
1370} 1361}
1371 1362
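
With the switch gone, encode_share_access() writes whatever u32 it is handed, so the fmode-to-share-access translation must now happen once in the callers before encoding (note the arg->fmode parameters becoming arg->share_access below). A hypothetical caller-side helper preserving the removed mapping (the NFS4_SHARE_ACCESS_* values match RFC 5661; the helper name is illustrative):

#include <stdio.h>

#define FMODE_READ		0x1u
#define FMODE_WRITE		0x2u
#define NFS4_SHARE_ACCESS_READ	1u
#define NFS4_SHARE_ACCESS_WRITE	2u
#define NFS4_SHARE_ACCESS_BOTH	3u

static unsigned int fmode_to_share_access(unsigned int fmode)
{
	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
		return NFS4_SHARE_ACCESS_READ;
	case FMODE_WRITE:
		return NFS4_SHARE_ACCESS_WRITE;
	case FMODE_READ | FMODE_WRITE:
		return NFS4_SHARE_ACCESS_BOTH;
	}
	return 0;
}

int main(void)
{
	printf("%u\n", fmode_to_share_access(FMODE_READ | FMODE_WRITE));
	return 0;
}
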
@@ -1377,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1377 * owner 4 = 32 1368 * owner 4 = 32
1378 */ 1369 */
1379 encode_nfs4_seqid(xdr, arg->seqid); 1370 encode_nfs4_seqid(xdr, arg->seqid);
1380 encode_share_access(xdr, arg->fmode); 1371 encode_share_access(xdr, arg->share_access);
1381 p = reserve_space(xdr, 36); 1372 p = reserve_space(xdr, 36);
1382 p = xdr_encode_hyper(p, arg->clientid); 1373 p = xdr_encode_hyper(p, arg->clientid);
1383 *p++ = cpu_to_be32(24); 1374 *p++ = cpu_to_be32(24);
@@ -1530,9 +1521,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1530static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1521static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1531{ 1522{
1532 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); 1523 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
1533 encode_nfs4_stateid(xdr, arg->stateid); 1524 encode_nfs4_stateid(xdr, &arg->stateid);
1534 encode_nfs4_seqid(xdr, arg->seqid); 1525 encode_nfs4_seqid(xdr, arg->seqid);
1535 encode_share_access(xdr, arg->fmode); 1526 encode_share_access(xdr, arg->share_access);
1536} 1527}
1537 1528
1538static void 1529static void
@@ -1801,9 +1792,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1801 struct compound_hdr *hdr) 1792 struct compound_hdr *hdr)
1802{ 1793{
1803 __be32 *p; 1794 __be32 *p;
1804 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1805 uint32_t len;
1806 struct nfs_client *clp = args->client; 1795 struct nfs_client *clp = args->client;
1796 struct rpc_clnt *clnt = clp->cl_rpcclient;
1807 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); 1797 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
1808 u32 max_resp_sz_cached; 1798 u32 max_resp_sz_cached;
1809 1799
@@ -1814,11 +1804,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1814 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 1804 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
1815 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; 1805 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
1816 1806
1817 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1818 clp->cl_ipaddr);
1819
1820 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); 1807 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
1821 p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12); 1808 p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
1822 p = xdr_encode_hyper(p, clp->cl_clientid); 1809 p = xdr_encode_hyper(p, clp->cl_clientid);
1823 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1810 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1824 *p++ = cpu_to_be32(args->flags); /*flags */ 1811 *p++ = cpu_to_be32(args->flags); /*flags */
@@ -1847,7 +1834,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1847 1834
1848 /* authsys_parms rfc1831 */ 1835 /* authsys_parms rfc1831 */
1849 *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ 1836 *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
1850 p = xdr_encode_opaque(p, machine_name, len); 1837 p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
1851 *p++ = cpu_to_be32(0); /* UID */ 1838 *p++ = cpu_to_be32(0); /* UID */
1852 *p++ = cpu_to_be32(0); /* GID */ 1839 *p++ = cpu_to_be32(0); /* GID */
1853 *p = cpu_to_be32(0); /* No more gids */ 1840 *p = cpu_to_be32(0); /* No more gids */
@@ -2012,11 +1999,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
2012 p = reserve_space(xdr, 16); 1999 p = reserve_space(xdr, 16);
2013 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ 2000 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
2014 *p++ = cpu_to_be32(args->layout_type); 2001 *p++ = cpu_to_be32(args->layout_type);
2015 *p++ = cpu_to_be32(IOMODE_ANY); 2002 *p++ = cpu_to_be32(args->range.iomode);
2016 *p = cpu_to_be32(RETURN_FILE); 2003 *p = cpu_to_be32(RETURN_FILE);
2017 p = reserve_space(xdr, 16); 2004 p = reserve_space(xdr, 16);
2018 p = xdr_encode_hyper(p, 0); 2005 p = xdr_encode_hyper(p, args->range.offset);
2019 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); 2006 p = xdr_encode_hyper(p, args->range.length);
2020 spin_lock(&args->inode->i_lock); 2007 spin_lock(&args->inode->i_lock);
2021 encode_nfs4_stateid(xdr, &args->stateid); 2008 encode_nfs4_stateid(xdr, &args->stateid);
2022 spin_unlock(&args->inode->i_lock); 2009 spin_unlock(&args->inode->i_lock);
@@ -4936,20 +4923,13 @@ out_overflow:
4936 return -EIO; 4923 return -EIO;
4937} 4924}
4938 4925
4939static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 4926static int decode_rw_delegation(struct xdr_stream *xdr,
4927 uint32_t delegation_type,
4928 struct nfs_openres *res)
4940{ 4929{
4941 __be32 *p; 4930 __be32 *p;
4942 uint32_t delegation_type;
4943 int status; 4931 int status;
4944 4932
4945 p = xdr_inline_decode(xdr, 4);
4946 if (unlikely(!p))
4947 goto out_overflow;
4948 delegation_type = be32_to_cpup(p);
4949 if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
4950 res->delegation_type = 0;
4951 return 0;
4952 }
4953 status = decode_stateid(xdr, &res->delegation); 4933 status = decode_stateid(xdr, &res->delegation);
4954 if (unlikely(status)) 4934 if (unlikely(status))
4955 return status; 4935 return status;
@@ -4973,6 +4953,52 @@ out_overflow:
4973 return -EIO; 4953 return -EIO;
4974} 4954}
4975 4955
4956static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
4957{
4958 __be32 *p;
4959 uint32_t why_no_delegation;
4960
4961 p = xdr_inline_decode(xdr, 4);
4962 if (unlikely(!p))
4963 goto out_overflow;
4964 why_no_delegation = be32_to_cpup(p);
4965 switch (why_no_delegation) {
4966 case WND4_CONTENTION:
4967 case WND4_RESOURCE:
4968 xdr_inline_decode(xdr, 4);
4969 /* Ignore for now */
4970 }
4971 return 0;
4972out_overflow:
4973 print_overflow_msg(__func__, xdr);
4974 return -EIO;
4975}
4976
4977static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
4978{
4979 __be32 *p;
4980 uint32_t delegation_type;
4981
4982 p = xdr_inline_decode(xdr, 4);
4983 if (unlikely(!p))
4984 goto out_overflow;
4985 delegation_type = be32_to_cpup(p);
4986 res->delegation_type = 0;
4987 switch (delegation_type) {
4988 case NFS4_OPEN_DELEGATE_NONE:
4989 return 0;
4990 case NFS4_OPEN_DELEGATE_READ:
4991 case NFS4_OPEN_DELEGATE_WRITE:
4992 return decode_rw_delegation(xdr, delegation_type, res);
4993 case NFS4_OPEN_DELEGATE_NONE_EXT:
4994 return decode_no_delegation(xdr, res);
4995 }
4996 return -EIO;
4997out_overflow:
4998 print_overflow_msg(__func__, xdr);
4999 return -EIO;
5000}
5001
4976static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 5002static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
4977{ 5003{
4978 __be32 *p; 5004 __be32 *p;
@@ -6567,6 +6593,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6567 int status; 6593 int status;
6568 6594
6569 status = decode_compound_hdr(xdr, &hdr); 6595 status = decode_compound_hdr(xdr, &hdr);
6596 res->op_status = hdr.status;
6570 if (status) 6597 if (status)
6571 goto out; 6598 goto out;
6572 status = decode_sequence(xdr, &res->seq_res, rqstp); 6599 status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6592,6 +6619,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6592 int status; 6619 int status;
6593 6620
6594 status = decode_compound_hdr(xdr, &hdr); 6621 status = decode_compound_hdr(xdr, &hdr);
6622 res->op_status = hdr.status;
6595 if (status) 6623 if (status)
6596 goto out; 6624 goto out;
6597 status = decode_sequence(xdr, &res->seq_res, rqstp); 6625 status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6621,6 +6649,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6621 int status; 6649 int status;
6622 6650
6623 status = decode_compound_hdr(xdr, &hdr); 6651 status = decode_compound_hdr(xdr, &hdr);
6652 res->op_status = hdr.status;
6624 if (status) 6653 if (status)
6625 goto out; 6654 goto out;
6626 status = decode_sequence(xdr, &res->seq_res, rqstp); 6655 status = decode_sequence(xdr, &res->seq_res, rqstp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index cd3c910d2d12..9bc9f04fb7f6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -261,11 +261,11 @@ static int __init root_nfs_data(char *cmdline)
261 */ 261 */
262 len = snprintf(nfs_export_path, sizeof(nfs_export_path), 262 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
263 tmp, utsname()->nodename); 263 tmp, utsname()->nodename);
264 if (len > (int)sizeof(nfs_export_path)) 264 if (len >= (int)sizeof(nfs_export_path))
265 goto out_devnametoolong; 265 goto out_devnametoolong;
266 len = snprintf(nfs_root_device, sizeof(nfs_root_device), 266 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
267 "%pI4:%s", &servaddr, nfs_export_path); 267 "%pI4:%s", &servaddr, nfs_export_path);
268 if (len > (int)sizeof(nfs_root_device)) 268 if (len >= (int)sizeof(nfs_root_device))
269 goto out_devnametoolong; 269 goto out_devnametoolong;
270 270
271 retval = 0; 271 retval = 0;
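
The switch from > to >= is a real off-by-one fix: snprintf() returns the length the formatted string would have had without truncation, so a return value exactly equal to the buffer size already means the final character was dropped to make room for the terminating NUL, a case the old > comparison silently accepted. A self-contained demonstration:

#include <stdio.h>

int main(void)
{
	char buf[8];
	/* 8 characters into an 8-byte buffer: snprintf() returns 8,
	 * but only "exactly" (7 chars + NUL) was actually stored */
	int len = snprintf(buf, sizeof(buf), "%s", "exactly8");

	if (len >= (int)sizeof(buf))
		printf("truncated: wanted %d bytes, kept \"%s\"\n", len, buf);
	return 0;
}
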
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9e5bc42180e4..24e1d7403c0b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
537static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, 537static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
538 struct nfs_page *prev, struct nfs_page *req) 538 struct nfs_page *prev, struct nfs_page *req)
539{ 539{
540 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
540 unsigned int size; 541 unsigned int size;
541 542
542 size = pnfs_generic_pg_test(pgio, prev, req); 543 size = pnfs_generic_pg_test(pgio, prev, req);
543 544
544 if (!size || pgio->pg_count + req->wb_bytes > 545 if (!size || mirror->pg_count + req->wb_bytes >
545 (unsigned long)pgio->pg_layout_private) 546 (unsigned long)pgio->pg_layout_private)
546 return 0; 547 return 0;
547 548
@@ -607,12 +608,14 @@ static const struct nfs_pageio_ops objio_pg_read_ops = {
607 .pg_init = objio_init_read, 608 .pg_init = objio_init_read,
608 .pg_test = objio_pg_test, 609 .pg_test = objio_pg_test,
609 .pg_doio = pnfs_generic_pg_readpages, 610 .pg_doio = pnfs_generic_pg_readpages,
611 .pg_cleanup = pnfs_generic_pg_cleanup,
610}; 612};
611 613
612static const struct nfs_pageio_ops objio_pg_write_ops = { 614static const struct nfs_pageio_ops objio_pg_write_ops = {
613 .pg_init = objio_init_write, 615 .pg_init = objio_init_write,
614 .pg_test = objio_pg_test, 616 .pg_test = objio_pg_test,
615 .pg_doio = pnfs_generic_pg_writepages, 617 .pg_doio = pnfs_generic_pg_writepages,
618 .pg_cleanup = pnfs_generic_pg_cleanup,
616}; 619};
617 620
618static struct pnfs_layoutdriver_type objlayout_type = { 621static struct pnfs_layoutdriver_type objlayout_type = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2b5e769beb16..d57190a0d533 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -42,21 +42,35 @@ static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
42 return p->pagevec != NULL; 42 return p->pagevec != NULL;
43} 43}
44 44
45struct nfs_pgio_mirror *
46nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
47{
48 return nfs_pgio_has_mirroring(desc) ?
49 &desc->pg_mirrors[desc->pg_mirror_idx] :
50 &desc->pg_mirrors[0];
51}
52EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
53
45void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, 54void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
46 struct nfs_pgio_header *hdr, 55 struct nfs_pgio_header *hdr,
47 void (*release)(struct nfs_pgio_header *hdr)) 56 void (*release)(struct nfs_pgio_header *hdr))
48{ 57{
49 hdr->req = nfs_list_entry(desc->pg_list.next); 58 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
59
60
61 hdr->req = nfs_list_entry(mirror->pg_list.next);
50 hdr->inode = desc->pg_inode; 62 hdr->inode = desc->pg_inode;
51 hdr->cred = hdr->req->wb_context->cred; 63 hdr->cred = hdr->req->wb_context->cred;
52 hdr->io_start = req_offset(hdr->req); 64 hdr->io_start = req_offset(hdr->req);
53 hdr->good_bytes = desc->pg_count; 65 hdr->good_bytes = mirror->pg_count;
54 hdr->dreq = desc->pg_dreq; 66 hdr->dreq = desc->pg_dreq;
55 hdr->layout_private = desc->pg_layout_private; 67 hdr->layout_private = desc->pg_layout_private;
56 hdr->release = release; 68 hdr->release = release;
57 hdr->completion_ops = desc->pg_completion_ops; 69 hdr->completion_ops = desc->pg_completion_ops;
58 if (hdr->completion_ops->init_hdr) 70 if (hdr->completion_ops->init_hdr)
59 hdr->completion_ops->init_hdr(hdr); 71 hdr->completion_ops->init_hdr(hdr);
72
73 hdr->pgio_mirror_idx = desc->pg_mirror_idx;
60} 74}
61EXPORT_SYMBOL_GPL(nfs_pgheader_init); 75EXPORT_SYMBOL_GPL(nfs_pgheader_init);
62 76
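
nfs_pgio_current_mirror() is the accessor the rest of the pagelist.c changes lean on: pg_count, pg_list, pg_base and friends move from the descriptor into a pg_mirrors[] array, and every former desc->pg_* access is rewritten as mirror->pg_* through this helper, which falls back to slot 0 when mirroring is off. A simplified model (struct and field names are illustrative):

#include <stdio.h>

struct mirror { unsigned long pg_count; };

struct desc {
	struct mirror pg_mirrors[4];
	unsigned int pg_mirror_count;
	unsigned int pg_mirror_idx;
};

static int has_mirroring(const struct desc *d)
{
	return d->pg_mirror_count > 1;
}

/* mirrored I/O uses the currently selected slot, plain I/O uses slot 0 */
static struct mirror *current_mirror(struct desc *d)
{
	return has_mirroring(d) ? &d->pg_mirrors[d->pg_mirror_idx]
				: &d->pg_mirrors[0];
}

int main(void)
{
	struct desc d = { .pg_mirror_count = 2, .pg_mirror_idx = 1 };

	current_mirror(&d)->pg_count += 4096;
	printf("mirror 1 holds %lu bytes\n", d.pg_mirrors[1].pg_count);
	return 0;
}
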
@@ -480,7 +494,10 @@ nfs_wait_on_request(struct nfs_page *req)
480size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, 494size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
481 struct nfs_page *prev, struct nfs_page *req) 495 struct nfs_page *prev, struct nfs_page *req)
482{ 496{
483 if (desc->pg_count > desc->pg_bsize) { 497 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
498
499
500 if (mirror->pg_count > mirror->pg_bsize) {
484 /* should never happen */ 501 /* should never happen */
485 WARN_ON_ONCE(1); 502 WARN_ON_ONCE(1);
486 return 0; 503 return 0;
@@ -490,11 +507,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
490 * Limit the request size so that we can still allocate a page array 507 * Limit the request size so that we can still allocate a page array
491 * for it without upsetting the slab allocator. 508 * for it without upsetting the slab allocator.
492 */ 509 */
493 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * 510 if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
494 sizeof(struct page) > PAGE_SIZE) 511 sizeof(struct page) > PAGE_SIZE)
495 return 0; 512 return 0;
496 513
497 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 514 return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
498} 515}
499EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 516EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
500 517
@@ -597,13 +614,14 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
597} 614}
598 615
599int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, 616int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
617 struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
600 const struct rpc_call_ops *call_ops, int how, int flags) 618 const struct rpc_call_ops *call_ops, int how, int flags)
601{ 619{
602 struct rpc_task *task; 620 struct rpc_task *task;
603 struct rpc_message msg = { 621 struct rpc_message msg = {
604 .rpc_argp = &hdr->args, 622 .rpc_argp = &hdr->args,
605 .rpc_resp = &hdr->res, 623 .rpc_resp = &hdr->res,
606 .rpc_cred = hdr->cred, 624 .rpc_cred = cred,
607 }; 625 };
608 struct rpc_task_setup task_setup_data = { 626 struct rpc_task_setup task_setup_data = {
609 .rpc_client = clnt, 627 .rpc_client = clnt,
@@ -616,7 +634,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
616 }; 634 };
617 int ret = 0; 635 int ret = 0;
618 636
619 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); 637 hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
620 638
621 dprintk("NFS: %5u initiated pgio call " 639 dprintk("NFS: %5u initiated pgio call "
622 "(req %s/%llu, %u bytes @ offset %llu)\n", 640 "(req %s/%llu, %u bytes @ offset %llu)\n",
@@ -650,10 +668,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
650static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, 668static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
651 struct nfs_pgio_header *hdr) 669 struct nfs_pgio_header *hdr)
652{ 670{
671 struct nfs_pgio_mirror *mirror;
672 u32 midx;
673
653 set_bit(NFS_IOHDR_REDO, &hdr->flags); 674 set_bit(NFS_IOHDR_REDO, &hdr->flags);
654 nfs_pgio_data_destroy(hdr); 675 nfs_pgio_data_destroy(hdr);
655 hdr->completion_ops->completion(hdr); 676 hdr->completion_ops->completion(hdr);
656 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 677 /* TODO: Make sure it's right to clean up all mirrors here
678 * and not just hdr->pgio_mirror_idx */
679 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
680 mirror = &desc->pg_mirrors[midx];
681 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
682 }
657 return -ENOMEM; 683 return -ENOMEM;
658} 684}
659 685
@@ -670,6 +696,17 @@ static void nfs_pgio_release(void *calldata)
670 hdr->completion_ops->completion(hdr); 696 hdr->completion_ops->completion(hdr);
671} 697}
672 698
699static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
700 unsigned int bsize)
701{
702 INIT_LIST_HEAD(&mirror->pg_list);
703 mirror->pg_bytes_written = 0;
704 mirror->pg_count = 0;
705 mirror->pg_bsize = bsize;
706 mirror->pg_base = 0;
707 mirror->pg_recoalesce = 0;
708}
709
673/** 710/**
674 * nfs_pageio_init - initialise a page io descriptor 711 * nfs_pageio_init - initialise a page io descriptor
675 * @desc: pointer to descriptor 712 * @desc: pointer to descriptor
@@ -686,13 +723,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
686 size_t bsize, 723 size_t bsize,
687 int io_flags) 724 int io_flags)
688{ 725{
689 INIT_LIST_HEAD(&desc->pg_list); 726 struct nfs_pgio_mirror *new;
690 desc->pg_bytes_written = 0; 727 int i;
691 desc->pg_count = 0; 728
692 desc->pg_bsize = bsize;
693 desc->pg_base = 0;
694 desc->pg_moreio = 0; 729 desc->pg_moreio = 0;
695 desc->pg_recoalesce = 0;
696 desc->pg_inode = inode; 730 desc->pg_inode = inode;
697 desc->pg_ops = pg_ops; 731 desc->pg_ops = pg_ops;
698 desc->pg_completion_ops = compl_ops; 732 desc->pg_completion_ops = compl_ops;
@@ -702,6 +736,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
702 desc->pg_lseg = NULL; 736 desc->pg_lseg = NULL;
703 desc->pg_dreq = NULL; 737 desc->pg_dreq = NULL;
704 desc->pg_layout_private = NULL; 738 desc->pg_layout_private = NULL;
739 desc->pg_bsize = bsize;
740
741 desc->pg_mirror_count = 1;
742 desc->pg_mirror_idx = 0;
743
744 if (pg_ops->pg_get_mirror_count) {
745 /* until we have a request, we don't have an lseg and no
746 * idea how many mirrors there will be */
747 new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
748 sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
749 desc->pg_mirrors_dynamic = new;
750 desc->pg_mirrors = new;
751
752 for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
753 nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
754 } else {
755 desc->pg_mirrors_dynamic = NULL;
756 desc->pg_mirrors = desc->pg_mirrors_static;
757 nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
758 }
705} 759}
706EXPORT_SYMBOL_GPL(nfs_pageio_init); 760EXPORT_SYMBOL_GPL(nfs_pageio_init);
707 761
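
nfs_pageio_init() now chooses between two storages for the mirror array: layout drivers that implement pg_get_mirror_count get a heap allocation sized for the maximum (the real count is unknown until the first request arrives), everyone else points pg_mirrors at the embedded single slot. A sketch of that static-versus-dynamic choice (MIRROR_MAX and the field names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define MIRROR_MAX 4

struct mirror { unsigned long pg_count; };

struct desc {
	struct mirror *pg_mirrors;
	struct mirror pg_mirrors_static[1];
	struct mirror *pg_mirrors_dynamic;
};

static int desc_init(struct desc *d, int may_mirror)
{
	if (may_mirror) {
		/* size for the worst case; the count is learned later */
		d->pg_mirrors_dynamic =
			calloc(MIRROR_MAX, sizeof(struct mirror));
		if (d->pg_mirrors_dynamic == NULL)
			return -1;
		d->pg_mirrors = d->pg_mirrors_dynamic;
	} else {
		d->pg_mirrors_dynamic = NULL;
		d->pg_mirrors = d->pg_mirrors_static;
	}
	return 0;
}

int main(void)
{
	struct desc d;

	if (desc_init(&d, 1) == 0) {
		printf("using %s storage\n",
		       d.pg_mirrors_dynamic ? "dynamic" : "static");
		free(d.pg_mirrors_dynamic);
	}
	return 0;
}
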
@@ -737,14 +791,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
737int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, 791int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
738 struct nfs_pgio_header *hdr) 792 struct nfs_pgio_header *hdr)
739{ 793{
794 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
795
740 struct nfs_page *req; 796 struct nfs_page *req;
741 struct page **pages, 797 struct page **pages,
742 *last_page; 798 *last_page;
743 struct list_head *head = &desc->pg_list; 799 struct list_head *head = &mirror->pg_list;
744 struct nfs_commit_info cinfo; 800 struct nfs_commit_info cinfo;
745 unsigned int pagecount, pageused; 801 unsigned int pagecount, pageused;
746 802
747 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); 803 pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
748 if (!nfs_pgarray_set(&hdr->page_array, pagecount)) 804 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
749 return nfs_pgio_error(desc, hdr); 805 return nfs_pgio_error(desc, hdr);
750 806
@@ -772,7 +828,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
772 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 828 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
773 829
774 /* Set up the argument struct */ 830 /* Set up the argument struct */
775 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 831 nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
776 desc->pg_rpc_callops = &nfs_pgio_common_ops; 832 desc->pg_rpc_callops = &nfs_pgio_common_ops;
777 return 0; 833 return 0;
778} 834}
@@ -780,23 +836,74 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
780 836
781static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 837static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
782{ 838{
839 struct nfs_pgio_mirror *mirror;
783 struct nfs_pgio_header *hdr; 840 struct nfs_pgio_header *hdr;
784 int ret; 841 int ret;
785 842
843 mirror = nfs_pgio_current_mirror(desc);
844
786 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 845 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
787 if (!hdr) { 846 if (!hdr) {
788 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 847 /* TODO: make sure this is right with mirroring - or
848 * should it back out all mirrors? */
849 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
789 return -ENOMEM; 850 return -ENOMEM;
790 } 851 }
791 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); 852 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
792 ret = nfs_generic_pgio(desc, hdr); 853 ret = nfs_generic_pgio(desc, hdr);
793 if (ret == 0) 854 if (ret == 0)
794 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 855 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
795 hdr, desc->pg_rpc_callops, 856 hdr,
857 hdr->cred,
858 NFS_PROTO(hdr->inode),
859 desc->pg_rpc_callops,
796 desc->pg_ioflags, 0); 860 desc->pg_ioflags, 0);
797 return ret; 861 return ret;
798} 862}
799 863
864/*
865 * nfs_pageio_setup_mirroring - determine if mirroring is to be used
866 * by calling the pg_get_mirror_count op
867 */
868static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
869 struct nfs_page *req)
870{
871 int mirror_count = 1;
872
873 if (!pgio->pg_ops->pg_get_mirror_count)
874 return 0;
875
876 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
877
878 if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
879 return -EINVAL;
880
881 if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
882 return -EINVAL;
883
884 pgio->pg_mirror_count = mirror_count;
885
886 return 0;
887}
888
889/*
890 * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
891 */
892void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
893{
894 pgio->pg_mirror_count = 1;
895 pgio->pg_mirror_idx = 0;
896}
897
898static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
899{
900 pgio->pg_mirror_count = 1;
901 pgio->pg_mirror_idx = 0;
902 pgio->pg_mirrors = pgio->pg_mirrors_static;
903 kfree(pgio->pg_mirrors_dynamic);
904 pgio->pg_mirrors_dynamic = NULL;
905}
906
800static bool nfs_match_open_context(const struct nfs_open_context *ctx1, 907static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
801 const struct nfs_open_context *ctx2) 908 const struct nfs_open_context *ctx2)
802{ 909{
@@ -826,11 +933,15 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
826 struct nfs_pageio_descriptor *pgio) 933 struct nfs_pageio_descriptor *pgio)
827{ 934{
828 size_t size; 935 size_t size;
936 struct file_lock_context *flctx;
829 937
830 if (prev) { 938 if (prev) {
831 if (!nfs_match_open_context(req->wb_context, prev->wb_context)) 939 if (!nfs_match_open_context(req->wb_context, prev->wb_context))
832 return false; 940 return false;
833 if (req->wb_context->dentry->d_inode->i_flock != NULL && 941 flctx = req->wb_context->dentry->d_inode->i_flctx;
942 if (flctx != NULL &&
943 !(list_empty_careful(&flctx->flc_posix) &&
944 list_empty_careful(&flctx->flc_flock)) &&
834 !nfs_match_lock_context(req->wb_lock_context, 945 !nfs_match_lock_context(req->wb_lock_context,
835 prev->wb_lock_context)) 946 prev->wb_lock_context))
836 return false; 947 return false;
@@ -863,19 +974,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
863static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 974static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
864 struct nfs_page *req) 975 struct nfs_page *req)
865{ 976{
977 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
978
866 struct nfs_page *prev = NULL; 979 struct nfs_page *prev = NULL;
867 if (desc->pg_count != 0) { 980
868 prev = nfs_list_entry(desc->pg_list.prev); 981 if (mirror->pg_count != 0) {
982 prev = nfs_list_entry(mirror->pg_list.prev);
869 } else { 983 } else {
870 if (desc->pg_ops->pg_init) 984 if (desc->pg_ops->pg_init)
871 desc->pg_ops->pg_init(desc, req); 985 desc->pg_ops->pg_init(desc, req);
872 desc->pg_base = req->wb_pgbase; 986 mirror->pg_base = req->wb_pgbase;
873 } 987 }
874 if (!nfs_can_coalesce_requests(prev, req, desc)) 988 if (!nfs_can_coalesce_requests(prev, req, desc))
875 return 0; 989 return 0;
876 nfs_list_remove_request(req); 990 nfs_list_remove_request(req);
877 nfs_list_add_request(req, &desc->pg_list); 991 nfs_list_add_request(req, &mirror->pg_list);
878 desc->pg_count += req->wb_bytes; 992 mirror->pg_count += req->wb_bytes;
879 return 1; 993 return 1;
880} 994}
881 995
@@ -884,16 +998,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
884 */ 998 */
885static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 999static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
886{ 1000{
887 if (!list_empty(&desc->pg_list)) { 1001 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1002
1003
1004 if (!list_empty(&mirror->pg_list)) {
888 int error = desc->pg_ops->pg_doio(desc); 1005 int error = desc->pg_ops->pg_doio(desc);
889 if (error < 0) 1006 if (error < 0)
890 desc->pg_error = error; 1007 desc->pg_error = error;
891 else 1008 else
892 desc->pg_bytes_written += desc->pg_count; 1009 mirror->pg_bytes_written += mirror->pg_count;
893 } 1010 }
894 if (list_empty(&desc->pg_list)) { 1011 if (list_empty(&mirror->pg_list)) {
895 desc->pg_count = 0; 1012 mirror->pg_count = 0;
896 desc->pg_base = 0; 1013 mirror->pg_base = 0;
897 } 1014 }
898} 1015}
899 1016
@@ -911,6 +1028,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
911static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1028static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
912 struct nfs_page *req) 1029 struct nfs_page *req)
913{ 1030{
1031 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1032
914 struct nfs_page *subreq; 1033 struct nfs_page *subreq;
915 unsigned int bytes_left = 0; 1034 unsigned int bytes_left = 0;
916 unsigned int offset, pgbase; 1035 unsigned int offset, pgbase;
@@ -934,7 +1053,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
934 nfs_pageio_doio(desc); 1053 nfs_pageio_doio(desc);
935 if (desc->pg_error < 0) 1054 if (desc->pg_error < 0)
936 return 0; 1055 return 0;
937 if (desc->pg_recoalesce) 1056 if (mirror->pg_recoalesce)
938 return 0; 1057 return 0;
939 /* retry add_request for this subreq */ 1058 /* retry add_request for this subreq */
940 nfs_page_group_lock(req, false); 1059 nfs_page_group_lock(req, false);
@@ -972,14 +1091,16 @@ err_ptr:
972 1091
973static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) 1092static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
974{ 1093{
1094 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
975 LIST_HEAD(head); 1095 LIST_HEAD(head);
976 1096
977 do { 1097 do {
978 list_splice_init(&desc->pg_list, &head); 1098 list_splice_init(&mirror->pg_list, &head);
979 desc->pg_bytes_written -= desc->pg_count; 1099 mirror->pg_bytes_written -= mirror->pg_count;
980 desc->pg_count = 0; 1100 mirror->pg_count = 0;
981 desc->pg_base = 0; 1101 mirror->pg_base = 0;
982 desc->pg_recoalesce = 0; 1102 mirror->pg_recoalesce = 0;
1103
983 desc->pg_moreio = 0; 1104 desc->pg_moreio = 0;
984 1105
985 while (!list_empty(&head)) { 1106 while (!list_empty(&head)) {
@@ -993,11 +1114,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
993 return 0; 1114 return 0;
994 break; 1115 break;
995 } 1116 }
996 } while (desc->pg_recoalesce); 1117 } while (mirror->pg_recoalesce);
997 return 1; 1118 return 1;
998} 1119}
999 1120
1000int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1121static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
1001 struct nfs_page *req) 1122 struct nfs_page *req)
1002{ 1123{
1003 int ret; 1124 int ret;
@@ -1010,9 +1131,80 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1010 break; 1131 break;
1011 ret = nfs_do_recoalesce(desc); 1132 ret = nfs_do_recoalesce(desc);
1012 } while (ret); 1133 } while (ret);
1134
1013 return ret; 1135 return ret;
1014} 1136}
1015 1137
1138int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1139 struct nfs_page *req)
1140{
1141 u32 midx;
1142 unsigned int pgbase, offset, bytes;
1143 struct nfs_page *dupreq, *lastreq;
1144
1145 pgbase = req->wb_pgbase;
1146 offset = req->wb_offset;
1147 bytes = req->wb_bytes;
1148
1149 nfs_pageio_setup_mirroring(desc, req);
1150
1151 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1152 if (midx) {
1153 nfs_page_group_lock(req, false);
1154
1155 /* find the last request */
1156 for (lastreq = req->wb_head;
1157 lastreq->wb_this_page != req->wb_head;
1158 lastreq = lastreq->wb_this_page)
1159 ;
1160
1161 dupreq = nfs_create_request(req->wb_context,
1162 req->wb_page, lastreq, pgbase, bytes);
1163
1164 if (IS_ERR(dupreq)) {
1165 nfs_page_group_unlock(req);
1166 return 0;
1167 }
1168
1169 nfs_lock_request(dupreq);
1170 nfs_page_group_unlock(req);
1171 dupreq->wb_offset = offset;
1172 dupreq->wb_index = req->wb_index;
1173 } else
1174 dupreq = req;
1175
1176 if (nfs_pgio_has_mirroring(desc))
1177 desc->pg_mirror_idx = midx;
1178 if (!nfs_pageio_add_request_mirror(desc, dupreq))
1179 return 0;
1180 }
1181
1182 return 1;
1183}
1184
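The new nfs_pageio_add_request() above fans each request out to every mirror: mirror 0 queues the original nfs_page, and each further mirror queues a duplicate created via nfs_create_request(). A rough userspace model of the fan-out loop (the request type and add helper are stand-ins, not the kernel API):

/* userspace sketch, not kernel code */
#include <stdio.h>

#define MIRROR_COUNT 3

struct req { long offset; int bytes; };

static int add_request_mirror(int midx, const struct req *r)
{
	printf("mirror %d: queued offset=%ld bytes=%d\n",
	       midx, r->offset, r->bytes);
	return 1;	/* 0 would mean "caller must bail out" */
}

int main(void)
{
	struct req orig = { 4096, 512 };

	for (int midx = 0; midx < MIRROR_COUNT; midx++) {
		struct req dup = orig;	/* plays nfs_create_request() */
		if (!add_request_mirror(midx, midx ? &dup : &orig))
			return 1;	/* abort on first failure, as above */
	}
	return 0;
}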
1185/*
1186 * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
1187 * nfs_pageio_descriptor
1188 * @desc: pointer to io descriptor
1189 */
1190static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
1191 u32 mirror_idx)
1192{
1193 struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
1194 u32 restore_idx = desc->pg_mirror_idx;
1195
1196 if (nfs_pgio_has_mirroring(desc))
1197 desc->pg_mirror_idx = mirror_idx;
1198 for (;;) {
1199 nfs_pageio_doio(desc);
1200 if (!mirror->pg_recoalesce)
1201 break;
1202 if (!nfs_do_recoalesce(desc))
1203 break;
1204 }
1205 desc->pg_mirror_idx = restore_idx;
1206}
1207
1016/* 1208/*
1017 * nfs_pageio_resend - Transfer requests to new descriptor and resend 1209 * nfs_pageio_resend - Transfer requests to new descriptor and resend
1018 * @hdr - the pgio header to move request from 1210 * @hdr - the pgio header to move request from
@@ -1046,18 +1238,19 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
1046EXPORT_SYMBOL_GPL(nfs_pageio_resend); 1238EXPORT_SYMBOL_GPL(nfs_pageio_resend);
1047 1239
1048/** 1240/**
1049 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
 1241 * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
1050 * @desc: pointer to io descriptor 1242 * @desc: pointer to io descriptor
1051 */ 1243 */
1052void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 1244void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1053{ 1245{
1054	for (;;) {
1055		nfs_pageio_doio(desc);
1056		if (!desc->pg_recoalesce)
1057			break;
1058		if (!nfs_do_recoalesce(desc))
1059			break;
1060	}
 1246	u32 midx;
 1247
 1248	for (midx = 0; midx < desc->pg_mirror_count; midx++)
 1249		nfs_pageio_complete_mirror(desc, midx);
 1250
 1251	if (desc->pg_ops->pg_cleanup)
 1252		desc->pg_ops->pg_cleanup(desc);
1253 nfs_pageio_cleanup_mirroring(desc);
1061} 1254}
1062 1255
1063/** 1256/**
@@ -1073,10 +1266,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1073 */ 1266 */
1074void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) 1267void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
1075{ 1268{
1076	if (!list_empty(&desc->pg_list)) {
1077		struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);
1078		if (index != prev->wb_index + 1)
1079			nfs_pageio_complete(desc);
 1269	struct nfs_pgio_mirror *mirror;
 1270	struct nfs_page *prev;
 1271	u32 midx;
 1272
1273 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1274 mirror = &desc->pg_mirrors[midx];
1275 if (!list_empty(&mirror->pg_list)) {
1276 prev = nfs_list_entry(mirror->pg_list.prev);
1277 if (index != prev->wb_index + 1)
1278 nfs_pageio_complete_mirror(desc, midx);
1279 }
1080 } 1280 }
1081} 1281}
1082 1282
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0a5dda4d85c2..4f802b02fbb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -34,6 +34,7 @@
34#include "pnfs.h" 34#include "pnfs.h"
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h" 36#include "nfs4trace.h"
37#include "delegation.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_PNFS 39#define NFSDBG_FACILITY NFSDBG_PNFS
39#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 40#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
50 */ 51 */
51static LIST_HEAD(pnfs_modules_tbl); 52static LIST_HEAD(pnfs_modules_tbl);
52 53
54static int
55pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
56 enum pnfs_iomode iomode, bool sync);
57
53/* Return the registered pnfs layout driver module matching given id */ 58/* Return the registered pnfs layout driver module matching given id */
54static struct pnfs_layoutdriver_type * 59static struct pnfs_layoutdriver_type *
55find_pnfs_driver_locked(u32 id) 60find_pnfs_driver_locked(u32 id)
@@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
238 struct inode *inode = lo->plh_inode; 243 struct inode *inode = lo->plh_inode;
239 244
240 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 245 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
246 if (!list_empty(&lo->plh_segs))
247 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
241 pnfs_detach_layout_hdr(lo); 248 pnfs_detach_layout_hdr(lo);
242 spin_unlock(&inode->i_lock); 249 spin_unlock(&inode->i_lock);
243 pnfs_free_layout_hdr(lo); 250 pnfs_free_layout_hdr(lo);
@@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
337 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 344 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
338} 345}
339 346
347/* Return true if layoutreturn is needed */
348static bool
349pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
350 struct pnfs_layout_segment *lseg)
351{
352 struct pnfs_layout_segment *s;
353
354 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
355 return false;
356
357 list_for_each_entry(s, &lo->plh_segs, pls_list)
358 if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
359 return false;
360
361 return true;
362}
363
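pnfs_layout_need_return() above fires only when the segment being dropped carries NFS_LSEG_LAYOUTRETURN and no other segment in the layout still does, i.e. the last flagged segment is what triggers the LAYOUTRETURN. A small sketch of that predicate (illustrative types only):

/* userspace sketch, not kernel code */
#include <stdbool.h>
#include <stdio.h>

struct lseg { bool layoutreturn; };

static bool need_return(const struct lseg *segs, int n, int dropping)
{
	if (!segs[dropping].layoutreturn)
		return false;
	for (int i = 0; i < n; i++)
		if (i != dropping && segs[i].layoutreturn)
			return false;	/* another flagged segment still live */
	return true;
}

int main(void)
{
	struct lseg segs[3] = { {true}, {false}, {true} };

	printf("%d\n", need_return(segs, 3, 0));	/* 0: seg 2 still flagged */
	segs[2].layoutreturn = false;
	printf("%d\n", need_return(segs, 3, 0));	/* 1: last flagged segment */
	return 0;
}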
364static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
365 struct pnfs_layout_hdr *lo, struct inode *inode)
366{
367 lo = lseg->pls_layout;
368 inode = lo->plh_inode;
369
370 spin_lock(&inode->i_lock);
371 if (pnfs_layout_need_return(lo, lseg)) {
372 nfs4_stateid stateid;
373 enum pnfs_iomode iomode;
374
375 stateid = lo->plh_stateid;
376 iomode = lo->plh_return_iomode;
377 /* decreased in pnfs_send_layoutreturn() */
378 lo->plh_block_lgets++;
379 lo->plh_return_iomode = 0;
380 spin_unlock(&inode->i_lock);
381 pnfs_get_layout_hdr(lo);
382
 383		/* Send an async layoutreturn so we don't deadlock */
384 pnfs_send_layoutreturn(lo, stateid, iomode, false);
385 } else
386 spin_unlock(&inode->i_lock);
387}
388
340void 389void
341pnfs_put_lseg(struct pnfs_layout_segment *lseg) 390pnfs_put_lseg(struct pnfs_layout_segment *lseg)
342{ 391{
@@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
349 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 398 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
350 atomic_read(&lseg->pls_refcount), 399 atomic_read(&lseg->pls_refcount),
351 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 400 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
401
402 /* Handle the case where refcount != 1 */
403 if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
404 return;
405
352 lo = lseg->pls_layout; 406 lo = lseg->pls_layout;
353 inode = lo->plh_inode; 407 inode = lo->plh_inode;
408 /* Do we need a layoutreturn? */
409 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
410 pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
411
354 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 412 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
355 pnfs_get_layout_hdr(lo); 413 pnfs_get_layout_hdr(lo);
356 pnfs_layout_remove_lseg(lo, lseg); 414 pnfs_layout_remove_lseg(lo, lseg);
@@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
543 pnfs_get_layout_hdr(lo); 601 pnfs_get_layout_hdr(lo);
544 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 602 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
545 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 603 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
604 pnfs_clear_retry_layoutget(lo);
546 spin_unlock(&nfsi->vfs_inode.i_lock); 605 spin_unlock(&nfsi->vfs_inode.i_lock);
547 pnfs_free_lseg_list(&tmp_list); 606 pnfs_free_lseg_list(&tmp_list);
548 pnfs_put_layout_hdr(lo); 607 pnfs_put_layout_hdr(lo);
@@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
740 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 799 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
741} 800}
742 801
802static bool
803pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
804 struct pnfs_layout_range *range)
805{
806 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
807 (lo->plh_return_iomode == IOMODE_ANY ||
808 lo->plh_return_iomode == range->iomode);
809}
810
743/* lget is set to 1 if called from inside send_layoutget call chain */ 811/* lget is set to 1 if called from inside send_layoutget call chain */
744static bool 812static bool
745pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
 813pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
 814			struct pnfs_layout_range *range, int lget)
746{ 815{
747 return lo->plh_block_lgets || 816 return lo->plh_block_lgets ||
748 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 817 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
749 (list_empty(&lo->plh_segs) && 818 (list_empty(&lo->plh_segs) &&
750		(atomic_read(&lo->plh_outstanding) > lget));
 819		(atomic_read(&lo->plh_outstanding) > lget)) ||
820 pnfs_layout_returning(lo, range);
751} 821}
752 822
753int 823int
754pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 824pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
825 struct pnfs_layout_range *range,
755 struct nfs4_state *open_state) 826 struct nfs4_state *open_state)
756{ 827{
757 int status = 0; 828 int status = 0;
758 829
759 dprintk("--> %s\n", __func__); 830 dprintk("--> %s\n", __func__);
760 spin_lock(&lo->plh_inode->i_lock); 831 spin_lock(&lo->plh_inode->i_lock);
761	if (pnfs_layoutgets_blocked(lo, 1)) {
 832	if (pnfs_layoutgets_blocked(lo, range, 1)) {
762 status = -EAGAIN; 833 status = -EAGAIN;
763 } else if (!nfs4_valid_open_stateid(open_state)) { 834 } else if (!nfs4_valid_open_stateid(open_state)) {
764 status = -EBADF; 835 status = -EBADF;
@@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
825 pnfs_layout_io_set_failed(lo, range->iomode); 896 pnfs_layout_io_set_failed(lo, range->iomode);
826 } 897 }
827 return NULL; 898 return NULL;
828	}
 899	} else
900 pnfs_layout_clear_fail_bit(lo,
901 pnfs_iomode_to_fail_bit(range->iomode));
829 902
830 return lseg; 903 return lseg;
831} 904}
@@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
845 } 918 }
846} 919}
847 920
921void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
922{
923 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
924 smp_mb__after_atomic();
925 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
926}
927
928static int
929pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
930 enum pnfs_iomode iomode, bool sync)
931{
932 struct inode *ino = lo->plh_inode;
933 struct nfs4_layoutreturn *lrp;
934 int status = 0;
935
936 lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
937 if (unlikely(lrp == NULL)) {
938 status = -ENOMEM;
939 spin_lock(&ino->i_lock);
940 lo->plh_block_lgets--;
941 pnfs_clear_layoutreturn_waitbit(lo);
942 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
943 spin_unlock(&ino->i_lock);
944 pnfs_put_layout_hdr(lo);
945 goto out;
946 }
947
948 lrp->args.stateid = stateid;
949 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
950 lrp->args.inode = ino;
951 lrp->args.range.iomode = iomode;
952 lrp->args.range.offset = 0;
953 lrp->args.range.length = NFS4_MAX_UINT64;
954 lrp->args.layout = lo;
955 lrp->clp = NFS_SERVER(ino)->nfs_client;
956 lrp->cred = lo->plh_lc_cred;
957
958 status = nfs4_proc_layoutreturn(lrp, sync);
959out:
960 dprintk("<-- %s status: %d\n", __func__, status);
961 return status;
962}
963
848/* 964/*
849 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 965 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
850 * when the layout segment list is empty. 966 * when the layout segment list is empty.
@@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino)
859 struct pnfs_layout_hdr *lo = NULL; 975 struct pnfs_layout_hdr *lo = NULL;
860 struct nfs_inode *nfsi = NFS_I(ino); 976 struct nfs_inode *nfsi = NFS_I(ino);
861 LIST_HEAD(tmp_list); 977 LIST_HEAD(tmp_list);
862 struct nfs4_layoutreturn *lrp;
863 nfs4_stateid stateid; 978 nfs4_stateid stateid;
864 int status = 0, empty; 979 int status = 0, empty;
865 980
@@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino)
901 spin_unlock(&ino->i_lock); 1016 spin_unlock(&ino->i_lock);
902 pnfs_free_lseg_list(&tmp_list); 1017 pnfs_free_lseg_list(&tmp_list);
903 1018
904	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
 1019	status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
905 if (unlikely(lrp == NULL)) {
906 status = -ENOMEM;
907 spin_lock(&ino->i_lock);
908 lo->plh_block_lgets--;
909 spin_unlock(&ino->i_lock);
910 pnfs_put_layout_hdr(lo);
911 goto out;
912 }
913
914 lrp->args.stateid = stateid;
915 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
916 lrp->args.inode = ino;
917 lrp->args.layout = lo;
918 lrp->clp = NFS_SERVER(ino)->nfs_client;
919 lrp->cred = lo->plh_lc_cred;
920
921 status = nfs4_proc_layoutreturn(lrp);
922out: 1020out:
923 dprintk("<-- %s status: %d\n", __func__, status); 1021 dprintk("<-- %s status: %d\n", __func__, status);
924 return status; 1022 return status;
@@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode)
954 1052
955bool pnfs_roc(struct inode *ino) 1053bool pnfs_roc(struct inode *ino)
956{ 1054{
1055 struct nfs_inode *nfsi = NFS_I(ino);
1056 struct nfs_open_context *ctx;
1057 struct nfs4_state *state;
957 struct pnfs_layout_hdr *lo; 1058 struct pnfs_layout_hdr *lo;
958 struct pnfs_layout_segment *lseg, *tmp; 1059 struct pnfs_layout_segment *lseg, *tmp;
1060 nfs4_stateid stateid;
959 LIST_HEAD(tmp_list); 1061 LIST_HEAD(tmp_list);
960	bool found = false;
 1062	bool found = false, layoutreturn = false;
961 1063
962 spin_lock(&ino->i_lock); 1064 spin_lock(&ino->i_lock);
963	lo = NFS_I(ino)->layout;
 1065	lo = nfsi->layout;
964 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1066 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
965 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1067 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
966		goto out_nolayout;
 1068		goto out_noroc;
1069
1070 /* Don't return layout if we hold a delegation */
1071 if (nfs4_check_delegation(ino, FMODE_READ))
1072 goto out_noroc;
1073
1074 list_for_each_entry(ctx, &nfsi->open_files, list) {
1075 state = ctx->state;
1076 /* Don't return layout if there is open file state */
1077 if (state != NULL && state->state != 0)
1078 goto out_noroc;
1079 }
1080
1081 pnfs_clear_retry_layoutget(lo);
967 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1082 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
968 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1083 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
969 mark_lseg_invalid(lseg, &tmp_list); 1084 mark_lseg_invalid(lseg, &tmp_list);
970 found = true; 1085 found = true;
971 } 1086 }
972 if (!found) 1087 if (!found)
973		goto out_nolayout;
 1088		goto out_noroc;
974 lo->plh_block_lgets++; 1089 lo->plh_block_lgets++;
975 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1090 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
976 spin_unlock(&ino->i_lock); 1091 spin_unlock(&ino->i_lock);
977 pnfs_free_lseg_list(&tmp_list); 1092 pnfs_free_lseg_list(&tmp_list);
978 return true; 1093 return true;
979 1094
980out_nolayout:
 1095out_noroc:
1096 if (lo) {
1097 stateid = lo->plh_stateid;
1098 layoutreturn =
1099 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1100 &lo->plh_flags);
1101 if (layoutreturn) {
1102 lo->plh_block_lgets++;
1103 pnfs_get_layout_hdr(lo);
1104 }
1105 }
981 spin_unlock(&ino->i_lock); 1106 spin_unlock(&ino->i_lock);
1107 if (layoutreturn)
1108 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
982 return false; 1109 return false;
983} 1110}
984 1111
@@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1013 struct nfs_inode *nfsi = NFS_I(ino); 1140 struct nfs_inode *nfsi = NFS_I(ino);
1014 struct pnfs_layout_hdr *lo; 1141 struct pnfs_layout_hdr *lo;
1015 struct pnfs_layout_segment *lseg; 1142 struct pnfs_layout_segment *lseg;
1143 nfs4_stateid stateid;
1016 u32 current_seqid; 1144 u32 current_seqid;
1017	bool found = false;
 1145	bool found = false, layoutreturn = false;
1018 1146
1019 spin_lock(&ino->i_lock); 1147 spin_lock(&ino->i_lock);
1020 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 1148 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
@@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1031 */ 1159 */
1032 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1160 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1033out: 1161out:
1162 if (!found) {
1163 stateid = lo->plh_stateid;
1164 layoutreturn =
1165 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1166 &lo->plh_flags);
1167 if (layoutreturn) {
1168 lo->plh_block_lgets++;
1169 pnfs_get_layout_hdr(lo);
1170 }
1171 }
1034 spin_unlock(&ino->i_lock); 1172 spin_unlock(&ino->i_lock);
1173 if (layoutreturn) {
1174 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1175 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
1176 }
1035 return found; 1177 return found;
1036} 1178}
1037 1179
@@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1178 1320
1179 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1321 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1180 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1322 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1323 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1181 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1324 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1182 ret = pnfs_get_lseg(lseg); 1325 ret = pnfs_get_lseg(lseg);
1183 break; 1326 break;
@@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1266 return ret; 1409 return ret;
1267} 1410}
1268 1411
1412/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
1413static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1414{
1415 if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
1416 return 1;
1417 return nfs_wait_bit_killable(key);
1418}
1419
1420static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1421{
1422 /*
1423 * send layoutcommit as it can hold up layoutreturn due to lseg
1424 * reference
1425 */
1426 pnfs_layoutcommit_inode(lo->plh_inode, false);
1427 return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1428 pnfs_layoutget_retry_bit_wait,
1429 TASK_UNINTERRUPTIBLE);
1430}
1431
1432static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1433{
1434 unsigned long *bitlock = &lo->plh_flags;
1435
1436 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1437 smp_mb__after_atomic();
1438 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1439}
1440
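pnfs_clear_first_layoutget() above pairs clear_bit_unlock() with wake_up_bit() so that tasks parked in wait_on_bit() resume once the first LAYOUTGET completes. The same handshake modeled in userspace with a mutex and condition variable (an analogy only; the kernel wait-bit machinery does not take a lock like this):

/* userspace sketch, not kernel code; link with -lpthread */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t c = PTHREAD_COND_INITIALIZER;
static unsigned long flags = 1UL;	/* bit 0 plays NFS_LAYOUT_FIRST_LAYOUTGET */

static void *waiter(void *arg)
{
	pthread_mutex_lock(&m);
	while (flags & 1UL)		/* wait_on_bit(..., TASK_UNINTERRUPTIBLE) */
		pthread_cond_wait(&c, &m);
	pthread_mutex_unlock(&m);
	puts("waiter: first LAYOUTGET finished, retrying lookup");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&m);
	flags &= ~1UL;			/* clear_bit_unlock() */
	pthread_cond_broadcast(&c);	/* wake_up_bit() */
	pthread_mutex_unlock(&m);
	pthread_join(t, NULL);
	return 0;
}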
1269/* 1441/*
1270 * Layout segment is retrieved from the server if not cached. 1442 * Layout segment is retrieved from the server if not cached.
1271 * The appropriate layout segment is referenced and returned to the caller. 1443 * The appropriate layout segment is referenced and returned to the caller.
@@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino,
1296 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1468 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1297 goto out; 1469 goto out;
1298 1470
1471lookup_again:
1472 first = false;
1299 spin_lock(&ino->i_lock); 1473 spin_lock(&ino->i_lock);
1300 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1474 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1301 if (lo == NULL) { 1475 if (lo == NULL) {
@@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino,
1310 } 1484 }
1311 1485
1312 /* if LAYOUTGET already failed once we don't try again */ 1486 /* if LAYOUTGET already failed once we don't try again */
1313	if (pnfs_layout_io_test_failed(lo, iomode))
 1487	if (pnfs_layout_io_test_failed(lo, iomode) &&
1488 !pnfs_should_retry_layoutget(lo))
1314 goto out_unlock; 1489 goto out_unlock;
1315 1490
1316	/* Check to see if the layout for the given range already exists */
1317	lseg = pnfs_find_lseg(lo, &arg);
1318	if (lseg)
1319		goto out_unlock;
 1491	first = list_empty(&lo->plh_segs);
 1492	if (first) {
 1493		/* The first layoutget for the file. Need to serialize per
 1494		 * RFC 5661 Errata 3208.
1495 */
1496 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1497 &lo->plh_flags)) {
1498 spin_unlock(&ino->i_lock);
1499 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1500 TASK_UNINTERRUPTIBLE);
1501 pnfs_put_layout_hdr(lo);
1502 goto lookup_again;
1503 }
1504 } else {
1505 /* Check to see if the layout for the given range
1506 * already exists
1507 */
1508 lseg = pnfs_find_lseg(lo, &arg);
1509 if (lseg)
1510 goto out_unlock;
1511 }
1512
1513 /*
1514 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1515 * for LAYOUTRETURN even if first is true.
1516 */
1517 if (!lseg && pnfs_should_retry_layoutget(lo) &&
1518 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1519 spin_unlock(&ino->i_lock);
1520 dprintk("%s wait for layoutreturn\n", __func__);
1521 if (pnfs_prepare_to_retry_layoutget(lo)) {
1522 if (first)
1523 pnfs_clear_first_layoutget(lo);
1524 pnfs_put_layout_hdr(lo);
1525 dprintk("%s retrying\n", __func__);
1526 goto lookup_again;
1527 }
1528 goto out_put_layout_hdr;
1529 }
1320 1530
1321	if (pnfs_layoutgets_blocked(lo, 0))
 1531	if (pnfs_layoutgets_blocked(lo, &arg, 0))
1322 goto out_unlock; 1532 goto out_unlock;
1323 atomic_inc(&lo->plh_outstanding); 1533 atomic_inc(&lo->plh_outstanding);
1324
1325 first = list_empty(&lo->plh_layouts) ? true : false;
1326 spin_unlock(&ino->i_lock); 1534 spin_unlock(&ino->i_lock);
1327 1535
1328	if (first) {
 1536	if (list_empty(&lo->plh_layouts)) {
1329 /* The lo must be on the clp list if there is any 1537 /* The lo must be on the clp list if there is any
1330 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1538 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1331 */ 1539 */
1332 spin_lock(&clp->cl_lock); 1540 spin_lock(&clp->cl_lock);
1333		list_add_tail(&lo->plh_layouts, &server->layouts);
 1541		if (list_empty(&lo->plh_layouts))
1542 list_add_tail(&lo->plh_layouts, &server->layouts);
1334 spin_unlock(&clp->cl_lock); 1543 spin_unlock(&clp->cl_lock);
1335 } 1544 }
1336 1545
@@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino,
1343 arg.length = PAGE_CACHE_ALIGN(arg.length); 1552 arg.length = PAGE_CACHE_ALIGN(arg.length);
1344 1553
1345 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1554 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1555 pnfs_clear_retry_layoutget(lo);
1346 atomic_dec(&lo->plh_outstanding); 1556 atomic_dec(&lo->plh_outstanding);
1347out_put_layout_hdr: 1557out_put_layout_hdr:
1558 if (first)
1559 pnfs_clear_first_layoutget(lo);
1348 pnfs_put_layout_hdr(lo); 1560 pnfs_put_layout_hdr(lo);
1349out: 1561out:
1350 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1562 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1393 goto out_forget_reply; 1605 goto out_forget_reply;
1394 } 1606 }
1395 1607
1396	if (pnfs_layoutgets_blocked(lo, 1)) {
 1608	if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
1397 dprintk("%s forget reply due to state\n", __func__); 1609 dprintk("%s forget reply due to state\n", __func__);
1398 goto out_forget_reply; 1610 goto out_forget_reply;
1399 } 1611 }
@@ -1440,24 +1652,79 @@ out_forget_reply:
1440 goto out; 1652 goto out;
1441} 1653}
1442 1654
1655static void
1656pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1657 struct list_head *tmp_list,
1658 struct pnfs_layout_range *return_range)
1659{
1660 struct pnfs_layout_segment *lseg, *next;
1661
1662 dprintk("%s:Begin lo %p\n", __func__, lo);
1663
1664 if (list_empty(&lo->plh_segs))
1665 return;
1666
1667 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1668 if (should_free_lseg(&lseg->pls_range, return_range)) {
1669 dprintk("%s: marking lseg %p iomode %d "
1670 "offset %llu length %llu\n", __func__,
1671 lseg, lseg->pls_range.iomode,
1672 lseg->pls_range.offset,
1673 lseg->pls_range.length);
1674 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1675 mark_lseg_invalid(lseg, tmp_list);
1676 }
1677}
1678
1679void pnfs_error_mark_layout_for_return(struct inode *inode,
1680 struct pnfs_layout_segment *lseg)
1681{
1682 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1683 int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
1684 struct pnfs_layout_range range = {
1685 .iomode = lseg->pls_range.iomode,
1686 .offset = 0,
1687 .length = NFS4_MAX_UINT64,
1688 };
1689 LIST_HEAD(free_me);
1690
1691 spin_lock(&inode->i_lock);
1692 /* set failure bit so that pnfs path will be retried later */
1693 pnfs_layout_set_fail_bit(lo, iomode);
1694 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
1695 if (lo->plh_return_iomode == 0)
1696 lo->plh_return_iomode = range.iomode;
1697 else if (lo->plh_return_iomode != range.iomode)
1698 lo->plh_return_iomode = IOMODE_ANY;
1699 /*
1700 * mark all matching lsegs so that we are sure to have no live
1701 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1702 * for how it works.
1703 */
1704 pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
1705 spin_unlock(&inode->i_lock);
1706 pnfs_free_lseg_list(&free_me);
1707}
1708EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1709
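pnfs_error_mark_layout_for_return() above widens plh_return_iomode as errors accumulate: the first failing segment records its iomode, and a later failure with a different iomode escalates to IOMODE_ANY. A sketch of that merging rule (the non-zero enum values mirror enum pnfs_iomode; IOMODE_NONE here stands in for the kernel's bare 0):

/* userspace sketch, not kernel code */
#include <stdio.h>

enum iomode { IOMODE_NONE = 0, IOMODE_READ = 1, IOMODE_RW = 2, IOMODE_ANY = 3 };

static enum iomode merge_return_iomode(enum iomode cur, enum iomode failed)
{
	if (cur == IOMODE_NONE)
		return failed;		/* first error: remember its mode */
	if (cur != failed)
		return IOMODE_ANY;	/* mixed read/write errors: return all */
	return cur;
}

int main(void)
{
	enum iomode m = IOMODE_NONE;

	m = merge_return_iomode(m, IOMODE_READ);	/* -> IOMODE_READ */
	m = merge_return_iomode(m, IOMODE_RW);		/* -> IOMODE_ANY */
	printf("plh_return_iomode = %d\n", m);
	return 0;
}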
1443void 1710void
1444pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1711pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1445{ 1712{
1446 u64 rd_size = req->wb_bytes; 1713 u64 rd_size = req->wb_bytes;
1447 1714
1448	WARN_ON_ONCE(pgio->pg_lseg != NULL);
1449
1450	if (pgio->pg_dreq == NULL)
1451		rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1452	else
1453		rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1454
1455	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1456					   req->wb_context,
1457					   req_offset(req),
1458					   rd_size,
1459					   IOMODE_READ,
1460					   GFP_KERNEL);
 1715	if (pgio->pg_lseg == NULL) {
 1716		if (pgio->pg_dreq == NULL)
 1717			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
 1718		else
 1719			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
 1720
 1721		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 1722						   req->wb_context,
 1723						   req_offset(req),
 1724						   rd_size,
 1725						   IOMODE_READ,
 1726						   GFP_KERNEL);
 1727	}
1461 /* If no lseg, fall back to read through mds */ 1728 /* If no lseg, fall back to read through mds */
1462 if (pgio->pg_lseg == NULL) 1729 if (pgio->pg_lseg == NULL)
1463 nfs_pageio_reset_read_mds(pgio); 1730 nfs_pageio_reset_read_mds(pgio);
@@ -1469,27 +1736,36 @@ void
1469pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1736pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1470 struct nfs_page *req, u64 wb_size) 1737 struct nfs_page *req, u64 wb_size)
1471{ 1738{
1472 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1739 if (pgio->pg_lseg == NULL)
1473 1740 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1474 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1741 req->wb_context,
1475 req->wb_context, 1742 req_offset(req),
1476 req_offset(req), 1743 wb_size,
1477 wb_size, 1744 IOMODE_RW,
1478 IOMODE_RW, 1745 GFP_NOFS);
1479 GFP_NOFS);
1480 /* If no lseg, fall back to write through mds */ 1746 /* If no lseg, fall back to write through mds */
1481 if (pgio->pg_lseg == NULL) 1747 if (pgio->pg_lseg == NULL)
1482 nfs_pageio_reset_write_mds(pgio); 1748 nfs_pageio_reset_write_mds(pgio);
1483} 1749}
1484EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1750EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1485 1751
1752void
1753pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1754{
1755 if (desc->pg_lseg) {
1756 pnfs_put_lseg(desc->pg_lseg);
1757 desc->pg_lseg = NULL;
1758 }
1759}
1760EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1761
1486/* 1762/*
1487 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 1763 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1488 * of bytes (maximum @req->wb_bytes) that can be coalesced. 1764 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1489 */ 1765 */
1490size_t 1766size_t
1491pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1492		     struct nfs_page *req)
 1767pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
 1768		     struct nfs_page *prev, struct nfs_page *req)
1493{ 1769{
1494 unsigned int size; 1770 unsigned int size;
1495 u64 seg_end, req_start, seg_left; 1771 u64 seg_end, req_start, seg_left;
@@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1513 seg_end = end_offset(pgio->pg_lseg->pls_range.offset, 1789 seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1514 pgio->pg_lseg->pls_range.length); 1790 pgio->pg_lseg->pls_range.length);
1515 req_start = req_offset(req); 1791 req_start = req_offset(req);
1516	WARN_ON_ONCE(req_start > seg_end);
 1792	WARN_ON_ONCE(req_start >= seg_end);
1517 /* start of request is past the last byte of this segment */ 1793 /* start of request is past the last byte of this segment */
1518	if (req_start >= seg_end)
 1794	if (req_start >= seg_end) {
1795 /* reference the new lseg */
1796 if (pgio->pg_ops->pg_cleanup)
1797 pgio->pg_ops->pg_cleanup(pgio);
1798 if (pgio->pg_ops->pg_init)
1799 pgio->pg_ops->pg_init(pgio, req);
1519 return 0; 1800 return 0;
1801 }
1520 1802
1521 /* adjust 'size' iff there are fewer bytes left in the 1803 /* adjust 'size' iff there are fewer bytes left in the
1522 * segment than what nfs_generic_pg_test returned */ 1804 * segment than what nfs_generic_pg_test returned */
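The boundary check above now returns 0 and re-runs pg_init when the request starts at or past the segment end, so the descriptor picks up a fresh lseg; otherwise the coalescable size is clamped to the bytes left in the segment. A worked example with made-up numbers:

/* userspace sketch, not kernel code */
#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	u64 seg_offset = 0, seg_length = 1048576;	/* 1 MiB segment */
	u64 seg_end = seg_offset + seg_length;		/* end_offset() */
	u64 req_start = 1040384;			/* 8 KiB before the end */
	unsigned int size = 16384;			/* generic test allowed 16 KiB */

	if (req_start >= seg_end) {
		puts("past the segment: pg_cleanup + pg_init for a new lseg");
		return 0;
	}
	if (seg_end - req_start < size)
		size = (unsigned int)(seg_end - req_start);
	printf("coalesce up to %u bytes\n", size);	/* prints 8192 */
	return 0;
}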
@@ -1571,10 +1853,12 @@ static void
1571pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1853pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1572 struct nfs_pgio_header *hdr) 1854 struct nfs_pgio_header *hdr)
1573{ 1855{
1856 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1857
1574 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1858 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1575		list_splice_tail_init(&hdr->pages, &desc->pg_list);
 1859		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1576 nfs_pageio_reset_write_mds(desc); 1860 nfs_pageio_reset_write_mds(desc);
1577		desc->pg_recoalesce = 1;
 1861		mirror->pg_recoalesce = 1;
1578 } 1862 }
1579 nfs_pgio_data_destroy(hdr); 1863 nfs_pgio_data_destroy(hdr);
1580} 1864}
@@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
1608 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1892 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1609 enum pnfs_try_status trypnfs; 1893 enum pnfs_try_status trypnfs;
1610 1894
1611 desc->pg_lseg = NULL;
1612 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 1895 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1613 if (trypnfs == PNFS_NOT_ATTEMPTED) 1896 if (trypnfs == PNFS_NOT_ATTEMPTED)
1614 pnfs_write_through_mds(desc, hdr); 1897 pnfs_write_through_mds(desc, hdr);
1615 pnfs_put_lseg(lseg);
1616} 1898}
1617 1899
1618static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1900static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1625int 1907int
1626pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1908pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1627{ 1909{
1910 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1911
1628 struct nfs_pgio_header *hdr; 1912 struct nfs_pgio_header *hdr;
1629 int ret; 1913 int ret;
1630 1914
1631 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 1915 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1632 if (!hdr) { 1916 if (!hdr) {
1633		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1634		pnfs_put_lseg(desc->pg_lseg);
1635		desc->pg_lseg = NULL;
 1917		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1636 return -ENOMEM; 1918 return -ENOMEM;
1637 } 1919 }
1638 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1920 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1921
1639 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1922 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1640 ret = nfs_generic_pgio(desc, hdr); 1923 ret = nfs_generic_pgio(desc, hdr);
1641	if (ret != 0) {
1642		pnfs_put_lseg(desc->pg_lseg);
1643		desc->pg_lseg = NULL;
1644	} else
 1924	if (!ret)
1645 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1925 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1926
1646 return ret; 1927 return ret;
1647} 1928}
1648EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1929EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
@@ -1687,10 +1968,12 @@ static void
1687pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1968pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1688 struct nfs_pgio_header *hdr) 1969 struct nfs_pgio_header *hdr)
1689{ 1970{
1971 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1972
1690 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1973 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1691		list_splice_tail_init(&hdr->pages, &desc->pg_list);
 1974		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1692 nfs_pageio_reset_read_mds(desc); 1975 nfs_pageio_reset_read_mds(desc);
1693		desc->pg_recoalesce = 1;
 1976		mirror->pg_recoalesce = 1;
1694 } 1977 }
1695 nfs_pgio_data_destroy(hdr); 1978 nfs_pgio_data_destroy(hdr);
1696} 1979}
@@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1719 return trypnfs; 2002 return trypnfs;
1720} 2003}
1721 2004
2005/* Resend all requests through pnfs. */
2006int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2007{
2008 struct nfs_pageio_descriptor pgio;
2009
2010 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2011 return nfs_pageio_resend(&pgio, hdr);
2012}
2013EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2014
1722static void 2015static void
1723pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 2016pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1724{ 2017{
1725 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2018 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1726 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2019 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1727 enum pnfs_try_status trypnfs; 2020 enum pnfs_try_status trypnfs;
2021 int err = 0;
1728 2022
1729 desc->pg_lseg = NULL;
1730 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2023 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1731	if (trypnfs == PNFS_NOT_ATTEMPTED)
 2024	if (trypnfs == PNFS_TRY_AGAIN)
2025 err = pnfs_read_resend_pnfs(hdr);
2026 if (trypnfs == PNFS_NOT_ATTEMPTED || err)
1732 pnfs_read_through_mds(desc, hdr); 2027 pnfs_read_through_mds(desc, hdr);
1733 pnfs_put_lseg(lseg);
1734} 2028}
1735 2029
1736static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 2030static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1743int 2037int
1744pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 2038pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1745{ 2039{
2040 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2041
1746 struct nfs_pgio_header *hdr; 2042 struct nfs_pgio_header *hdr;
1747 int ret; 2043 int ret;
1748 2044
1749 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2045 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1750 if (!hdr) { 2046 if (!hdr) {
1751		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1752		ret = -ENOMEM;
1753		pnfs_put_lseg(desc->pg_lseg);
1754		desc->pg_lseg = NULL;
1755		return ret;
 2047		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
 2048		return -ENOMEM;
1756 } 2049 }
1757 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 2050 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1758 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2051 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1759 ret = nfs_generic_pgio(desc, hdr); 2052 ret = nfs_generic_pgio(desc, hdr);
1760	if (ret != 0) {
1761		pnfs_put_lseg(desc->pg_lseg);
1762		desc->pg_lseg = NULL;
1763	} else
 2053	if (!ret)
1764 pnfs_do_read(desc, hdr); 2054 pnfs_do_read(desc, hdr);
1765 return ret; 2055 return ret;
1766} 2056}
@@ -1966,6 +2256,7 @@ clear_layoutcommitting:
1966 pnfs_clear_layoutcommitting(inode); 2256 pnfs_clear_layoutcommitting(inode);
1967 goto out; 2257 goto out;
1968} 2258}
2259EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
1969 2260
1970struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 2261struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1971{ 2262{
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9ae5b765b073..797cd6253adf 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -38,6 +38,25 @@ enum {
38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
39 NFS_LSEG_ROC, /* roc bit received from server */ 39 NFS_LSEG_ROC, /* roc bit received from server */
40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ 40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
41 NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
42};
43
44/* Individual ip address */
45struct nfs4_pnfs_ds_addr {
46 struct sockaddr_storage da_addr;
47 size_t da_addrlen;
48 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
49 char *da_remotestr; /* human readable addr+port */
50};
51
52struct nfs4_pnfs_ds {
53 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
54 char *ds_remotestr; /* comma sep list of addrs */
55 struct list_head ds_addrs;
56 struct nfs_client *ds_clp;
57 atomic_t ds_count;
58 unsigned long ds_state;
59#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
41}; 60};
42 61
43struct pnfs_layout_segment { 62struct pnfs_layout_segment {
@@ -53,19 +72,34 @@ struct pnfs_layout_segment {
53enum pnfs_try_status { 72enum pnfs_try_status {
54 PNFS_ATTEMPTED = 0, 73 PNFS_ATTEMPTED = 0,
55 PNFS_NOT_ATTEMPTED = 1, 74 PNFS_NOT_ATTEMPTED = 1,
75 PNFS_TRY_AGAIN = 2,
56}; 76};
57 77
58#ifdef CONFIG_NFS_V4_1 78#ifdef CONFIG_NFS_V4_1
59 79
60#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 80#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
61 81
82/*
 83 * Default data server connection timeout and retrans values.
84 * Set by module parameters dataserver_timeo and dataserver_retrans.
85 */
86#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
87#define NFS4_DEF_DS_RETRANS 5
88
89/* error codes for internal use */
90#define NFS4ERR_RESET_TO_MDS 12001
91#define NFS4ERR_RESET_TO_PNFS 12002
92
62enum { 93enum {
63 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 94 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
64 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 95 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 96 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 97 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 98 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
99 NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ 100 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
101 NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
102 NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
69}; 103};
70 104
71enum layoutdriver_policy_flags { 105enum layoutdriver_policy_flags {
@@ -106,7 +140,8 @@ struct pnfs_layoutdriver_type {
106 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); 140 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
107 void (*mark_request_commit) (struct nfs_page *req, 141 void (*mark_request_commit) (struct nfs_page *req,
108 struct pnfs_layout_segment *lseg, 142 struct pnfs_layout_segment *lseg,
109				     struct nfs_commit_info *cinfo);
 143				     struct nfs_commit_info *cinfo,
144 u32 ds_commit_idx);
110 void (*clear_request_commit) (struct nfs_page *req, 145 void (*clear_request_commit) (struct nfs_page *req,
111 struct nfs_commit_info *cinfo); 146 struct nfs_commit_info *cinfo);
112 int (*scan_commit_lists) (struct nfs_commit_info *cinfo, 147 int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
@@ -154,6 +189,7 @@ struct pnfs_layout_hdr {
154 u32 plh_barrier; /* ignore lower seqids */ 189 u32 plh_barrier; /* ignore lower seqids */
155 unsigned long plh_retry_timestamp; 190 unsigned long plh_retry_timestamp;
156 unsigned long plh_flags; 191 unsigned long plh_flags;
192 enum pnfs_iomode plh_return_iomode;
157 loff_t plh_lwb; /* last write byte for layoutcommit */ 193 loff_t plh_lwb; /* last write byte for layoutcommit */
158 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 194 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
159 struct inode *plh_inode; 195 struct inode *plh_inode;
@@ -185,7 +221,7 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
185 struct pnfs_device *dev, 221 struct pnfs_device *dev,
186 struct rpc_cred *cred); 222 struct rpc_cred *cred);
187extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 223extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
188extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 224extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
189 225
190/* pnfs.c */ 226/* pnfs.c */
191void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 227void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -198,6 +234,7 @@ void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *
198int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); 234int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
199void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 235void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
200 struct nfs_page *req, u64 wb_size); 236 struct nfs_page *req, u64 wb_size);
237void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
201int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); 238int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
202size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, 239size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
203 struct nfs_page *prev, struct nfs_page *req); 240 struct nfs_page *prev, struct nfs_page *req);
@@ -217,6 +254,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
217 bool update_barrier); 254 bool update_barrier);
218int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, 255int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
219 struct pnfs_layout_hdr *lo, 256 struct pnfs_layout_hdr *lo,
257 struct pnfs_layout_range *range,
220 struct nfs4_state *open_state); 258 struct nfs4_state *open_state);
221int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 259int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
222 struct list_head *tmp_list, 260 struct list_head *tmp_list,
@@ -233,17 +271,21 @@ int _pnfs_return_layout(struct inode *);
233int pnfs_commit_and_return_layout(struct inode *); 271int pnfs_commit_and_return_layout(struct inode *);
234void pnfs_ld_write_done(struct nfs_pgio_header *); 272void pnfs_ld_write_done(struct nfs_pgio_header *);
235void pnfs_ld_read_done(struct nfs_pgio_header *); 273void pnfs_ld_read_done(struct nfs_pgio_header *);
274int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
236struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 275struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
237 struct nfs_open_context *ctx, 276 struct nfs_open_context *ctx,
238 loff_t pos, 277 loff_t pos,
239 u64 count, 278 u64 count,
240 enum pnfs_iomode iomode, 279 enum pnfs_iomode iomode,
241 gfp_t gfp_flags); 280 gfp_t gfp_flags);
281void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
242 282
243void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 283void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
244int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); 284int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
245int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); 285int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
246struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); 286struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
287void pnfs_error_mark_layout_for_return(struct inode *inode,
288 struct pnfs_layout_segment *lseg);
247 289
248/* nfs4_deviceid_flags */ 290/* nfs4_deviceid_flags */
249enum { 291enum {
@@ -275,6 +317,39 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
275bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 317bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
276void nfs4_deviceid_purge_client(const struct nfs_client *); 318void nfs4_deviceid_purge_client(const struct nfs_client *);
277 319
320/* pnfs_nfs.c */
321void pnfs_generic_clear_request_commit(struct nfs_page *req,
322 struct nfs_commit_info *cinfo);
323void pnfs_generic_commit_release(void *calldata);
324void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
325void pnfs_generic_rw_release(void *data);
326void pnfs_generic_recover_commit_reqs(struct list_head *dst,
327 struct nfs_commit_info *cinfo);
328int pnfs_generic_commit_pagelist(struct inode *inode,
329 struct list_head *mds_pages,
330 int how,
331 struct nfs_commit_info *cinfo,
332 int (*initiate_commit)(struct nfs_commit_data *data,
333 int how));
334int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
335void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
336void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
337struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
338 gfp_t gfp_flags);
339void nfs4_pnfs_v3_ds_connect_unload(void);
340void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
341 struct nfs4_deviceid_node *devid, unsigned int timeo,
342 unsigned int retrans, u32 version, u32 minor_version,
343 rpc_authflavor_t au_flavor);
344struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
345 struct xdr_stream *xdr,
346 gfp_t gfp_flags);
347
348static inline bool nfs_have_layout(struct inode *inode)
349{
350 return NFS_I(inode)->layout != NULL;
351}
352
278static inline struct nfs4_deviceid_node * 353static inline struct nfs4_deviceid_node *
279nfs4_get_deviceid(struct nfs4_deviceid_node *d) 354nfs4_get_deviceid(struct nfs4_deviceid_node *d)
280{ 355{
@@ -282,6 +357,26 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
282 return d; 357 return d;
283} 358}
284 359
360static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
361{
362 if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
363 atomic_inc(&lo->plh_refcount);
364}
365
366static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
367{
368 if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
369 atomic_dec(&lo->plh_refcount);
370 /* wake up waiters for LAYOUTRETURN as that is not needed */
371 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
372 }
373}
374
375static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
376{
377 return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
378}
379
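The pnfs_set_retry_layoutget()/pnfs_clear_retry_layoutget() helpers above pair a flag bit with a header reference: only the transition into the flagged state takes the reference, and only the transition out drops it, so repeated calls stay idempotent. A userspace sketch with C11 atomics (the bit constant and names are illustrative):

/* userspace sketch, not kernel code */
#include <stdatomic.h>
#include <stdio.h>

#define RETRY_BIT 1u

static atomic_uint flags = 0;
static atomic_int refcount = 1;		/* header's base reference */

static void set_retry(void)
{
	/* test_and_set_bit(): fetch_or reports the previous value */
	if (!(atomic_fetch_or(&flags, RETRY_BIT) & RETRY_BIT))
		atomic_fetch_add(&refcount, 1);		/* pin the header */
}

static void clear_retry(void)
{
	/* test_and_clear_bit() */
	if (atomic_fetch_and(&flags, ~RETRY_BIT) & RETRY_BIT)
		atomic_fetch_sub(&refcount, 1);		/* drop the pin */
}

int main(void)
{
	set_retry();
	set_retry();	/* idempotent: only the first call takes a ref */
	printf("refcount = %d\n", atomic_load(&refcount));	/* 2 */
	clear_retry();
	printf("refcount = %d\n", atomic_load(&refcount));	/* 1 */
	return 0;
}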
285static inline struct pnfs_layout_segment * 380static inline struct pnfs_layout_segment *
286pnfs_get_lseg(struct pnfs_layout_segment *lseg) 381pnfs_get_lseg(struct pnfs_layout_segment *lseg)
287{ 382{
@@ -317,16 +412,22 @@ pnfs_get_ds_info(struct inode *inode)
317 return ld->get_ds_info(inode); 412 return ld->get_ds_info(inode);
318} 413}
319 414
415static inline void
416pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
417{
418 set_bit(NFS_DEVICEID_INVALID, &node->flags);
419}
420
320static inline bool 421static inline bool
321pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 422pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
322			 struct nfs_commit_info *cinfo)
 423			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
323{ 424{
324 struct inode *inode = req->wb_context->dentry->d_inode; 425 struct inode *inode = req->wb_context->dentry->d_inode;
325 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 426 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
326 427
327 if (lseg == NULL || ld->mark_request_commit == NULL) 428 if (lseg == NULL || ld->mark_request_commit == NULL)
328 return false; 429 return false;
329	ld->mark_request_commit(req, lseg, cinfo);
 430	ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
330 return true; 431 return true;
331} 432}
332 433
@@ -352,15 +453,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
352 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); 453 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
353} 454}
354 455
355static inline void
356pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
357 struct nfs_commit_info *cinfo)
358{
359 if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
360 return;
361 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
362}
363
364static inline struct nfs_page * 456static inline struct nfs_page *
365pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 457pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
366 struct page *page) 458 struct page *page)
@@ -427,6 +519,11 @@ static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
427#endif /* NFS_DEBUG */ 519#endif /* NFS_DEBUG */
428#else /* CONFIG_NFS_V4_1 */ 520#else /* CONFIG_NFS_V4_1 */
429 521
522static inline bool nfs_have_layout(struct inode *inode)
523{
524 return false;
525}
526
430static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 527static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
431{ 528{
432} 529}
@@ -513,7 +610,7 @@ pnfs_get_ds_info(struct inode *inode)
513 610
514static inline bool 611static inline bool
515pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 612pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
516			 struct nfs_commit_info *cinfo)
 613			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
517{ 614{
518 return false; 615 return false;
519} 616}
@@ -531,12 +628,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
531 return 0; 628 return 0;
532} 629}
533 630
534static inline void
535pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
536 struct nfs_commit_info *cinfo)
537{
538}
539
540static inline struct nfs_page * 631static inline struct nfs_page *
541pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 632pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
542 struct page *page) 633 struct page *page)
@@ -568,6 +659,10 @@ static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
568 return NULL; 659 return NULL;
569} 660}
570 661
662static inline void nfs4_pnfs_v3_ds_connect_unload(void)
663{
664}
665
571#endif /* CONFIG_NFS_V4_1 */ 666#endif /* CONFIG_NFS_V4_1 */
572 667
573#endif /* FS_NFS_PNFS_H */ 668#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000000..fdc4f6562bb7
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,840 @@
1/*
2 * Common NFS I/O operations for the pnfs file based
3 * layout drivers.
4 *
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
6 *
7 * Tom Haynes <loghyr@primarydata.com>
8 */
9
10#include <linux/nfs_fs.h>
11#include <linux/nfs_page.h>
12#include <linux/sunrpc/addr.h>
13#include <linux/module.h>
14
15#include "nfs4session.h"
16#include "internal.h"
17#include "pnfs.h"
18
19#define NFSDBG_FACILITY NFSDBG_PNFS
20
21void pnfs_generic_rw_release(void *data)
22{
23 struct nfs_pgio_header *hdr = data;
24
25 nfs_put_client(hdr->ds_clp);
26 hdr->mds_ops->rpc_release(data);
27}
28EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
29
30/* Fake up some data that will cause nfs_commit_release to retry the writes. */
31void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
32{
33 struct nfs_page *first = nfs_list_entry(data->pages.next);
34
35 data->task.tk_status = 0;
36 memcpy(&data->verf.verifier, &first->wb_verf,
37 sizeof(data->verf.verifier));
38 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
39}
40EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
41
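pnfs_generic_prepare_to_resend_writes() above copies the first request's write verifier and then increments one byte of it, guaranteeing the commit-side comparison fails and the writes are re-sent. The trick in isolation (struct layout and contents are illustrative):

/* userspace sketch, not kernel code */
#include <stdio.h>
#include <string.h>

struct verf { unsigned char data[8]; };

int main(void)
{
	struct verf written = { .data = "12345678" };	/* plays wb_verf */
	struct verf commit;

	memcpy(&commit, &written, sizeof(commit));
	commit.data[0]++;		/* ensure verifier mismatch */

	if (memcmp(&commit, &written, sizeof(commit)) != 0)
		puts("verifier mismatch: commit path re-sends the writes");
	return 0;
}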
42void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
43{
44 struct nfs_commit_data *wdata = data;
45
46 /* Note this may cause RPC to be resent */
47 wdata->mds_ops->rpc_call_done(task, data);
48}
49EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
50
51void pnfs_generic_commit_release(void *calldata)
52{
53 struct nfs_commit_data *data = calldata;
54
55 data->completion_ops->completion(data);
56 pnfs_put_lseg(data->lseg);
57 nfs_put_client(data->ds_clp);
58 nfs_commitdata_release(data);
59}
60EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
61
62/* The generic layer is about to remove the req from the commit list.
63 * If this will make the bucket empty, it will need to put the lseg reference.
64 * Note this must be called holding the inode (/cinfo) lock
65 */
66void
67pnfs_generic_clear_request_commit(struct nfs_page *req,
68 struct nfs_commit_info *cinfo)
69{
70 struct pnfs_layout_segment *freeme = NULL;
71
72 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
73 goto out;
74 cinfo->ds->nwritten--;
75 if (list_is_singular(&req->wb_list)) {
76 struct pnfs_commit_bucket *bucket;
77
78 bucket = list_first_entry(&req->wb_list,
79 struct pnfs_commit_bucket,
80 written);
81 freeme = bucket->wlseg;
82 bucket->wlseg = NULL;
83 }
84out:
85 nfs_request_remove_commit_list(req, cinfo);
86 pnfs_put_lseg_locked(freeme);
87}
88EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
89
90static int
91pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
92 struct nfs_commit_info *cinfo, int max)
93{
94 struct nfs_page *req, *tmp;
95 int ret = 0;
96
97 list_for_each_entry_safe(req, tmp, src, wb_list) {
98 if (!nfs_lock_request(req))
99 continue;
100 kref_get(&req->wb_kref);
101 if (cond_resched_lock(cinfo->lock))
102 list_safe_reset_next(req, tmp, wb_list);
103 nfs_request_remove_commit_list(req, cinfo);
104 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
105 nfs_list_add_request(req, dst);
106 ret++;
107 if ((ret == max) && !cinfo->dreq)
108 break;
109 }
110 return ret;
111}
112
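pnfs_generic_transfer_commit_list() above walks the source list, skips requests it cannot lock, and moves up to max lockable requests onto the destination, returning the count. A userspace model with a singly linked list (the try-lock is reduced to a flag; not the kernel's list API):

/* userspace sketch, not kernel code */
#include <stdio.h>

struct req { int id; int locked; struct req *next; };

static int transfer(struct req **src, struct req **dst, int max)
{
	int moved = 0;
	struct req **pp = src;

	while (*pp && moved < max) {
		struct req *r = *pp;

		if (r->locked) {	/* nfs_lock_request() failed: skip */
			pp = &r->next;
			continue;
		}
		*pp = r->next;		/* unlink from src */
		r->next = *dst;		/* push onto dst */
		*dst = r;
		moved++;
	}
	return moved;
}

int main(void)
{
	struct req c = { 3, 0, NULL }, b = { 2, 1, &c }, a = { 1, 0, &b };
	struct req *written = &a, *committing = NULL;

	printf("moved %d requests\n", transfer(&written, &committing, 10));
	return 0;	/* moves 1 and 3, skips the locked request 2 */
}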
113static int
114pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
115 struct nfs_commit_info *cinfo,
116 int max)
117{
118 struct list_head *src = &bucket->written;
119 struct list_head *dst = &bucket->committing;
120 int ret;
121
122 lockdep_assert_held(cinfo->lock);
123 ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
124 if (ret) {
125 cinfo->ds->nwritten -= ret;
126 cinfo->ds->ncommitting += ret;
127 bucket->clseg = bucket->wlseg;
128 if (list_empty(src))
129 bucket->wlseg = NULL;
130 else
131 pnfs_get_lseg(bucket->clseg);
132 }
133 return ret;
134}
135
136/* Move reqs from written to committing lists, returning the
137 * number of requests moved.
138 */
139int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
140 int max)
141{
142 int i, rv = 0, cnt;
143
144 lockdep_assert_held(cinfo->lock);
145 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
146 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
147 cinfo, max);
148 max -= cnt;
149 rv += cnt;
150 }
151 return rv;
152}
153EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
154
155/* Pull everything off the committing lists and dump into @dst. */
156void pnfs_generic_recover_commit_reqs(struct list_head *dst,
157 struct nfs_commit_info *cinfo)
158{
159 struct pnfs_commit_bucket *b;
160 struct pnfs_layout_segment *freeme;
161 int i;
162
163 lockdep_assert_held(cinfo->lock);
164restart:
165 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
166 if (pnfs_generic_transfer_commit_list(&b->written, dst,
167 cinfo, 0)) {
168 freeme = b->wlseg;
169 b->wlseg = NULL;
170 spin_unlock(cinfo->lock);
171 pnfs_put_lseg(freeme);
172 spin_lock(cinfo->lock);
173 goto restart;
174 }
175 }
176 cinfo->ds->nwritten = 0;
177}
178EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
179
180static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
181{
182 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
183 struct pnfs_commit_bucket *bucket;
184 struct pnfs_layout_segment *freeme;
185 int i;
186
187 for (i = idx; i < fl_cinfo->nbuckets; i++) {
188 bucket = &fl_cinfo->buckets[i];
189 if (list_empty(&bucket->committing))
190 continue;
191 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
192 spin_lock(cinfo->lock);
193 freeme = bucket->clseg;
194 bucket->clseg = NULL;
195 spin_unlock(cinfo->lock);
196 pnfs_put_lseg(freeme);
197 }
198}
199
200static unsigned int
201pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
202 struct list_head *list)
203{
204 struct pnfs_ds_commit_info *fl_cinfo;
205 struct pnfs_commit_bucket *bucket;
206 struct nfs_commit_data *data;
207 int i;
208 unsigned int nreq = 0;
209
210 fl_cinfo = cinfo->ds;
211 bucket = fl_cinfo->buckets;
212 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
213 if (list_empty(&bucket->committing))
214 continue;
215 data = nfs_commitdata_alloc();
216 if (!data)
217 break;
218 data->ds_commit_index = i;
219 spin_lock(cinfo->lock);
220 data->lseg = bucket->clseg;
221 bucket->clseg = NULL;
222 spin_unlock(cinfo->lock);
223 list_add(&data->pages, list);
224 nreq++;
225 }
226
227 /* Clean up on error */
228 pnfs_generic_retry_commit(cinfo, i);
229 return nreq;
230}
231
232/* This follows nfs_commit_list pretty closely */
233int
234pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
235 int how, struct nfs_commit_info *cinfo,
236 int (*initiate_commit)(struct nfs_commit_data *data,
237 int how))
238{
239 struct nfs_commit_data *data, *tmp;
240 LIST_HEAD(list);
241 unsigned int nreq = 0;
242
243 if (!list_empty(mds_pages)) {
244 data = nfs_commitdata_alloc();
245 if (data != NULL) {
246 data->lseg = NULL;
247 list_add(&data->pages, &list);
248 nreq++;
249 } else {
250 nfs_retry_commit(mds_pages, NULL, cinfo, 0);
251 pnfs_generic_retry_commit(cinfo, 0);
252 cinfo->completion_ops->error_cleanup(NFS_I(inode));
253 return -ENOMEM;
254 }
255 }
256
257 nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
258
259 if (nreq == 0) {
260 cinfo->completion_ops->error_cleanup(NFS_I(inode));
261 goto out;
262 }
263
264 atomic_add(nreq, &cinfo->mds->rpcs_out);
265
266 list_for_each_entry_safe(data, tmp, &list, pages) {
267 list_del_init(&data->pages);
268 if (!data->lseg) {
269 nfs_init_commit(data, mds_pages, NULL, cinfo);
270 nfs_initiate_commit(NFS_CLIENT(inode), data,
271 NFS_PROTO(data->inode),
272 data->mds_ops, how, 0);
273 } else {
274 struct pnfs_commit_bucket *buckets;
275
276 buckets = cinfo->ds->buckets;
277 nfs_init_commit(data,
278 &buckets[data->ds_commit_index].committing,
279 data->lseg,
280 cinfo);
281 initiate_commit(data, how);
282 }
283 }
284out:
285 cinfo->ds->ncommitting = 0;
286 return PNFS_ATTEMPTED;
287}
288EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
289
290/*
291 * Data server cache
292 *
293 * Data servers can be mapped to different device ids.
294 * nfs4_pnfs_ds reference counting
295 * - set to 1 on allocation
296 * - incremented when a device id maps a data server already in the cache.
297 * - decremented when deviceid is removed from the cache.
298 */
299static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
300static LIST_HEAD(nfs4_data_server_cache);
301
302/* Debug routines */
303static void
304print_ds(struct nfs4_pnfs_ds *ds)
305{
306 if (ds == NULL) {
307 printk(KERN_WARNING "%s NULL device\n", __func__);
308 return;
309 }
310 printk(KERN_WARNING " ds %s\n"
311 " ref count %d\n"
312 " client %p\n"
313 " cl_exchange_flags %x\n",
314 ds->ds_remotestr,
315 atomic_read(&ds->ds_count), ds->ds_clp,
316 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
317}
318
319static bool
320same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
321{
322 struct sockaddr_in *a, *b;
323 struct sockaddr_in6 *a6, *b6;
324
325 if (addr1->sa_family != addr2->sa_family)
326 return false;
327
328 switch (addr1->sa_family) {
329 case AF_INET:
330 a = (struct sockaddr_in *)addr1;
331 b = (struct sockaddr_in *)addr2;
332
333 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
334 a->sin_port == b->sin_port)
335 return true;
336 break;
337
338 case AF_INET6:
339 a6 = (struct sockaddr_in6 *)addr1;
340 b6 = (struct sockaddr_in6 *)addr2;
341
342 /* LINKLOCAL addresses must have matching scope_id */
343 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
344 IPV6_ADDR_SCOPE_LINKLOCAL &&
345 a6->sin6_scope_id != b6->sin6_scope_id)
346 return false;
347
348 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
349 a6->sin6_port == b6->sin6_port)
350 return true;
351 break;
352
353 default:
354 dprintk("%s: unhandled address family: %u\n",
355 __func__, addr1->sa_family);
356 return false;
357 }
358
359 return false;
360}
361
362static bool
363_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
364 const struct list_head *dsaddrs2)
365{
366 struct nfs4_pnfs_ds_addr *da1, *da2;
367
368 /* step through both lists, comparing as we go */
369 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
370 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
371 da1 != NULL && da2 != NULL;
372 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
373 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
374 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
375 (struct sockaddr *)&da2->da_addr))
376 return false;
377 }
378 if (da1 == NULL && da2 == NULL)
379 return true;
380
381 return false;
382}
383
384/*
385 * Look up a DS by its addresses. nfs4_ds_cache_lock must be held.
386 */
387static struct nfs4_pnfs_ds *
388_data_server_lookup_locked(const struct list_head *dsaddrs)
389{
390 struct nfs4_pnfs_ds *ds;
391
392 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
393 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
394 return ds;
395 return NULL;
396}
397
398static void destroy_ds(struct nfs4_pnfs_ds *ds)
399{
400 struct nfs4_pnfs_ds_addr *da;
401
402 dprintk("--> %s\n", __func__);
403 ifdebug(FACILITY)
404 print_ds(ds);
405
406 nfs_put_client(ds->ds_clp);
407
408 while (!list_empty(&ds->ds_addrs)) {
409 da = list_first_entry(&ds->ds_addrs,
410 struct nfs4_pnfs_ds_addr,
411 da_node);
412 list_del_init(&da->da_node);
413 kfree(da->da_remotestr);
414 kfree(da);
415 }
416
417 kfree(ds->ds_remotestr);
418 kfree(ds);
419}
420
421void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
422{
423 if (atomic_dec_and_lock(&ds->ds_count,
424 &nfs4_ds_cache_lock)) {
425 list_del_init(&ds->ds_node);
426 spin_unlock(&nfs4_ds_cache_lock);
427 destroy_ds(ds);
428 }
429}
430EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
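/*
 * Editorial sketch (not part of the original commit): typical pairing of
 * the cache calls above from a layout driver.  example_use_ds() is
 * hypothetical and only illustrates the reference-counting rules
 * documented above.
 */
#if 0
static void example_use_ds(struct list_head *dsaddrs)
{
	struct nfs4_pnfs_ds *ds;

	/* a new entry starts with ds_count == 1; a cache hit bumps it */
	ds = nfs4_pnfs_ds_add(dsaddrs, GFP_KERNEL);
	if (!ds)
		return;

	/* ... connect and perform I/O through ds->ds_clp ... */

	/* the last put unhashes the entry and frees it via destroy_ds() */
	nfs4_pnfs_ds_put(ds);
}
#endif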
431
432/*
433 * Create a string with a human-readable address and port to avoid
434 * complicated setup around many dprintks.
435 */
436static char *
437nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
438{
439 struct nfs4_pnfs_ds_addr *da;
440 char *remotestr;
441 size_t len;
442 char *p;
443
444 len = 3; /* '{', '}' and eol */
445 list_for_each_entry(da, dsaddrs, da_node) {
446 len += strlen(da->da_remotestr) + 1; /* string plus comma */
447 }
448
449 remotestr = kzalloc(len, gfp_flags);
450 if (!remotestr)
451 return NULL;
452
453 p = remotestr;
454 *(p++) = '{';
455 len--;
456 list_for_each_entry(da, dsaddrs, da_node) {
457 size_t ll = strlen(da->da_remotestr);
458
459 if (ll > len)
460 goto out_err;
461
462 memcpy(p, da->da_remotestr, ll);
463 p += ll;
464 len -= ll;
465
466 if (len < 1)
467 goto out_err;
468 (*p++) = ',';
469 len--;
470 }
471 if (len < 2)
472 goto out_err;
473 *(p++) = '}';
474 *p = '\0';
475 return remotestr;
476out_err:
477 kfree(remotestr);
478 return NULL;
479}
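/*
 * Editorial note: for two addresses "10.0.0.1:2049" and "10.0.0.2:2049"
 * the resulting string is "{10.0.0.1:2049,10.0.0.2:2049,}" - each entry
 * keeps its trailing comma before the closing brace.
 */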
480
481/*
482 * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to the ds
483 * cache if uncached, and return the cached struct nfs4_pnfs_ds.
484 */
485struct nfs4_pnfs_ds *
486nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
487{
488 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
489 char *remotestr;
490
491 if (list_empty(dsaddrs)) {
492 dprintk("%s: no addresses defined\n", __func__);
493 goto out;
494 }
495
496 ds = kzalloc(sizeof(*ds), gfp_flags);
497 if (!ds)
498 goto out;
499
500 /* this is only used for debugging, so it's OK if it's NULL */
501 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
502
503 spin_lock(&nfs4_ds_cache_lock);
504 tmp_ds = _data_server_lookup_locked(dsaddrs);
505 if (tmp_ds == NULL) {
506 INIT_LIST_HEAD(&ds->ds_addrs);
507 list_splice_init(dsaddrs, &ds->ds_addrs);
508 ds->ds_remotestr = remotestr;
509 atomic_set(&ds->ds_count, 1);
510 INIT_LIST_HEAD(&ds->ds_node);
511 ds->ds_clp = NULL;
512 list_add(&ds->ds_node, &nfs4_data_server_cache);
513 dprintk("%s add new data server %s\n", __func__,
514 ds->ds_remotestr);
515 } else {
516 kfree(remotestr);
517 kfree(ds);
518 atomic_inc(&tmp_ds->ds_count);
519 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
520 __func__, tmp_ds->ds_remotestr,
521 atomic_read(&tmp_ds->ds_count));
522 ds = tmp_ds;
523 }
524 spin_unlock(&nfs4_ds_cache_lock);
525out:
526 return ds;
527}
528EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
529
530static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
531{
532 might_sleep();
533 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
534 TASK_KILLABLE);
535}
536
537static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
538{
539 smp_mb__before_atomic();
540 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
541 smp_mb__after_atomic();
542 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
543}
544
545static struct nfs_client *(*get_v3_ds_connect)(
546 struct nfs_client *mds_clp,
547 const struct sockaddr *ds_addr,
548 int ds_addrlen,
549 int ds_proto,
550 unsigned int ds_timeo,
551 unsigned int ds_retrans,
552 rpc_authflavor_t au_flavor);
553
554static bool load_v3_ds_connect(void)
555{
556 if (!get_v3_ds_connect) {
557 get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
558 WARN_ON_ONCE(!get_v3_ds_connect);
559 }
560
561 return get_v3_ds_connect != NULL;
562}
563
564void __exit nfs4_pnfs_v3_ds_connect_unload(void)
565{
566 if (get_v3_ds_connect) {
567 symbol_put(nfs3_set_ds_client);
568 get_v3_ds_connect = NULL;
569 }
570}
571EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
572
573static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
574 struct nfs4_pnfs_ds *ds,
575 unsigned int timeo,
576 unsigned int retrans,
577 rpc_authflavor_t au_flavor)
578{
579 struct nfs_client *clp = ERR_PTR(-EIO);
580 struct nfs4_pnfs_ds_addr *da;
581 int status = 0;
582
583 dprintk("--> %s DS %s au_flavor %d\n", __func__,
584 ds->ds_remotestr, au_flavor);
585
586 if (!load_v3_ds_connect())
587 goto out;
588
589 list_for_each_entry(da, &ds->ds_addrs, da_node) {
590 dprintk("%s: DS %s: trying address %s\n",
591 __func__, ds->ds_remotestr, da->da_remotestr);
592
593 clp = get_v3_ds_connect(mds_srv->nfs_client,
594 (struct sockaddr *)&da->da_addr,
595 da->da_addrlen, IPPROTO_TCP,
596 timeo, retrans, au_flavor);
597 if (!IS_ERR(clp))
598 break;
599 }
600
601 if (IS_ERR(clp)) {
602 status = PTR_ERR(clp);
603 goto out;
604 }
605
606 smp_wmb();
607 ds->ds_clp = clp;
608 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
609out:
610 return status;
611}
612
613static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
614 struct nfs4_pnfs_ds *ds,
615 unsigned int timeo,
616 unsigned int retrans,
617 u32 minor_version,
618 rpc_authflavor_t au_flavor)
619{
620 struct nfs_client *clp = ERR_PTR(-EIO);
621 struct nfs4_pnfs_ds_addr *da;
622 int status = 0;
623
624 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
625 au_flavor);
626
627 list_for_each_entry(da, &ds->ds_addrs, da_node) {
628 dprintk("%s: DS %s: trying address %s\n",
629 __func__, ds->ds_remotestr, da->da_remotestr);
630
631 clp = nfs4_set_ds_client(mds_srv->nfs_client,
632 (struct sockaddr *)&da->da_addr,
633 da->da_addrlen, IPPROTO_TCP,
634 timeo, retrans, minor_version,
635 au_flavor);
636 if (!IS_ERR(clp))
637 break;
638 }
639
640 if (IS_ERR(clp)) {
641 status = PTR_ERR(clp);
642 goto out;
643 }
644
645 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
646 if (status)
647 goto out_put;
648
649 smp_wmb();
650 ds->ds_clp = clp;
651 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
652out:
653 return status;
654out_put:
655 nfs_put_client(clp);
656 goto out;
657}
658
659/*
660 * Create an rpc connection to the nfs4_pnfs_ds data server.
661 * Currently only supports IPv4 and IPv6 addresses.
662 * If connection fails, make devid unavailable.
663 */
664void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
665 struct nfs4_deviceid_node *devid, unsigned int timeo,
666 unsigned int retrans, u32 version,
667 u32 minor_version, rpc_authflavor_t au_flavor)
668{
669 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
670 int err = 0;
671
672 if (version == 3) {
673 err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
674 retrans, au_flavor);
675 } else if (version == 4) {
676 err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
677 retrans, minor_version,
678 au_flavor);
679 } else {
680 dprintk("%s: unsupported DS version %d\n", __func__,
681 version);
682 err = -EPROTONOSUPPORT;
683 }
684
685 if (err)
686 nfs4_mark_deviceid_unavailable(devid);
687 nfs4_clear_ds_conn_bit(ds);
688 } else {
689 nfs4_wait_ds_connect(ds);
690 }
691}
692EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
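/*
 * Editorial sketch (not part of the original commit): how a layout driver
 * might drive the connect path above.  The function name and the
 * timeo/retrans values are hypothetical.
 */
#if 0
static struct nfs_client *example_connect_ds(struct nfs_server *mds_srv,
					     struct nfs4_pnfs_ds *ds,
					     struct nfs4_deviceid_node *devid)
{
	/* the first caller connects; racing callers sleep in
	 * nfs4_wait_ds_connect() until NFS4DS_CONNECTING is cleared */
	nfs4_pnfs_ds_connect(mds_srv, ds, devid, 600, 2,
			     4 /* NFS version */, 1 /* minor version */,
			     RPC_AUTH_UNIX);

	/* pairs with the smp_wmb() before ds->ds_clp is published */
	smp_rmb();
	return ds->ds_clp;	/* still NULL if the connect failed */
}
#endif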
693
694/*
695 * Currently only supports IPv4, IPv6 and one multipath address.
696 */
697struct nfs4_pnfs_ds_addr *
698nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
699{
700 struct nfs4_pnfs_ds_addr *da = NULL;
701 char *buf, *portstr;
702 __be16 port;
703 int nlen, rlen;
704 int tmp[2];
705 __be32 *p;
706 char *netid, *match_netid;
707 size_t len, match_netid_len;
708 char *startsep = "";
709 char *endsep = "";
710
711
712 /* r_netid */
713 p = xdr_inline_decode(xdr, 4);
714 if (unlikely(!p))
715 goto out_err;
716 nlen = be32_to_cpup(p++);
717
718 p = xdr_inline_decode(xdr, nlen);
719 if (unlikely(!p))
720 goto out_err;
721
722 netid = kmalloc(nlen+1, gfp_flags);
723 if (unlikely(!netid))
724 goto out_err;
725
726 netid[nlen] = '\0';
727 memcpy(netid, p, nlen);
728
729 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
730 p = xdr_inline_decode(xdr, 4);
731 if (unlikely(!p))
732 goto out_free_netid;
733 rlen = be32_to_cpup(p);
734
735 p = xdr_inline_decode(xdr, rlen);
736 if (unlikely(!p))
737 goto out_free_netid;
738
739 /* port is ".ABC.DEF", 8 chars max */
740 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
741 dprintk("%s: Invalid address, length %d\n", __func__,
742 rlen);
743 goto out_free_netid;
744 }
745 buf = kmalloc(rlen + 1, gfp_flags);
746 if (!buf) {
747 dprintk("%s: Not enough memory\n", __func__);
748 goto out_free_netid;
749 }
750 buf[rlen] = '\0';
751 memcpy(buf, p, rlen);
752
753 /* replace port '.' with '-' */
754 portstr = strrchr(buf, '.');
755 if (!portstr) {
756 dprintk("%s: Failed finding expected dot in port\n",
757 __func__);
758 goto out_free_buf;
759 }
760 *portstr = '-';
761
762 /* find '.' between address and port */
763 portstr = strrchr(buf, '.');
764 if (!portstr) {
765 dprintk("%s: Failed finding expected dot between address and "
766 "port\n", __func__);
767 goto out_free_buf;
768 }
769 *portstr = '\0';
770
771 da = kzalloc(sizeof(*da), gfp_flags);
772 if (unlikely(!da))
773 goto out_free_buf;
774
775 INIT_LIST_HEAD(&da->da_node);
776
777 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
778 sizeof(da->da_addr))) {
779 dprintk("%s: error parsing address %s\n", __func__, buf);
780 goto out_free_da;
781 }
782
783 portstr++;
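	/* the two trailing octets give the port: e.g. "8-1" is (8 << 8) | 1 == 2049 */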
784 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
785 port = htons((tmp[0] << 8) | (tmp[1]));
786
787 switch (da->da_addr.ss_family) {
788 case AF_INET:
789 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
790 da->da_addrlen = sizeof(struct sockaddr_in);
791 match_netid = "tcp";
792 match_netid_len = 3;
793 break;
794
795 case AF_INET6:
796 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
797 da->da_addrlen = sizeof(struct sockaddr_in6);
798 match_netid = "tcp6";
799 match_netid_len = 4;
800 startsep = "[";
801 endsep = "]";
802 break;
803
804 default:
805 dprintk("%s: unsupported address family: %u\n",
806 __func__, da->da_addr.ss_family);
807 goto out_free_da;
808 }
809
810 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
811 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
812 __func__, netid, match_netid);
813 goto out_free_da;
814 }
815
816 /* save a human-readable address */
817 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
818 da->da_remotestr = kzalloc(len, gfp_flags);
819
820 /* NULL is ok, only used for dprintk */
821 if (da->da_remotestr)
822 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
823 buf, endsep, ntohs(port));
824
825 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
826 kfree(buf);
827 kfree(netid);
828 return da;
829
830out_free_da:
831 kfree(da);
832out_free_buf:
833 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
834 kfree(buf);
835out_free_netid:
836 kfree(netid);
837out_err:
838 return NULL;
839}
840EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
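/*
 * Editorial worked example: netid "tcp" with r_addr "192.168.1.1.8.1"
 * (an RFC 5665 universal address) parses to 192.168.1.1 with port
 * (8 << 8) | 1 == 2049, and da_remotestr becomes "192.168.1.1:2049".
 */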
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c91a4799c562..568ecf0a880f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
 void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 {
+	struct nfs_pgio_mirror *mirror;
+
 	pgio->pg_ops = &nfs_pgio_rw_ops;
-	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
+
+	/* read path should never have more than one mirror */
+	WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+	mirror = &pgio->pg_mirrors[0];
+	mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
@@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	struct nfs_page	*new;
 	unsigned int len;
 	struct nfs_pageio_descriptor pgio;
+	struct nfs_pgio_mirror *pgm;
 
 	len = nfs_page_length(page);
 	if (len == 0)
@@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 			     &nfs_async_read_completion_ops);
 	nfs_pageio_add_request(&pgio, new);
 	nfs_pageio_complete(&pgio);
-	NFS_I(inode)->read_io += pgio.pg_bytes_written;
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+	pgm = &pgio.pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+
 	return 0;
 }
 
@@ -168,13 +182,14 @@ out:
 
 static void nfs_initiate_read(struct nfs_pgio_header *hdr,
 			      struct rpc_message *msg,
+			      const struct nfs_rpc_ops *rpc_ops,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
 	struct inode *inode = hdr->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 
 	task_setup_data->flags |= swap_flags;
-	NFS_PROTO(inode)->read_setup(hdr, msg);
+	rpc_ops->read_setup(hdr, msg);
 }
 
 static void
@@ -351,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
 	struct nfs_pageio_descriptor pgio;
+	struct nfs_pgio_mirror *pgm;
 	struct nfs_readdesc desc = {
 		.pgio = &pgio,
 	};
@@ -386,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 			     &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-
 	nfs_pageio_complete(&pgio);
-	NFS_I(inode)->read_io += pgio.pg_bytes_written;
-	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+	pgm = &pgio.pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+	npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
+		 PAGE_CACHE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 read_complete:
 	put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 31a11b0e885d..322b2de02988 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -311,7 +311,6 @@ const struct super_operations nfs_sops = {
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
 	.drop_inode	= nfs_drop_inode,
-	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.evict_inode	= nfs_evict_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -405,12 +404,15 @@ void __exit unregister_nfs_fs(void)
 	unregister_filesystem(&nfs_fs_type);
 }
 
-void nfs_sb_active(struct super_block *sb)
+bool nfs_sb_active(struct super_block *sb)
 {
 	struct nfs_server *server = NFS_SB(sb);
 
-	if (atomic_inc_return(&server->active) == 1)
-		atomic_inc(&sb->s_active);
+	if (!atomic_inc_not_zero(&sb->s_active))
+		return false;
+	if (atomic_inc_return(&server->active) != 1)
+		atomic_dec(&sb->s_active);
+	return true;
 }
 EXPORT_SYMBOL_GPL(nfs_sb_active);
 
@@ -2569,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 		error = nfs_bdi_register(server);
 		if (error) {
 			mntroot = ERR_PTR(error);
-			goto error_splat_bdi;
+			goto error_splat_super;
 		}
 		server->super = s;
 	}
@@ -2601,9 +2603,6 @@ error_splat_root:
 	dput(mntroot);
 	mntroot = ERR_PTR(error);
 error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2651,27 +2650,19 @@ out:
 EXPORT_SYMBOL_GPL(nfs_fs_mount);
 
 /*
- * Ensure that we unregister the bdi before kill_anon_super
- * releases the device name
- */
-void nfs_put_super(struct super_block *s)
-{
-	struct nfs_server *server = NFS_SB(s);
-
-	bdi_unregister(&server->backing_dev_info);
-}
-EXPORT_SYMBOL_GPL(nfs_put_super);
-
-/*
  * Destroy an NFS2/3 superblock
  */
 void nfs_kill_super(struct super_block *s)
 {
 	struct nfs_server *server = NFS_SB(s);
+	dev_t dev = s->s_dev;
+
+	generic_shutdown_super(s);
 
-	kill_anon_super(s);
 	nfs_fscache_release_super_cookie(s);
+
 	nfs_free_server(server);
+	free_anon_bdev(dev);
 }
 EXPORT_SYMBOL_GPL(nfs_kill_super);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af3af685a9e3..88a6d2196ece 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -473,13 +473,18 @@ try_again:
 	do {
 		/*
 		 * Subrequests are always contiguous, non overlapping
-		 * and in order. If not, it's a programming error.
+		 * and in order - but may be repeated (mirrored writes).
 		 */
-		WARN_ON_ONCE(subreq->wb_offset !=
-		     (head->wb_offset + total_bytes));
-
-		/* keep track of how many bytes this group covers */
-		total_bytes += subreq->wb_bytes;
+		if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
+			/* keep track of how many bytes this group covers */
+			total_bytes += subreq->wb_bytes;
+		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
+			    ((subreq->wb_offset + subreq->wb_bytes) >
+			     (head->wb_offset + total_bytes)))) {
+			nfs_page_group_unlock(head);
+			spin_unlock(&inode->i_lock);
+			return ERR_PTR(-EIO);
+		}
 
 		if (!nfs_lock_request(subreq)) {
 			/* releases page group bit lock and
@@ -786,7 +791,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 	spin_unlock(cinfo->lock);
 	if (!cinfo->dreq) {
 		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+		inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 			     BDI_RECLAIMABLE);
 		__mark_inode_dirty(req->wb_context->dentry->d_inode,
 				   I_DIRTY_DATASYNC);
@@ -842,9 +847,9 @@ EXPORT_SYMBOL_GPL(nfs_init_cinfo);
  */
 void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
-			struct nfs_commit_info *cinfo)
+			struct nfs_commit_info *cinfo, u32 ds_commit_idx)
 {
-	if (pnfs_mark_request_commit(req, lseg, cinfo))
+	if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
 		return;
 	nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
 }
@@ -853,7 +858,7 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
 	dec_zone_page_state(page, NR_UNSTABLE_NFS);
-	dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+	dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
 }
 
 /* Called holding inode (/cinfo) lock */
@@ -900,7 +905,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 		}
 		if (nfs_write_need_commit(hdr)) {
 			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
-			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+				hdr->pgio_mirror_idx);
 			goto next;
 		}
 remove_req:
@@ -1091,6 +1097,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct nfs_lock_context *l_ctx;
+	struct file_lock_context *flctx = file_inode(file)->i_flctx;
 	struct nfs_page	*req;
 	int do_flush, status;
 	/*
@@ -1109,7 +1116,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		do_flush = req->wb_page != page || req->wb_context != ctx;
 		/* for now, flush if more than 1 request in page_group */
 		do_flush |= req->wb_this_page != req;
-		if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
+		if (l_ctx && flctx &&
+		    !(list_empty_careful(&flctx->flc_posix) &&
+		      list_empty_careful(&flctx->flc_flock))) {
 			do_flush |= l_ctx->lockowner.l_owner != current->files
 				|| l_ctx->lockowner.l_pid != current->tgid;
 		}
@@ -1170,6 +1179,13 @@ out:
 	return PageUptodate(page) != 0;
 }
 
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+	return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
+			fl->fl_type == F_WRLCK;
+}
+
 /* If we know the page is up to date, and we're not using byte range locks (or
  * if we have the whole file locked for writing), it may be more efficient to
  * extend the write to cover the entire page in order to avoid fragmentation
@@ -1180,17 +1196,36 @@ out:
  */
 static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
 {
+	int ret;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct file_lock *fl;
+
 	if (file->f_flags & O_DSYNC)
 		return 0;
 	if (!nfs_write_pageuptodate(page, inode))
 		return 0;
 	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
 		return 1;
-	if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
-			inode->i_flock->fl_end == OFFSET_MAX &&
-			inode->i_flock->fl_type != F_RDLCK))
-		return 1;
-	return 0;
+	if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
+		       list_empty_careful(&flctx->flc_posix)))
+		return 0;
+
+	/* Check to see if there are whole file write locks */
+	ret = 0;
+	spin_lock(&flctx->flc_lock);
+	if (!list_empty(&flctx->flc_posix)) {
+		fl = list_first_entry(&flctx->flc_posix, struct file_lock,
+					fl_list);
+		if (is_whole_file_wrlock(fl))
+			ret = 1;
+	} else if (!list_empty(&flctx->flc_flock)) {
+		fl = list_first_entry(&flctx->flc_flock, struct file_lock,
+					fl_list);
+		if (fl->fl_type == F_WRLCK)
+			ret = 1;
+	}
+	spin_unlock(&flctx->flc_lock);
+	return ret;
 }
 
 /*
@@ -1240,15 +1275,15 @@ static int flush_task_priority(int how)
 
 static void nfs_initiate_write(struct nfs_pgio_header *hdr,
 			       struct rpc_message *msg,
+			       const struct nfs_rpc_ops *rpc_ops,
 			       struct rpc_task_setup *task_setup_data, int how)
 {
-	struct inode *inode = hdr->inode;
 	int priority = flush_task_priority(how);
 
 	task_setup_data->priority = priority;
-	NFS_PROTO(inode)->write_setup(hdr, msg);
+	rpc_ops->write_setup(hdr, msg);
 
-	nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
+	nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
 				&task_setup_data->rpc_client, msg, hdr);
 }
 
@@ -1298,8 +1333,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
 
 void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 {
+	struct nfs_pgio_mirror *mirror;
+
 	pgio->pg_ops = &nfs_pgio_rw_ops;
-	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
+
+	nfs_pageio_stop_mirroring(pgio);
+
+	mirror = &pgio->pg_mirrors[0];
+	mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
 
@@ -1465,6 +1506,7 @@ void nfs_commitdata_release(struct nfs_commit_data *data)
 EXPORT_SYMBOL_GPL(nfs_commitdata_release);
 
 int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
+			const struct nfs_rpc_ops *nfs_ops,
 			const struct rpc_call_ops *call_ops,
 			int how, int flags)
 {
@@ -1486,7 +1528,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 		.priority = priority,
 	};
 	/* Set up the initial task struct.  */
-	NFS_PROTO(data->inode)->commit_setup(data, &msg);
+	nfs_ops->commit_setup(data, &msg);
 
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
@@ -1554,17 +1596,18 @@ EXPORT_SYMBOL_GPL(nfs_init_commit);
 
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
-		      struct nfs_commit_info *cinfo)
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx)
 {
 	struct nfs_page	*req;
 
 	while (!list_empty(page_list)) {
 		req = nfs_list_entry(page_list->next);
 		nfs_list_remove_request(req);
-		nfs_mark_request_commit(req, lseg, cinfo);
+		nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
 		if (!cinfo->dreq) {
 			dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-			dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+			dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 				     BDI_RECLAIMABLE);
 		}
 		nfs_unlock_and_release_request(req);
@@ -1589,10 +1632,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL, cinfo);
 	atomic_inc(&cinfo->mds->rpcs_out);
-	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops,
-				   how, 0);
+	return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
+				   data->mds_ops, how, 0);
  out_bad:
-	nfs_retry_commit(head, NULL, cinfo);
+	nfs_retry_commit(head, NULL, cinfo, 0);
 	cinfo->completion_ops->error_cleanup(NFS_I(inode));
 	return -ENOMEM;
 }
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
 
 	  If unsure, say N.
 
+config NFSD_PNFS
+	bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+	depends on NFSD_V4
+	help
+	  This option enables support for the parallel NFS features of the
+	  minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+	  server.
+
+	  If unsure, say N.
+
 config NFSD_V4_SECURITY_LABEL
 	bool "Provide Security Label support for NFSv4 server"
 	depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
 # Makefile for the Linux nfs server
 #
 
+ccflags-y += -I$(src)			# needed for trace events
+
 obj-$(CONFIG_NFSD)	+= nfsd.o
 
-nfsd-y			:= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y			+= trace.o
+
+nfsd-y			+= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
 			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/exportfs.h>
5#include <linux/genhd.h>
6#include <linux/slab.h>
7
8#include <linux/nfsd/debug.h>
9
10#include "blocklayoutxdr.h"
11#include "pnfs.h"
12
13#define NFSDDBG_FACILITY NFSDDBG_PNFS
14
15
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args)
50{
51 struct nfsd4_layout_seg *seg = &args->lg_seg;
52 struct super_block *sb = inode->i_sb;
53 u32 block_size = (1 << inode->i_blkbits);
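	/* e.g. i_blkbits == 12 gives a 4096-byte filesystem block size */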
54 struct pnfs_block_extent *bex;
55 struct iomap iomap;
56 u32 device_generation = 0;
57 int error;
58
59 /*
60 * We do not attempt to support I/O smaller than the fs block size,
61 * or not aligned to it.
62 */
63 if (args->lg_minlength < block_size) {
64 dprintk("pnfsd: I/O too small\n");
65 goto out_layoutunavailable;
66 }
67 if (seg->offset & (block_size - 1)) {
68 dprintk("pnfsd: I/O misaligned\n");
69 goto out_layoutunavailable;
70 }
71
72 /*
73 * Some clients barf on non-zero block numbers for NONE or INVALID
74 * layouts, so make sure to zero the whole structure.
75 */
76 error = -ENOMEM;
77 bex = kzalloc(sizeof(*bex), GFP_KERNEL);
78 if (!bex)
79 goto out_error;
80 args->lg_content = bex;
81
82 error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
83 &iomap, seg->iomode != IOMODE_READ,
84 &device_generation);
85 if (error) {
86 if (error == -ENXIO)
87 goto out_layoutunavailable;
88 goto out_error;
89 }
90
91 if (iomap.length < args->lg_minlength) {
92 dprintk("pnfsd: extent smaller than minlength\n");
93 goto out_layoutunavailable;
94 }
95
96 switch (iomap.type) {
97 case IOMAP_MAPPED:
98 if (seg->iomode == IOMODE_READ)
99 bex->es = PNFS_BLOCK_READ_DATA;
100 else
101 bex->es = PNFS_BLOCK_READWRITE_DATA;
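		/* iomap.blkno is in 512-byte sectors; shift it to a byte offset */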
102 bex->soff = (iomap.blkno << 9);
103 break;
104 case IOMAP_UNWRITTEN:
105 if (seg->iomode & IOMODE_RW) {
106 /*
107 * Crack monkey special case from section 2.3.1.
108 */
109 if (args->lg_minlength == 0) {
110 dprintk("pnfsd: no soup for you!\n");
111 goto out_layoutunavailable;
112 }
113
114 bex->es = PNFS_BLOCK_INVALID_DATA;
115 bex->soff = (iomap.blkno << 9);
116 break;
117 }
118 /*FALLTHRU*/
119 case IOMAP_HOLE:
120 if (seg->iomode == IOMODE_READ) {
121 bex->es = PNFS_BLOCK_NONE_DATA;
122 break;
123 }
124 /*FALLTHRU*/
125 case IOMAP_DELALLOC:
126 default:
127 WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
128 goto out_layoutunavailable;
129 }
130
131 error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
132 if (error)
133 goto out_error;
134 bex->foff = iomap.offset;
135 bex->len = iomap.length;
136
137 seg->offset = iomap.offset;
138 seg->length = iomap.length;
139
140 dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
141 return 0;
142
143out_error:
144 seg->length = 0;
145 return nfserrno(error);
146out_layoutunavailable:
147 seg->length = 0;
148 return nfserr_layoutunavailable;
149}
150
151static __be32
152nfsd4_block_proc_layoutcommit(struct inode *inode,
153 struct nfsd4_layoutcommit *lcp)
154{
155 loff_t new_size = lcp->lc_last_wr + 1;
156 struct iattr iattr = { .ia_valid = 0 };
157 struct iomap *iomaps;
158 int nr_iomaps;
159 int error;
160
161 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
162 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
163 if (nr_iomaps < 0)
164 return nfserrno(nr_iomaps);
165
166 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
167 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
168 lcp->lc_mtime = current_fs_time(inode->i_sb);
169 iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
170 iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
171
172 if (new_size > i_size_read(inode)) {
173 iattr.ia_valid |= ATTR_SIZE;
174 iattr.ia_size = new_size;
175 }
176
177 error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
178 nr_iomaps, &iattr);
179 kfree(iomaps);
180 return nfserrno(error);
181}
182
183const struct nfsd4_layout_ops bl_layout_ops = {
184 .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
185 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
186 .proc_layoutget = nfsd4_block_proc_layoutget,
187 .encode_layoutget = nfsd4_block_encode_layoutget,
188 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
189};
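/*
 * Editorial sketch (not part of the original commit): the export_operations
 * hooks an exporting filesystem must provide before nfsd4_setup_layout_type()
 * will offer the block layout for it.  The example_* names are hypothetical.
 */
#if 0
static const struct export_operations example_export_ops = {
	/* ... the usual filehandle/dentry methods ... */
	.get_uuid	= example_get_uuid,	/* volume signature for GETDEVICEINFO */
	.map_blocks	= example_map_blocks,	/* extent lookup for LAYOUTGET */
	.commit_blocks	= example_commit_blocks, /* extent conversion for LAYOUTCOMMIT */
};
#endif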
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h>
6#include <linux/nfs4.h>
7
8#include "nfsd.h"
9#include "blocklayoutxdr.h"
10
11#define NFSDDBG_FACILITY NFSDDBG_PNFS
12
13
14__be32
15nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
16 struct nfsd4_layoutget *lgp)
17{
18 struct pnfs_block_extent *b = lgp->lg_content;
19 int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
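	/* one extent-count word + deviceid (16 bytes) + foff/len/soff hypers
	 * (24 bytes) + one extent-state word: 4 + 40 + 4 == 48 bytes */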
20 __be32 *p;
21
22 p = xdr_reserve_space(xdr, sizeof(__be32) + len);
23 if (!p)
24 return nfserr_toosmall;
25
26 *p++ = cpu_to_be32(len);
27 *p++ = cpu_to_be32(1); /* we always return a single extent */
28
29 p = xdr_encode_opaque_fixed(p, &b->vol_id,
30 sizeof(struct nfsd4_deviceid));
31 p = xdr_encode_hyper(p, b->foff);
32 p = xdr_encode_hyper(p, b->len);
33 p = xdr_encode_hyper(p, b->soff);
34 *p++ = cpu_to_be32(b->es);
35 return 0;
36}
37
38static int
39nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
40{
41 __be32 *p;
42 int len;
43
44 switch (b->type) {
45 case PNFS_BLOCK_VOLUME_SIMPLE:
46 len = 4 + 4 + 8 + 4 + b->simple.sig_len;
47 p = xdr_reserve_space(xdr, len);
48 if (!p)
49 return -ETOOSMALL;
50
51 *p++ = cpu_to_be32(b->type);
52 *p++ = cpu_to_be32(1); /* single signature */
53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break;
56 default:
57 return -ENOTSUPP;
58 }
59
60 return len;
61}
62
63__be32
64nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
65 struct nfsd4_getdeviceinfo *gdp)
66{
67 struct pnfs_block_deviceaddr *dev = gdp->gd_device;
68 int len = sizeof(__be32), ret, i;
69 __be32 *p;
70
71 p = xdr_reserve_space(xdr, len + sizeof(__be32));
72 if (!p)
73 return nfserr_resource;
74
75 for (i = 0; i < dev->nr_volumes; i++) {
76 ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
77 if (ret < 0)
78 return nfserrno(ret);
79 len += ret;
80 }
81
82 /*
83 * Fill in the overall length and number of volumes at the beginning
84 * of the layout.
85 */
86 *p++ = cpu_to_be32(len);
87 *p++ = cpu_to_be32(dev->nr_volumes);
88 return 0;
89}
90
91int
92nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size)
94{
95 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i;
97
98 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL;
101 }
102
103 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
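	/* e.g. a single 44-byte extent makes expected == 4 + 44 == 48 */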
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected);
108 return -EINVAL;
109 }
110
111 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
112 if (!iomaps) {
113 dprintk("%s: failed to allocate extent array\n", __func__);
114 return -ENOMEM;
115 }
116
117 for (i = 0; i < nr_iomaps; i++) {
118 struct pnfs_block_extent bex;
119
120 memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
121 p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
122
123 p = xdr_decode_hyper(p, &bex.foff);
124 if (bex.foff & (block_size - 1)) {
125 dprintk("%s: unaligned offset %lld\n",
126 __func__, bex.foff);
127 goto fail;
128 }
129 p = xdr_decode_hyper(p, &bex.len);
130 if (bex.len & (block_size - 1)) {
131 dprintk("%s: unaligned length %lld\n",
132 __func__, bex.len);
133 goto fail;
134 }
135 p = xdr_decode_hyper(p, &bex.soff);
136 if (bex.soff & (block_size - 1)) {
137 dprintk("%s: unaligned disk offset %lld\n",
138 __func__, bex.soff);
139 goto fail;
140 }
141 bex.es = be32_to_cpup(p++);
142 if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
143 dprintk("%s: incorrect extent state %d\n",
144 __func__, bex.es);
145 goto fail;
146 }
147
148 iomaps[i].offset = bex.foff;
149 iomaps[i].length = bex.len;
150 }
151
152 *iomapp = iomaps;
153 return nr_iomaps;
154fail:
155 kfree(iomaps);
156 return -EINVAL;
157}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
1#ifndef _NFSD_BLOCKLAYOUTXDR_H
2#define _NFSD_BLOCKLAYOUTXDR_H 1
3
4#include <linux/blkdev.h>
5#include "xdr4.h"
6
7struct iomap;
8struct xdr_stream;
9
10enum pnfs_block_extent_state {
11 PNFS_BLOCK_READWRITE_DATA = 0,
12 PNFS_BLOCK_READ_DATA = 1,
13 PNFS_BLOCK_INVALID_DATA = 2,
14 PNFS_BLOCK_NONE_DATA = 3,
15};
16
17struct pnfs_block_extent {
18 struct nfsd4_deviceid vol_id;
19 u64 foff;
20 u64 len;
21 u64 soff;
22 enum pnfs_block_extent_state es;
23};
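/* 44 == deviceid (16) + foff (8) + len (8) + soff (8) + extent state (4) */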
24#define NFS4_BLOCK_EXTENT_SIZE 44
25
26enum pnfs_block_volume_type {
27 PNFS_BLOCK_VOLUME_SIMPLE = 0,
28 PNFS_BLOCK_VOLUME_SLICE = 1,
29 PNFS_BLOCK_VOLUME_CONCAT = 2,
30 PNFS_BLOCK_VOLUME_STRIPE = 3,
31};
32
33/*
34 * Arbitrary upper cap on the uuid length to avoid unbounded allocation.
35 * Not actually limited by the protocol.
36 */
37#define PNFS_BLOCK_UUID_LEN 128
38
39struct pnfs_block_volume {
40 enum pnfs_block_volume_type type;
41 union {
42 struct {
43 u64 offset;
44 u32 sig_len;
45 u8 sig[PNFS_BLOCK_UUID_LEN];
46 } simple;
47 };
48};
49
50struct pnfs_block_deviceaddr {
51 u32 nr_volumes;
52 struct pnfs_block_volume volumes[];
53};
54
55__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
56 struct nfsd4_getdeviceinfo *gdp);
57__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
58 struct nfsd4_layoutget *lgp);
59int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
60 u32 block_size);
61
62#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
 #include "nfsd.h"
 #include "nfsfh.h"
 #include "netns.h"
+#include "pnfs.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_client = dom;
 	exp.cd = cd;
+	exp.ex_devid_map = NULL;
 
 	/* expiry */
 	err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		if (!gid_valid(exp.ex_anon_gid))
 			goto out4;
 		err = 0;
+
+		nfsd4_setup_layout_type(&exp);
 	}
 
 	expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
+	new->ex_layout_type = 0;
 	new->ex_uuid = NULL;
 	new->cd = item->cd;
 }
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_anon_uid = item->ex_anon_uid;
 	new->ex_anon_gid = item->ex_anon_gid;
 	new->ex_fsid = item->ex_fsid;
+	new->ex_devid_map = item->ex_devid_map;
+	item->ex_devid_map = NULL;
 	new->ex_uuid = item->ex_uuid;
 	item->ex_uuid = NULL;
 	new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	item->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
 	item->ex_fslocs.migrated = 0;
+	new->ex_layout_type = item->ex_layout_type;
 	new->ex_nflavors = item->ex_nflavors;
 	for (i = 0; i < MAX_SECINFO_LIST; i++) {
 		new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
 	struct nfsd4_fs_locations ex_fslocs;
 	uint32_t		ex_nflavors;
 	struct exp_flavor_info	ex_flavors[MAX_SECINFO_LIST];
+	enum pnfs_layouttype	ex_layout_type;
+	struct nfsd4_deviceid_map *ex_devid_map;
 	struct cache_detail	*cd;
 };
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
 	return status;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ *	struct layoutrecall_file4 {
+ *		nfs_fh4			lor_fh;
+ *		offset4			lor_offset;
+ *		length4			lor_length;
+ *		stateid4		lor_stateid;
+ *	};
+ *
+ *	union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ *	case LAYOUTRECALL4_FILE:
+ *		layoutrecall_file4 lor_layout;
+ *	case LAYOUTRECALL4_FSID:
+ *		fsid4              lor_fsid;
+ *	case LAYOUTRECALL4_ALL:
+ *		void;
+ *	};
+ *
+ *	struct CB_LAYOUTRECALL4args {
+ *		layouttype4		clora_type;
+ *		layoutiomode4		clora_iomode;
+ *		bool			clora_changed;
+ *		layoutrecall4		clora_recall;
+ *	};
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+				  const struct nfs4_layout_stateid *ls,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	BUG_ON(hdr->minorversion == 0);
+
+	p = xdr_reserve_space(xdr, 5 * 4);
+	*p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+	*p++ = cpu_to_be32(ls->ls_layout_type);
+	*p++ = cpu_to_be32(IOMODE_ANY);
+	*p++ = cpu_to_be32(1);
+	*p = cpu_to_be32(RETURN_FILE);
+
+	encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+
+	p = xdr_reserve_space(xdr, 2 * 8);
+	p = xdr_encode_hyper(p, 0);
+	xdr_encode_hyper(p, NFS4_MAX_UINT64);
+
+	encode_stateid4(xdr, &ls->ls_recall_sid);
+
+	hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfsd4_callback *cb)
+{
+	const struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_layout4args(xdr, ls, &hdr);
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	enum nfsstat4 nfserr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		goto out;
+	if (cb) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status))
+			goto out;
+	}
+	status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+	if (unlikely(status))
+		goto out;
+	if (unlikely(nfserr != NFS4_OK))
+		status = nfs_cb_stat_to_errno(nfserr);
+out:
+	return status;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 /*
  * RPC procedure tables
  */
@@ -563,6 +659,9 @@ out:
 static struct rpc_procinfo nfs4_cb_procedures[] = {
 	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
 	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
+#ifdef CONFIG_NFSD_PNFS
+	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
+#endif
 };
 
 static struct rpc_version nfs_cb_version4 = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/kmod.h>
5#include <linux/file.h>
6#include <linux/jhash.h>
7#include <linux/sched.h>
8#include <linux/sunrpc/addr.h>
9
10#include "pnfs.h"
11#include "netns.h"
12#include "trace.h"
13
14#define NFSDDBG_FACILITY NFSDDBG_PNFS
15
16struct nfs4_layout {
17 struct list_head lo_perstate;
18 struct nfs4_layout_stateid *lo_state;
19 struct nfsd4_layout_seg lo_seg;
20};
21
22static struct kmem_cache *nfs4_layout_cache;
23static struct kmem_cache *nfs4_layout_stateid_cache;
24
25static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
30};
31
32/* pNFS device ID to export fsid mapping */
33#define DEVID_HASH_BITS 8
34#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
36static u64 nfsd_devid_seq = 1;
37static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
38static DEFINE_SPINLOCK(nfsd_devid_lock);
39
40static inline u32 devid_hashfn(u64 idx)
41{
42 return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
43}
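
A worked example of the bucket computation, as a sketch:

	/*
	 * idx = 0x100000002ULL
	 *   -> jhash_2words(0x00000002, 0x00000001, 0) & DEVID_HASH_MASK
	 * i.e. the u64 index is hashed as its two 32-bit halves, then
	 * masked down to one of DEVID_HASH_SIZE (256) buckets.
	 */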
44
45static void
46nfsd4_alloc_devid_map(const struct svc_fh *fhp)
47{
48 const struct knfsd_fh *fh = &fhp->fh_handle;
49 size_t fsid_len = key_len(fh->fh_fsid_type);
50 struct nfsd4_deviceid_map *map, *old;
51 int i;
52
53 map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
54 if (!map)
55 return;
56
57 map->fsid_type = fh->fh_fsid_type;
58 memcpy(&map->fsid, fh->fh_fsid, fsid_len);
59
60 spin_lock(&nfsd_devid_lock);
61 if (fhp->fh_export->ex_devid_map)
62 goto out_unlock;
63
64 for (i = 0; i < DEVID_HASH_SIZE; i++) {
65 list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
66 if (old->fsid_type != fh->fh_fsid_type)
67 continue;
68 if (memcmp(old->fsid, fh->fh_fsid,
69 key_len(old->fsid_type)))
70 continue;
71
72 fhp->fh_export->ex_devid_map = old;
73 goto out_unlock;
74 }
75 }
76
77 map->idx = nfsd_devid_seq++;
78 list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
79 fhp->fh_export->ex_devid_map = map;
80 map = NULL;
81
82out_unlock:
83 spin_unlock(&nfsd_devid_lock);
84 kfree(map);
85}
86
87struct nfsd4_deviceid_map *
88nfsd4_find_devid_map(int idx)
89{
90 struct nfsd4_deviceid_map *map, *ret = NULL;
91
92 rcu_read_lock();
93 list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
94 if (map->idx == idx)
95 ret = map;
96 rcu_read_unlock();
97
98 return ret;
99}
100
101int
102nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
103 u32 device_generation)
104{
105 if (!fhp->fh_export->ex_devid_map) {
106 nfsd4_alloc_devid_map(fhp);
107 if (!fhp->fh_export->ex_devid_map)
108 return -ENOMEM;
109 }
110
111 id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
112 id->generation = device_generation;
113 id->pad = 0;
114 return 0;
115}
116
117void nfsd4_setup_layout_type(struct svc_export *exp)
118{
119 struct super_block *sb = exp->ex_path.mnt->mnt_sb;
120
121 if (exp->ex_flags & NFSEXP_NOPNFS)
122 return;
123
124 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
128}
129
130static void
131nfsd4_free_layout_stateid(struct nfs4_stid *stid)
132{
133 struct nfs4_layout_stateid *ls = layoutstateid(stid);
134 struct nfs4_client *clp = ls->ls_stid.sc_client;
135 struct nfs4_file *fp = ls->ls_stid.sc_file;
136
137 trace_layoutstate_free(&ls->ls_stid.sc_stateid);
138
139 spin_lock(&clp->cl_lock);
140 list_del_init(&ls->ls_perclnt);
141 spin_unlock(&clp->cl_lock);
142
143 spin_lock(&fp->fi_lock);
144 list_del_init(&ls->ls_perfile);
145 spin_unlock(&fp->fi_lock);
146
147 vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
148 fput(ls->ls_file);
149
150 if (ls->ls_recalled)
151 atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
152
153 kmem_cache_free(nfs4_layout_stateid_cache, ls);
154}
155
156static int
157nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
158{
159 struct file_lock *fl;
160 int status;
161
162 fl = locks_alloc_lock();
163 if (!fl)
164 return -ENOMEM;
165 locks_init_lock(fl);
166 fl->fl_lmops = &nfsd4_layouts_lm_ops;
167 fl->fl_flags = FL_LAYOUT;
168 fl->fl_type = F_RDLCK;
169 fl->fl_end = OFFSET_MAX;
170 fl->fl_owner = ls;
171 fl->fl_pid = current->tgid;
172 fl->fl_file = ls->ls_file;
173
174 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
175 if (status) {
176 locks_free_lock(fl);
177 return status;
178 }
179 BUG_ON(fl != NULL);
180 return 0;
181}
182
183static struct nfs4_layout_stateid *
184nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
185 struct nfs4_stid *parent, u32 layout_type)
186{
187 struct nfs4_client *clp = cstate->clp;
188 struct nfs4_file *fp = parent->sc_file;
189 struct nfs4_layout_stateid *ls;
190 struct nfs4_stid *stp;
191
192 stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
193 if (!stp)
194 return NULL;
195 stp->sc_free = nfsd4_free_layout_stateid;
196 get_nfs4_file(fp);
197 stp->sc_file = fp;
198
199 ls = layoutstateid(stp);
200 INIT_LIST_HEAD(&ls->ls_perclnt);
201 INIT_LIST_HEAD(&ls->ls_perfile);
202 spin_lock_init(&ls->ls_lock);
203 INIT_LIST_HEAD(&ls->ls_layouts);
204 ls->ls_layout_type = layout_type;
205 nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
206 NFSPROC4_CLNT_CB_LAYOUT);
207
208 if (parent->sc_type == NFS4_DELEG_STID)
209 ls->ls_file = get_file(fp->fi_deleg_file);
210 else
211 ls->ls_file = find_any_file(fp);
212 BUG_ON(!ls->ls_file);
213
214 if (nfsd4_layout_setlease(ls)) {
215 fput(ls->ls_file); /* drop the file reference taken above */
216 put_nfs4_file(fp);
217 kmem_cache_free(nfs4_layout_stateid_cache, ls);
218 return NULL;
219 }
219
220 spin_lock(&clp->cl_lock);
221 stp->sc_type = NFS4_LAYOUT_STID;
222 list_add(&ls->ls_perclnt, &clp->cl_lo_states);
223 spin_unlock(&clp->cl_lock);
224
225 spin_lock(&fp->fi_lock);
226 list_add(&ls->ls_perfile, &fp->fi_lo_states);
227 spin_unlock(&fp->fi_lock);
228
229 trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
230 return ls;
231}
232
233__be32
234nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
235 struct nfsd4_compound_state *cstate, stateid_t *stateid,
236 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
237{
238 struct nfs4_layout_stateid *ls;
239 struct nfs4_stid *stid;
240 unsigned char typemask = NFS4_LAYOUT_STID;
241 __be32 status;
242
243 if (create)
244 typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
245
246 status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
247 net_generic(SVC_NET(rqstp), nfsd_net_id));
248 if (status)
249 goto out;
250
251 if (!fh_match(&cstate->current_fh.fh_handle,
252 &stid->sc_file->fi_fhandle)) {
253 status = nfserr_bad_stateid;
254 goto out_put_stid;
255 }
256
257 if (stid->sc_type != NFS4_LAYOUT_STID) {
258 ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
259 nfs4_put_stid(stid);
260
261 status = nfserr_jukebox;
262 if (!ls)
263 goto out;
264 } else {
265 ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
266
267 status = nfserr_bad_stateid;
268 if (stateid->si_generation > stid->sc_stateid.si_generation)
269 goto out_put_stid;
270 if (layout_type != ls->ls_layout_type)
271 goto out_put_stid;
272 }
273
274 *lsp = ls;
275 return 0;
276
277out_put_stid:
278 nfs4_put_stid(stid);
279out:
280 return status;
281}
282
283static void
284nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
285{
286 spin_lock(&ls->ls_lock);
287 if (ls->ls_recalled)
288 goto out_unlock;
289
290 ls->ls_recalled = true;
291 atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
292 if (list_empty(&ls->ls_layouts))
293 goto out_unlock;
294
295 trace_layout_recall(&ls->ls_stid.sc_stateid);
296
297 atomic_inc(&ls->ls_stid.sc_count);
298 update_stateid(&ls->ls_stid.sc_stateid);
299 memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
300 nfsd4_run_cb(&ls->ls_recall);
301
302out_unlock:
303 spin_unlock(&ls->ls_lock);
304}
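
Notes on the bookkeeping above:

	/*
	 * ls_recalled bounds each layout stateid to at most one outstanding
	 * recall, and fi_lo_recalls makes new LAYOUTGETs on the file fail
	 * with NFS4ERR_RECALLCONFLICT until the recall finishes. The extra
	 * sc_count reference is owned by the callback and dropped in
	 * nfsd4_cb_layout_release(); the stateid is bumped before the copy
	 * so ls_recall_sid carries the post-recall seqid.
	 */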
305
306static inline u64
307layout_end(struct nfsd4_layout_seg *seg)
308{
309 u64 end = seg->offset + seg->length;
310 return end >= seg->offset ? end : NFS4_MAX_UINT64;
311}
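
A worked example of the saturating arithmetic, as a sketch:

	/*
	 * layout_end() saturates instead of wrapping:
	 *   offset = 0x10, length = NFS4_MAX_UINT64:
	 *	the u64 sum wraps below offset, so the helper returns
	 *	NFS4_MAX_UINT64, i.e. "to end of file";
	 *   offset = 0x10, length = 0x20: end = 0x30, returned as-is.
	 */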
312
313static void
314layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
315{
316 if (end == NFS4_MAX_UINT64)
317 lo->length = NFS4_MAX_UINT64;
318 else
319 lo->length = end - lo->offset;
320}
321
322static bool
323layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
324{
325 if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
326 return false;
327 if (layout_end(&lo->lo_seg) <= s->offset)
328 return false;
329 if (layout_end(s) <= lo->lo_seg.offset)
330 return false;
331 return true;
332}
333
334static bool
335layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
336{
337 if (lo->iomode != new->iomode)
338 return false;
339 if (layout_end(new) < lo->offset)
340 return false;
341 if (layout_end(lo) < new->offset)
342 return false;
343
344 lo->offset = min(lo->offset, new->offset);
345 layout_update_len(lo, max(layout_end(lo), layout_end(new)));
346 return true;
347}
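
Sketch of how the merge behaves for two segments of the same iomode, using exclusive end offsets:

	/*
	 *   [0,100) + [50,200)  -> [0,200)  (overlapping ranges)
	 *   [0,100) + [100,200) -> [0,200)  (adjacent ranges merge too,
	 *					since both comparisons above
	 *					are non-strict)
	 *   [0,100) + [200,300) -> no merge (layout_end(lo) < new->offset)
	 */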
348
349static __be32
350nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
351{
352 struct nfs4_file *fp = ls->ls_stid.sc_file;
353 struct nfs4_layout_stateid *l, *n;
354 __be32 nfserr = nfs_ok;
355
356 assert_spin_locked(&fp->fi_lock);
357
358 list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
359 if (l != ls) {
360 nfsd4_recall_file_layout(l);
361 nfserr = nfserr_recallconflict;
362 }
363 }
364
365 return nfserr;
366}
367
368__be32
369nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
370{
371 struct nfsd4_layout_seg *seg = &lgp->lg_seg;
372 struct nfs4_file *fp = ls->ls_stid.sc_file;
373 struct nfs4_layout *lp, *new = NULL;
374 __be32 nfserr;
375
376 spin_lock(&fp->fi_lock);
377 nfserr = nfsd4_recall_conflict(ls);
378 if (nfserr)
379 goto out;
380 spin_lock(&ls->ls_lock);
381 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
382 if (layouts_try_merge(&lp->lo_seg, seg))
383 goto done;
384 }
385 spin_unlock(&ls->ls_lock);
386 spin_unlock(&fp->fi_lock);
387
388 new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
389 if (!new)
390 return nfserr_jukebox;
391 memcpy(&new->lo_seg, seg, sizeof(new->lo_seg));
392 new->lo_state = ls;
393
394 spin_lock(&fp->fi_lock);
395 nfserr = nfsd4_recall_conflict(ls);
396 if (nfserr)
397 goto out;
398 spin_lock(&ls->ls_lock);
399 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
400 if (layouts_try_merge(&lp->lo_seg, seg))
401 goto done;
402 }
403
404 atomic_inc(&ls->ls_stid.sc_count);
405 list_add_tail(&new->lo_perstate, &ls->ls_layouts);
406 new = NULL;
407done:
408 update_stateid(&ls->ls_stid.sc_stateid);
409 memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
410 spin_unlock(&ls->ls_lock);
411out:
412 spin_unlock(&fp->fi_lock);
413 if (new)
414 kmem_cache_free(nfs4_layout_cache, new);
415 return nfserr;
416}
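
The duplicated conflict/merge check above is the usual drop-locks-to-allocate shape; a minimal sketch of the pattern, with generic names rather than nfsd API:

	/*
	 *	lock();
	 *	if (fast_path())		// conflict or mergeable entry
	 *		goto done;
	 *	unlock();
	 *	new = alloc(GFP_KERNEL);	// may sleep: no spinlocks held
	 *	lock();
	 *	if (fast_path())		// state may have changed
	 *		goto done;
	 *	link(new); new = NULL;
	 * done:
	 *	unlock();
	 *	if (new)			// still set only if not linked
	 *		free(new);
	 */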
417
418static void
419nfsd4_free_layouts(struct list_head *reaplist)
420{
421 while (!list_empty(reaplist)) {
422 struct nfs4_layout *lp = list_first_entry(reaplist,
423 struct nfs4_layout, lo_perstate);
424
425 list_del(&lp->lo_perstate);
426 nfs4_put_stid(&lp->lo_state->ls_stid);
427 kmem_cache_free(nfs4_layout_cache, lp);
428 }
429}
430
431static void
432nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
433 struct list_head *reaplist)
434{
435 struct nfsd4_layout_seg *lo = &lp->lo_seg;
436 u64 end = layout_end(lo);
437
438 if (seg->offset <= lo->offset) {
439 if (layout_end(seg) >= end) {
440 list_move_tail(&lp->lo_perstate, reaplist);
441 return;
442 }
443 lo->offset = layout_end(seg);
444 } else {
445 /* retain the whole layout segment on a split. */
446 if (layout_end(seg) < end) {
447 dprintk("%s: split not supported\n", __func__);
448 return;
449 }
450
451 end = seg->offset;
452 }
453
454 layout_update_len(lo, end);
455}
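
The trim logic above works out as follows for a held segment [100,200) (end exclusive), as a sketch:

	/*
	 *   return [ 50,250): covers the whole segment, moved to reaplist;
	 *   return [ 50,150): head trim, held segment becomes [150,200);
	 *   return [150,250): tail trim, held segment becomes [100,150);
	 *   return [120,180): interior split, unsupported, kept whole.
	 */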
456
457__be32
458nfsd4_return_file_layouts(struct svc_rqst *rqstp,
459 struct nfsd4_compound_state *cstate,
460 struct nfsd4_layoutreturn *lrp)
461{
462 struct nfs4_layout_stateid *ls;
463 struct nfs4_layout *lp, *n;
464 LIST_HEAD(reaplist);
465 __be32 nfserr;
466 int found = 0;
467
468 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
469 false, lrp->lr_layout_type,
470 &ls);
471 if (nfserr) {
472 trace_layout_return_lookup_fail(&lrp->lr_sid);
473 return nfserr;
474 }
475
476 spin_lock(&ls->ls_lock);
477 list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
478 if (layouts_overlapping(lp, &lrp->lr_seg)) {
479 nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
480 found++;
481 }
482 }
483 if (!list_empty(&ls->ls_layouts)) {
484 if (found) {
485 update_stateid(&ls->ls_stid.sc_stateid);
486 memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
487 sizeof(stateid_t));
488 }
489 lrp->lrs_present = 1;
490 } else {
491 trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
492 nfs4_unhash_stid(&ls->ls_stid);
493 lrp->lrs_present = 0;
494 }
495 spin_unlock(&ls->ls_lock);
496
497 nfs4_put_stid(&ls->ls_stid);
498 nfsd4_free_layouts(&reaplist);
499 return nfs_ok;
500}
501
502__be32
503nfsd4_return_client_layouts(struct svc_rqst *rqstp,
504 struct nfsd4_compound_state *cstate,
505 struct nfsd4_layoutreturn *lrp)
506{
507 struct nfs4_layout_stateid *ls, *n;
508 struct nfs4_client *clp = cstate->clp;
509 struct nfs4_layout *lp, *t;
510 LIST_HEAD(reaplist);
511
512 lrp->lrs_present = 0;
513
514 spin_lock(&clp->cl_lock);
515 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
516 if (lrp->lr_return_type == RETURN_FSID &&
517 !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
518 &cstate->current_fh.fh_handle))
519 continue;
520
521 spin_lock(&ls->ls_lock);
522 list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
523 if (lrp->lr_seg.iomode == IOMODE_ANY ||
524 lrp->lr_seg.iomode == lp->lo_seg.iomode)
525 list_move_tail(&lp->lo_perstate, &reaplist);
526 }
527 spin_unlock(&ls->ls_lock);
528 }
529 spin_unlock(&clp->cl_lock);
530
531 nfsd4_free_layouts(&reaplist);
532 return 0;
533}
534
535static void
536nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
537 struct list_head *reaplist)
538{
539 spin_lock(&ls->ls_lock);
540 list_splice_init(&ls->ls_layouts, reaplist);
541 spin_unlock(&ls->ls_lock);
542}
543
544void
545nfsd4_return_all_client_layouts(struct nfs4_client *clp)
546{
547 struct nfs4_layout_stateid *ls, *n;
548 LIST_HEAD(reaplist);
549
550 spin_lock(&clp->cl_lock);
551 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
552 nfsd4_return_all_layouts(ls, &reaplist);
553 spin_unlock(&clp->cl_lock);
554
555 nfsd4_free_layouts(&reaplist);
556}
557
558void
559nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
560{
561 struct nfs4_layout_stateid *ls, *n;
562 LIST_HEAD(reaplist);
563
564 spin_lock(&fp->fi_lock);
565 list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
566 if (ls->ls_stid.sc_client == clp)
567 nfsd4_return_all_layouts(ls, &reaplist);
568 }
569 spin_unlock(&fp->fi_lock);
570
571 nfsd4_free_layouts(&reaplist);
572}
573
574static void
575nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
576{
577 struct nfs4_client *clp = ls->ls_stid.sc_client;
578 char addr_str[INET6_ADDRSTRLEN];
579 static char *envp[] = {
580 "HOME=/",
581 "TERM=linux",
582 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
583 NULL
584 };
585 char *argv[8];
586 int error;
587
588 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
589
592 printk(KERN_WARNING
593 "nfsd: client %s failed to respond to layout recall. "
594 "Fencing...\n", addr_str);
595
596 argv[0] = "/sbin/nfsd-recall-failed";
597 argv[1] = addr_str;
598 argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
599 argv[3] = NULL;
600
601 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
602 if (error) {
603 printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
604 addr_str, error);
605 }
606}
607
608static int
609nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
610{
611 struct nfs4_layout_stateid *ls =
612 container_of(cb, struct nfs4_layout_stateid, ls_recall);
613 LIST_HEAD(reaplist);
614
615 switch (task->tk_status) {
616 case 0:
617 return 1;
618 case -NFS4ERR_NOMATCHING_LAYOUT:
619 trace_layout_recall_done(&ls->ls_stid.sc_stateid);
620 task->tk_status = 0;
621 return 1;
622 case -NFS4ERR_DELAY:
623 /* Poll the client until it's done with the layout */
624 /* FIXME: cap the number of retries.
625 * The pNFS standard states that we need to only expire
626 * the client after at least "lease time", e.g. lease-time * 2,
627 * when failing to communicate a recall.
628 */
629 rpc_delay(task, HZ/100); /* 10 milliseconds */
630 return 0;
631 default:
632 /*
633 * Unknown error or non-responding client, we'll need to fence.
634 */
635 nfsd4_cb_layout_fail(ls);
636 return -1;
637 }
638}
639
640static void
641nfsd4_cb_layout_release(struct nfsd4_callback *cb)
642{
643 struct nfs4_layout_stateid *ls =
644 container_of(cb, struct nfs4_layout_stateid, ls_recall);
645 LIST_HEAD(reaplist);
646
647 trace_layout_recall_release(&ls->ls_stid.sc_stateid);
648
649 nfsd4_return_all_layouts(ls, &reaplist);
650 nfsd4_free_layouts(&reaplist);
651 nfs4_put_stid(&ls->ls_stid);
652}
653
654static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
655 .done = nfsd4_cb_layout_done,
656 .release = nfsd4_cb_layout_release,
657};
658
659static bool
660nfsd4_layout_lm_break(struct file_lock *fl)
661{
662 /*
663 * We don't want the locks code to time out the lease for us;
664 * we'll remove it ourselves if a layout isn't returned
665 * in time:
666 */
667 fl->fl_break_time = 0;
668 nfsd4_recall_file_layout(fl->fl_owner);
669 return false;
670}
671
672static int
673nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
674 struct list_head *dispose)
675{
676 BUG_ON(!(arg & F_UNLCK));
677 return lease_modify(onlist, arg, dispose);
678}
679
680static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
681 .lm_break = nfsd4_layout_lm_break,
682 .lm_change = nfsd4_layout_lm_change,
683};
684
685int
686nfsd4_init_pnfs(void)
687{
688 int i;
689
690 for (i = 0; i < DEVID_HASH_SIZE; i++)
691 INIT_LIST_HEAD(&nfsd_devid_hash[i]);
692
693 nfs4_layout_cache = kmem_cache_create("nfs4_layout",
694 sizeof(struct nfs4_layout), 0, 0, NULL);
695 if (!nfs4_layout_cache)
696 return -ENOMEM;
697
698 nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
699 sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
700 if (!nfs4_layout_stateid_cache) {
701 kmem_cache_destroy(nfs4_layout_cache);
702 return -ENOMEM;
703 }
704 return 0;
705}
706
707void
708nfsd4_exit_pnfs(void)
709{
710 int i;
711
712 kmem_cache_destroy(nfs4_layout_cache);
713 kmem_cache_destroy(nfs4_layout_stateid_cache);
714
715 for (i = 0; i < DEVID_HASH_SIZE; i++) {
716 struct nfsd4_deviceid_map *map, *n;
717
718 list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
719 kfree(map);
720 }
721}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
43#include "current_stateid.h" 43#include "current_stateid.h"
44#include "netns.h" 44#include "netns.h"
45#include "acl.h" 45#include "acl.h"
46#include "pnfs.h"
47#include "trace.h"
46 48
47#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 49#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
48#include <linux/security.h> 50#include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1178 return status == nfserr_same ? nfs_ok : status; 1180 return status == nfserr_same ? nfs_ok : status;
1179} 1181}
1180 1182
1183#ifdef CONFIG_NFSD_PNFS
1184static const struct nfsd4_layout_ops *
1185nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
1186{
1187 if (!exp->ex_layout_type) {
1188 dprintk("%s: export does not support pNFS\n", __func__);
1189 return NULL;
1190 }
1191
1192 if (exp->ex_layout_type != layout_type) {
1193 dprintk("%s: layout type %d not supported\n",
1194 __func__, layout_type);
1195 return NULL;
1196 }
1197
1198 return nfsd4_layout_ops[layout_type];
1199}
1200
1201static __be32
1202nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1203 struct nfsd4_compound_state *cstate,
1204 struct nfsd4_getdeviceinfo *gdp)
1205{
1206 const struct nfsd4_layout_ops *ops;
1207 struct nfsd4_deviceid_map *map;
1208 struct svc_export *exp;
1209 __be32 nfserr;
1210
1211 dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
1212 __func__,
1213 gdp->gd_layout_type,
1214 gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
1215 gdp->gd_maxcount);
1216
1217 map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
1218 if (!map) {
1219 dprintk("%s: couldn't find device ID to export mapping!\n",
1220 __func__);
1221 return nfserr_noent;
1222 }
1223
1224 exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
1225 if (IS_ERR(exp)) {
1226 dprintk("%s: could not find device id\n", __func__);
1227 return nfserr_noent;
1228 }
1229
1230 nfserr = nfserr_layoutunavailable;
1231 ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
1232 if (!ops)
1233 goto out;
1234
1235 nfserr = nfs_ok;
1236 if (gdp->gd_maxcount != 0)
1237 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
1238
1239 gdp->gd_notify_types &= ops->notify_types;
1240 exp_put(exp);
1241out:
1242 return nfserr;
1243}
1244
1245static __be32
1246nfsd4_layoutget(struct svc_rqst *rqstp,
1247 struct nfsd4_compound_state *cstate,
1248 struct nfsd4_layoutget *lgp)
1249{
1250 struct svc_fh *current_fh = &cstate->current_fh;
1251 const struct nfsd4_layout_ops *ops;
1252 struct nfs4_layout_stateid *ls;
1253 __be32 nfserr;
1254 int accmode;
1255
1256 switch (lgp->lg_seg.iomode) {
1257 case IOMODE_READ:
1258 accmode = NFSD_MAY_READ;
1259 break;
1260 case IOMODE_RW:
1261 accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
1262 break;
1263 default:
1264 dprintk("%s: invalid iomode %d\n",
1265 __func__, lgp->lg_seg.iomode);
1266 nfserr = nfserr_badiomode;
1267 goto out;
1268 }
1269
1270 nfserr = fh_verify(rqstp, current_fh, 0, accmode);
1271 if (nfserr)
1272 goto out;
1273
1274 nfserr = nfserr_layoutunavailable;
1275 ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
1276 if (!ops)
1277 goto out;
1278
1279 /*
1280 * Verify minlength and range as per RFC5661:
1281 * o If loga_length is less than loga_minlength,
1282 * the metadata server MUST return NFS4ERR_INVAL.
1283 * o If the sum of loga_offset and loga_minlength exceeds
1284 * NFS4_UINT64_MAX, and loga_minlength is not
1285 * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
1286 * o If the sum of loga_offset and loga_length exceeds
1287 * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
1288 * the error NFS4ERR_INVAL MUST result.
1289 */
1290 nfserr = nfserr_inval;
1291 if (lgp->lg_seg.length < lgp->lg_minlength ||
1292 (lgp->lg_minlength != NFS4_MAX_UINT64 &&
1293 lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
1294 (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
1295 lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
1296 goto out;
1297 if (lgp->lg_seg.length == 0)
1298 goto out;
1299
1300 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
1301 true, lgp->lg_layout_type, &ls);
1302 if (nfserr) {
1303 trace_layout_get_lookup_fail(&lgp->lg_sid);
1304 goto out;
1305 }
1306
1307 nfserr = nfserr_recallconflict;
1308 if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
1309 goto out_put_stid;
1310
1311 nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
1312 current_fh, lgp);
1313 if (nfserr)
1314 goto out_put_stid;
1315
1316 nfserr = nfsd4_insert_layout(lgp, ls);
1317
1318out_put_stid:
1319 nfs4_put_stid(&ls->ls_stid);
1320out:
1321 return nfserr;
1322}
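
Worked examples of the RFC 5661 range validation above, as a sketch:

	/*
	 *   offset = NFS4_MAX_UINT64 - 0xf, minlength = 0x20, length = 0x40:
	 *	minlength exceeds NFS4_MAX_UINT64 - offset -> NFS4ERR_INVAL;
	 *   length = 0: always rejected, a layout must cover something;
	 *   offset = 0, minlength = 4096, length = NFS4_MAX_UINT64:
	 *	valid "whole file" request; the all-ones length is exempt
	 *	from the overflow check by definition.
	 */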
1323
1324static __be32
1325nfsd4_layoutcommit(struct svc_rqst *rqstp,
1326 struct nfsd4_compound_state *cstate,
1327 struct nfsd4_layoutcommit *lcp)
1328{
1329 const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
1330 struct svc_fh *current_fh = &cstate->current_fh;
1331 const struct nfsd4_layout_ops *ops;
1332 loff_t new_size = lcp->lc_last_wr + 1;
1333 struct inode *inode;
1334 struct nfs4_layout_stateid *ls;
1335 __be32 nfserr;
1336
1337 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
1338 if (nfserr)
1339 goto out;
1340
1341 nfserr = nfserr_layoutunavailable;
1342 ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
1343 if (!ops)
1344 goto out;
1345 inode = current_fh->fh_dentry->d_inode;
1346
1347 nfserr = nfserr_inval;
1348 if (new_size <= seg->offset) {
1349 dprintk("pnfsd: last write before layout segment\n");
1350 goto out;
1351 }
1352 if (new_size > seg->offset + seg->length) {
1353 dprintk("pnfsd: last write beyond layout segment\n");
1354 goto out;
1355 }
1356 if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
1357 dprintk("pnfsd: layoutcommit beyond EOF\n");
1358 goto out;
1359 }
1360
1361 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
1362 false, lcp->lc_layout_type,
1363 &ls);
1364 if (nfserr) {
1365 trace_layout_commit_lookup_fail(&lcp->lc_sid);
1366 /* fixup error code as per RFC5661 */
1367 if (nfserr == nfserr_bad_stateid)
1368 nfserr = nfserr_badlayout;
1369 goto out;
1370 }
1371
1372 nfserr = ops->proc_layoutcommit(inode, lcp);
1373 if (nfserr)
1374 goto out_put_stid;
1375
1376 if (new_size > i_size_read(inode)) {
1377 lcp->lc_size_chg = 1;
1378 lcp->lc_newsize = new_size;
1379 } else {
1380 lcp->lc_size_chg = 0;
1381 }
1382
1383out_put_stid:
1384 nfs4_put_stid(&ls->ls_stid);
1385out:
1386 return nfserr;
1387}
1388
1389static __be32
1390nfsd4_layoutreturn(struct svc_rqst *rqstp,
1391 struct nfsd4_compound_state *cstate,
1392 struct nfsd4_layoutreturn *lrp)
1393{
1394 struct svc_fh *current_fh = &cstate->current_fh;
1395 __be32 nfserr;
1396
1397 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
1398 if (nfserr)
1399 goto out;
1400
1401 nfserr = nfserr_layoutunavailable;
1402 if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
1403 goto out;
1404
1405 switch (lrp->lr_seg.iomode) {
1406 case IOMODE_READ:
1407 case IOMODE_RW:
1408 case IOMODE_ANY:
1409 break;
1410 default:
1411 dprintk("%s: invalid iomode %d\n", __func__,
1412 lrp->lr_seg.iomode);
1413 nfserr = nfserr_inval;
1414 goto out;
1415 }
1416
1417 switch (lrp->lr_return_type) {
1418 case RETURN_FILE:
1419 nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
1420 break;
1421 case RETURN_FSID:
1422 case RETURN_ALL:
1423 nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
1424 break;
1425 default:
1426 dprintk("%s: invalid return_type %d\n", __func__,
1427 lrp->lr_return_type);
1428 nfserr = nfserr_inval;
1429 break;
1430 }
1431out:
1432 return nfserr;
1433}
1434#endif /* CONFIG_NFSD_PNFS */
1435
1181/* 1436/*
1182 * NULL call. 1437 * NULL call.
1183 */ 1438 */
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
1679 op_encode_channel_attrs_maxsz) * sizeof(__be32); 1934 op_encode_channel_attrs_maxsz) * sizeof(__be32);
1680} 1935}
1681 1936
1937#ifdef CONFIG_NFSD_PNFS
1938/*
1939 * At this stage we don't really know what layout driver will handle the request,
1940 * so we need to define an arbitrary upper bound here.
1941 */
1942#define MAX_LAYOUT_SIZE 128
1943static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1944{
1945 return (op_encode_hdr_size +
1946 1 /* logr_return_on_close */ +
1947 op_encode_stateid_maxsz +
1948 1 /* nr of layouts */ +
1949 MAX_LAYOUT_SIZE) * sizeof(__be32);
1950}
1951
1952static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1953{
1954 return (op_encode_hdr_size +
1955 1 /* locr_newsize */ +
1956 2 /* ns_size */) * sizeof(__be32);
1957}
1958
1959static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1960{
1961 return (op_encode_hdr_size +
1962 1 /* lrs_stateid */ +
1963 op_encode_stateid_maxsz) * sizeof(__be32);
1964}
1965#endif /* CONFIG_NFSD_PNFS */
1966
1682static struct nfsd4_operation nfsd4_ops[] = { 1967static struct nfsd4_operation nfsd4_ops[] = {
1683 [OP_ACCESS] = { 1968 [OP_ACCESS] = {
1684 .op_func = (nfsd4op_func)nfsd4_access, 1969 .op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
1966 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 2251 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1967 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 2252 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1968 }, 2253 },
2254#ifdef CONFIG_NFSD_PNFS
2255 [OP_GETDEVICEINFO] = {
2256 .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
2257 .op_flags = ALLOWED_WITHOUT_FH,
2258 .op_name = "OP_GETDEVICEINFO",
2259 },
2260 [OP_LAYOUTGET] = {
2261 .op_func = (nfsd4op_func)nfsd4_layoutget,
2262 .op_flags = OP_MODIFIES_SOMETHING,
2263 .op_name = "OP_LAYOUTGET",
2264 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
2265 },
2266 [OP_LAYOUTCOMMIT] = {
2267 .op_func = (nfsd4op_func)nfsd4_layoutcommit,
2268 .op_flags = OP_MODIFIES_SOMETHING,
2269 .op_name = "OP_LAYOUTCOMMIT",
2270 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
2271 },
2272 [OP_LAYOUTRETURN] = {
2273 .op_func = (nfsd4op_func)nfsd4_layoutreturn,
2274 .op_flags = OP_MODIFIES_SOMETHING,
2275 .op_name = "OP_LAYOUTRETURN",
2276 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
2277 },
2278#endif /* CONFIG_NFSD_PNFS */
1969 2279
1970 /* NFSv4.2 operations */ 2280 /* NFSv4.2 operations */
1971 [OP_ALLOCATE] = { 2281 [OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c06a1ba80d73..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
48#include "current_stateid.h" 48#include "current_stateid.h"
49 49
50#include "netns.h" 50#include "netns.h"
51#include "pnfs.h"
51 52
52#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
53 54
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
150 clp->cl_time = get_seconds(); 151 clp->cl_time = get_seconds();
151} 152}
152 153
153static inline void
154renew_client(struct nfs4_client *clp)
155{
156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
157
158 spin_lock(&nn->client_lock);
159 renew_client_locked(clp);
160 spin_unlock(&nn->client_lock);
161}
162
163static void put_client_renew_locked(struct nfs4_client *clp) 154static void put_client_renew_locked(struct nfs4_client *clp)
164{ 155{
165 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
282 kmem_cache_free(file_slab, fp); 273 kmem_cache_free(file_slab, fp);
283} 274}
284 275
285static inline void 276void
286put_nfs4_file(struct nfs4_file *fi) 277put_nfs4_file(struct nfs4_file *fi)
287{ 278{
288 might_lock(&state_lock); 279 might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
295 } 286 }
296} 287}
297 288
298static inline void
299get_nfs4_file(struct nfs4_file *fi)
300{
301 atomic_inc(&fi->fi_ref);
302}
303
304static struct file * 289static struct file *
305__nfs4_get_fd(struct nfs4_file *f, int oflag) 290__nfs4_get_fd(struct nfs4_file *f, int oflag)
306{ 291{
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
358 return ret; 343 return ret;
359} 344}
360 345
361static struct file * 346struct file *
362find_any_file(struct nfs4_file *f) 347find_any_file(struct nfs4_file *f)
363{ 348{
364 struct file *ret; 349 struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
408 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); 393 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
409} 394}
410 395
411static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
412{
413 return fh1->fh_size == fh2->fh_size &&
414 !memcmp(fh1->fh_base.fh_pad,
415 fh2->fh_base.fh_pad,
416 fh1->fh_size);
417}
418
419static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; 396static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
420 397
421static void 398static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
494 __nfs4_file_put_access(fp, O_RDONLY); 471 __nfs4_file_put_access(fp, O_RDONLY);
495} 472}
496 473
497static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 474struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
498 struct kmem_cache *slab) 475 struct kmem_cache *slab)
499{ 476{
500 struct nfs4_stid *stid; 477 struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
688 struct file *filp = NULL; 665 struct file *filp = NULL;
689 666
690 spin_lock(&fp->fi_lock); 667 spin_lock(&fp->fi_lock);
691 if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) 668 if (fp->fi_deleg_file && --fp->fi_delegees == 0)
692 swap(filp, fp->fi_deleg_file); 669 swap(filp, fp->fi_deleg_file);
693 spin_unlock(&fp->fi_lock); 670 spin_unlock(&fp->fi_lock);
694 671
695 if (filp) { 672 if (filp) {
696 vfs_setlease(filp, F_UNLCK, NULL, NULL); 673 vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
697 fput(filp); 674 fput(filp);
698 } 675 }
699} 676}
700 677
701static void unhash_stid(struct nfs4_stid *s) 678void nfs4_unhash_stid(struct nfs4_stid *s)
702{ 679{
703 s->sc_type = 0; 680 s->sc_type = 0;
704} 681}
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
1006 983
1007 list_del_init(&stp->st_locks); 984 list_del_init(&stp->st_locks);
1008 unhash_ol_stateid(stp); 985 unhash_ol_stateid(stp);
1009 unhash_stid(&stp->st_stid); 986 nfs4_unhash_stid(&stp->st_stid);
1010} 987}
1011 988
1012static void release_lock_stateid(struct nfs4_ol_stateid *stp) 989static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
1518static int 1495static int
1519STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) 1496STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
1520{ 1497{
1521 if (clid->cl_boot == nn->boot_time) 1498 /*
1499 * We're assuming the clid was not given out from a boot
1500 * precisely 2^32 (about 136 years) before this one. That seems
1501 * a safe assumption:
1502 */
1503 if (clid->cl_boot == (u32)nn->boot_time)
1522 return 0; 1504 return 0;
1523 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", 1505 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
1524 clid->cl_boot, clid->cl_id, nn->boot_time); 1506 clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1558 INIT_LIST_HEAD(&clp->cl_lru); 1540 INIT_LIST_HEAD(&clp->cl_lru);
1559 INIT_LIST_HEAD(&clp->cl_callbacks); 1541 INIT_LIST_HEAD(&clp->cl_callbacks);
1560 INIT_LIST_HEAD(&clp->cl_revoked); 1542 INIT_LIST_HEAD(&clp->cl_revoked);
1543#ifdef CONFIG_NFSD_PNFS
1544 INIT_LIST_HEAD(&clp->cl_lo_states);
1545#endif
1561 spin_lock_init(&clp->cl_lock); 1546 spin_lock_init(&clp->cl_lock);
1562 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1547 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1563 return clp; 1548 return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
1662 nfs4_get_stateowner(&oo->oo_owner); 1647 nfs4_get_stateowner(&oo->oo_owner);
1663 release_openowner(oo); 1648 release_openowner(oo);
1664 } 1649 }
1650 nfsd4_return_all_client_layouts(clp);
1665 nfsd4_shutdown_callback(clp); 1651 nfsd4_shutdown_callback(clp);
1666 if (clp->cl_cb_conn.cb_xprt) 1652 if (clp->cl_cb_conn.cb_xprt)
1667 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1653 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
2145static void 2131static void
2146nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) 2132nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
2147{ 2133{
2148 /* pNFS is not supported */ 2134#ifdef CONFIG_NFSD_PNFS
2135 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
2136#else
2149 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; 2137 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
2138#endif
2150 2139
2151 /* Referrals are supported, Migration is not. */ 2140 /* Referrals are supported, Migration is not. */
2152 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; 2141 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
3074 fp->fi_share_deny = 0; 3063 fp->fi_share_deny = 0;
3075 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3064 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
3076 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3065 memset(fp->fi_access, 0, sizeof(fp->fi_access));
3066#ifdef CONFIG_NFSD_PNFS
3067 INIT_LIST_HEAD(&fp->fi_lo_states);
3068 atomic_set(&fp->fi_lo_recalls, 0);
3069#endif
3077 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); 3070 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
3078} 3071}
3079 3072
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3300 struct nfs4_file *fp; 3293 struct nfs4_file *fp;
3301 3294
3302 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { 3295 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
3303 if (nfsd_fh_match(&fp->fi_fhandle, fh)) { 3296 if (fh_match(&fp->fi_fhandle, fh)) {
3304 if (atomic_inc_not_zero(&fp->fi_ref)) 3297 if (atomic_inc_not_zero(&fp->fi_ref))
3305 return fp; 3298 return fp;
3306 } 3299 }
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3308 return NULL; 3301 return NULL;
3309} 3302}
3310 3303
3311static struct nfs4_file * 3304struct nfs4_file *
3312find_file(struct knfsd_fh *fh) 3305find_file(struct knfsd_fh *fh)
3313{ 3306{
3314 struct nfs4_file *fp; 3307 struct nfs4_file *fp;
@@ -3477,7 +3470,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
3477} 3470}
3478 3471
3479static int 3472static int
3480nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) 3473nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
3474 struct list_head *dispose)
3481{ 3475{
3482 if (arg & F_UNLCK) 3476 if (arg & F_UNLCK)
3483 return lease_modify(onlist, arg, dispose); 3477 return lease_modify(onlist, arg, dispose);
@@ -3855,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3855 /* Race breaker */ 3849 /* Race breaker */
3856 if (fp->fi_deleg_file) { 3850 if (fp->fi_deleg_file) {
3857 status = 0; 3851 status = 0;
3858 atomic_inc(&fp->fi_delegees); 3852 ++fp->fi_delegees;
3859 hash_delegation_locked(dp, fp); 3853 hash_delegation_locked(dp, fp);
3860 goto out_unlock; 3854 goto out_unlock;
3861 } 3855 }
3862 fp->fi_deleg_file = filp; 3856 fp->fi_deleg_file = filp;
3863 atomic_set(&fp->fi_delegees, 1); 3857 fp->fi_delegees = 1;
3864 hash_delegation_locked(dp, fp); 3858 hash_delegation_locked(dp, fp);
3865 spin_unlock(&fp->fi_lock); 3859 spin_unlock(&fp->fi_lock);
3866 spin_unlock(&state_lock); 3860 spin_unlock(&state_lock);
@@ -3901,7 +3895,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
3901 status = -EAGAIN; 3895 status = -EAGAIN;
3902 goto out_unlock; 3896 goto out_unlock;
3903 } 3897 }
3904 atomic_inc(&fp->fi_delegees); 3898 ++fp->fi_delegees;
3905 hash_delegation_locked(dp, fp); 3899 hash_delegation_locked(dp, fp);
3906 status = 0; 3900 status = 0;
3907out_unlock: 3901out_unlock:
@@ -4294,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
4294 4288
4295static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4289static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
4296{ 4290{
4297 if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) 4291 if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
4298 return nfserr_bad_stateid; 4292 return nfserr_bad_stateid;
4299 return nfs_ok; 4293 return nfs_ok;
4300} 4294}
@@ -4445,7 +4439,7 @@ out_unlock:
4445 return status; 4439 return status;
4446} 4440}
4447 4441
4448static __be32 4442__be32
4449nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, 4443nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
4450 stateid_t *stateid, unsigned char typemask, 4444 stateid_t *stateid, unsigned char typemask,
4451 struct nfs4_stid **s, struct nfsd_net *nn) 4445 struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4859,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4859 update_stateid(&stp->st_stid.sc_stateid); 4853 update_stateid(&stp->st_stid.sc_stateid);
4860 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4854 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4861 4855
4856 nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
4857 stp->st_stid.sc_file);
4858
4862 nfsd4_close_open_stateid(stp); 4859 nfsd4_close_open_stateid(stp);
4863 4860
4864 /* put reference from nfs4_preprocess_seqid_op */ 4861 /* put reference from nfs4_preprocess_seqid_op */
@@ -5556,10 +5553,11 @@ out_nfserr:
5556static bool 5553static bool
5557check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) 5554check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5558{ 5555{
5559 struct file_lock **flpp; 5556 struct file_lock *fl;
5560 int status = false; 5557 int status = false;
5561 struct file *filp = find_any_file(fp); 5558 struct file *filp = find_any_file(fp);
5562 struct inode *inode; 5559 struct inode *inode;
5560 struct file_lock_context *flctx;
5563 5561
5564 if (!filp) { 5562 if (!filp) {
5565 /* Any valid lock stateid should have some sort of access */ 5563 /* Any valid lock stateid should have some sort of access */
@@ -5568,15 +5566,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5568 } 5566 }
5569 5567
5570 inode = file_inode(filp); 5568 inode = file_inode(filp);
5569 flctx = inode->i_flctx;
5571 5570
5572 spin_lock(&inode->i_lock); 5571 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
5573 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 5572 spin_lock(&flctx->flc_lock);
5574 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 5573 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
5575 status = true; 5574 if (fl->fl_owner == (fl_owner_t)lowner) {
5576 break; 5575 status = true;
5576 break;
5577 }
5577 } 5578 }
5579 spin_unlock(&flctx->flc_lock);
5578 } 5580 }
5579 spin_unlock(&inode->i_lock);
5580 fput(filp); 5581 fput(filp);
5581 return status; 5582 return status;
5582} 5583}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
47#include "state.h" 47#include "state.h"
48#include "cache.h" 48#include "cache.h"
49#include "netns.h" 49#include "netns.h"
50#include "pnfs.h"
50 51
51#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 52#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
52#include <linux/security.h> 53#include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
234 return ret; 235 return ret;
235} 236}
236 237
238/*
239 * The high 32 bits of 'seconds' should be 0 per the protocol; 'nseconds'
240 * is validated and must be below 10^9 or we return NFS4ERR_INVAL.
241 */
242static __be32
243nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
244{
245 DECODE_HEAD;
246 u64 sec;
247
248 READ_BUF(12);
249 p = xdr_decode_hyper(p, &sec);
250 tv->tv_sec = sec;
251 tv->tv_nsec = be32_to_cpup(p++);
252 if (tv->tv_nsec >= (u32)1000000000)
253 return nfserr_inval;
254
255 DECODE_TAIL;
256}
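
For reference, the nfstime4 wire format this helper consumes (12 bytes), as a sketch:

	/*
	 *	bytes 0..7  : seconds   (XDR hyper)
	 *	bytes 8..11 : nseconds  (XDR uint32)
	 * READ_BUF(12) guarantees both fields are present before decoding,
	 * and nseconds >= 10^9 fails the operation with NFS4ERR_INVAL.
	 */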
257
237static __be32 258static __be32
238nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 259nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
239{ 260{
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
267{ 288{
268 int expected_len, len = 0; 289 int expected_len, len = 0;
269 u32 dummy32; 290 u32 dummy32;
270 u64 sec;
271 char *buf; 291 char *buf;
272 292
273 DECODE_HEAD; 293 DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
358 dummy32 = be32_to_cpup(p++); 378 dummy32 = be32_to_cpup(p++);
359 switch (dummy32) { 379 switch (dummy32) {
360 case NFS4_SET_TO_CLIENT_TIME: 380 case NFS4_SET_TO_CLIENT_TIME:
361 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
362 all 32 bits of 'nseconds'. */
363 READ_BUF(12);
364 len += 12; 381 len += 12;
365 p = xdr_decode_hyper(p, &sec); 382 status = nfsd4_decode_time(argp, &iattr->ia_atime);
366 iattr->ia_atime.tv_sec = (time_t)sec; 383 if (status)
367 iattr->ia_atime.tv_nsec = be32_to_cpup(p++); 384 return status;
368 if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
369 return nfserr_inval;
370 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); 385 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
371 break; 386 break;
372 case NFS4_SET_TO_SERVER_TIME: 387 case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
382 dummy32 = be32_to_cpup(p++); 397 dummy32 = be32_to_cpup(p++);
383 switch (dummy32) { 398 switch (dummy32) {
384 case NFS4_SET_TO_CLIENT_TIME: 399 case NFS4_SET_TO_CLIENT_TIME:
385 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
386 all 32 bits of 'nseconds'. */
387 READ_BUF(12);
388 len += 12; 400 len += 12;
389 p = xdr_decode_hyper(p, &sec); 401 status = nfsd4_decode_time(argp, &iattr->ia_mtime);
390 iattr->ia_mtime.tv_sec = sec; 402 if (status)
391 iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); 403 return status;
392 if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
393 return nfserr_inval;
394 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); 404 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
395 break; 405 break;
396 case NFS4_SET_TO_SERVER_TIME: 406 case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1513 DECODE_TAIL; 1523 DECODE_TAIL;
1514} 1524}
1515 1525
1526#ifdef CONFIG_NFSD_PNFS
1527static __be32
1528nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
1529 struct nfsd4_getdeviceinfo *gdev)
1530{
1531 DECODE_HEAD;
1532 u32 num, i;
1533
1534 READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
1535 COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
1536 gdev->gd_layout_type = be32_to_cpup(p++);
1537 gdev->gd_maxcount = be32_to_cpup(p++);
1538 num = be32_to_cpup(p++);
1539 if (num) {
1540 READ_BUF(4 * num);
1541 gdev->gd_notify_types = be32_to_cpup(p++);
1542 for (i = 1; i < num; i++) {
1543 if (be32_to_cpup(p++)) {
1544 status = nfserr_inval;
1545 goto out;
1546 }
1547 }
1548 }
1549 DECODE_TAIL;
1550}
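
A note on the notification bitmap handling above, with the field name taken from RFC 5661:

	/*
	 * gdia_notify_types is an XDR bitmap4, i.e. a counted array of
	 * uint32 words. Only word 0 is meaningful to this server, so it is
	 * saved in gd_notify_types; any set bit in words 1..num-1 makes the
	 * request fail with NFS4ERR_INVAL.
	 */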
1551
1552static __be32
1553nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
1554 struct nfsd4_layoutget *lgp)
1555{
1556 DECODE_HEAD;
1557
1558 READ_BUF(36);
1559 lgp->lg_signal = be32_to_cpup(p++);
1560 lgp->lg_layout_type = be32_to_cpup(p++);
1561 lgp->lg_seg.iomode = be32_to_cpup(p++);
1562 p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
1563 p = xdr_decode_hyper(p, &lgp->lg_seg.length);
1564 p = xdr_decode_hyper(p, &lgp->lg_minlength);
1565 status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
1566 if (status)
1567 return status;
1568 READ_BUF(4);
1569 lgp->lg_maxcount = be32_to_cpup(p++);
1570
1571 DECODE_TAIL;
1572}
1571
1572static __be32
1573nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
1574 struct nfsd4_layoutcommit *lcp)
1575{
1576 DECODE_HEAD;
1577 u32 timechange;
1578
1579 READ_BUF(20);
1580 p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
1581 p = xdr_decode_hyper(p, &lcp->lc_seg.length);
1582 lcp->lc_reclaim = be32_to_cpup(p++);
1583 status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
1584 if (status)
1585 return status;
1586 READ_BUF(4);
1587 lcp->lc_newoffset = be32_to_cpup(p++);
1588 if (lcp->lc_newoffset) {
1589 READ_BUF(8);
1590 p = xdr_decode_hyper(p, &lcp->lc_last_wr);
1591 } else
1592 lcp->lc_last_wr = 0;
1593 READ_BUF(4);
1594 timechange = be32_to_cpup(p++);
1595 if (timechange) {
1596 status = nfsd4_decode_time(argp, &lcp->lc_mtime);
1597 if (status)
1598 return status;
1599 } else {
1600 lcp->lc_mtime.tv_nsec = UTIME_NOW;
1601 }
1602 READ_BUF(8);
1603 lcp->lc_layout_type = be32_to_cpup(p++);
1604
1605 /*
1606 * Save the layout update in XDR format and let the layout driver deal
1607 * with it later.
1608 */
1609 lcp->lc_up_len = be32_to_cpup(p++);
1610 if (lcp->lc_up_len > 0) {
1611 READ_BUF(lcp->lc_up_len);
1612 READMEM(lcp->lc_up_layout, lcp->lc_up_len);
1613 }
1614
1615 DECODE_TAIL;
1616}
1615
1616static __be32
1617nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
1618 struct nfsd4_layoutreturn *lrp)
1619{
1620 DECODE_HEAD;
1621
1622 READ_BUF(16);
1623 lrp->lr_reclaim = be32_to_cpup(p++);
1624 lrp->lr_layout_type = be32_to_cpup(p++);
1625 lrp->lr_seg.iomode = be32_to_cpup(p++);
1626 lrp->lr_return_type = be32_to_cpup(p++);
1627 if (lrp->lr_return_type == RETURN_FILE) {
1628 READ_BUF(16);
1629 p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
1630 p = xdr_decode_hyper(p, &lrp->lr_seg.length);
1631 status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
1632 if (status)
1633 return status;
1634 READ_BUF(4);
1635 lrp->lrf_body_len = be32_to_cpup(p++);
1636 if (lrp->lrf_body_len > 0) {
1637 READ_BUF(lrp->lrf_body_len);
1638 READMEM(lrp->lrf_body, lrp->lrf_body_len);
1639 }
1640 } else {
1641 lrp->lr_seg.offset = 0;
1642 lrp->lr_seg.length = NFS4_MAX_UINT64;
1643 }
1644
1645 DECODE_TAIL;
1646}
1645#endif /* CONFIG_NFSD_PNFS */
1646
1516static __be32 1647static __be32
1517nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, 1648nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
1518 struct nfsd4_fallocate *fallocate) 1649 struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1607 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1738 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1608 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, 1739 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
1609 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1740 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1741#ifdef CONFIG_NFSD_PNFS
1742 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
1743 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1744 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
1745 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
1746 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
1747#else
1610 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, 1748 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, 1749 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1750 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1751 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1752 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1753#endif
1615 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, 1754 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1616 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1755 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1617 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1756 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
2539 get_parent_attributes(exp, &stat); 2678 get_parent_attributes(exp, &stat);
2540 p = xdr_encode_hyper(p, stat.ino); 2679 p = xdr_encode_hyper(p, stat.ino);
2541 } 2680 }
2681#ifdef CONFIG_NFSD_PNFS
2682 if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
2683 (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
2684 if (exp->ex_layout_type) {
2685 p = xdr_reserve_space(xdr, 8);
2686 if (!p)
2687 goto out_resource;
2688 *p++ = cpu_to_be32(1);
2689 *p++ = cpu_to_be32(exp->ex_layout_type);
2690 } else {
2691 p = xdr_reserve_space(xdr, 4);
2692 if (!p)
2693 goto out_resource;
2694 *p++ = cpu_to_be32(0);
2695 }
2696 }
2697
2698 if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
2699 p = xdr_reserve_space(xdr, 4);
2700 if (!p)
2701 goto out_resource;
2702 *p++ = cpu_to_be32(stat.blksize);
2703 }
2704#endif /* CONFIG_NFSD_PNFS */
2542 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { 2705 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2543 status = nfsd4_encode_security_label(xdr, rqstp, context, 2706 status = nfsd4_encode_security_label(xdr, rqstp, context,
2544 contextlen); 2707 contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2768 if (entry_bytes > cd->rd_maxcount) 2931 if (entry_bytes > cd->rd_maxcount)
2769 goto fail; 2932 goto fail;
2770 cd->rd_maxcount -= entry_bytes; 2933 cd->rd_maxcount -= entry_bytes;
2771 if (!cd->rd_dircount)
2772 goto fail;
2773 /* 2934 /*
2774 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so 2935 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2775 * let's always let through the first entry, at least: 2936 * let's always let through the first entry, at least:
2776 */ 2937 */
2777 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; 2938 if (!cd->rd_dircount)
2939 goto fail;
2940 name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
2778 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) 2941 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2779 goto fail; 2942 goto fail;
2780 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); 2943 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2944
2781 cd->cookie_offset = cookie_offset; 2945 cd->cookie_offset = cookie_offset;
2782skip_entry: 2946skip_entry:
2783 cd->common.err = nfs_ok; 2947 cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3814 return nfserr; 3978 return nfserr;
3815} 3979}
3816 3980
3981#ifdef CONFIG_NFSD_PNFS
3982static __be32
3983nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
3984 struct nfsd4_getdeviceinfo *gdev)
3985{
3986 struct xdr_stream *xdr = &resp->xdr;
3987 const struct nfsd4_layout_ops *ops =
3988 nfsd4_layout_ops[gdev->gd_layout_type];
3989 u32 starting_len = xdr->buf->len, needed_len;
3990 __be32 *p;
3991
3992 dprintk("%s: err %d\n", __func__, nfserr);
3993 if (nfserr)
3994 goto out;
3995
3996 nfserr = nfserr_resource;
3997 p = xdr_reserve_space(xdr, 4);
3998 if (!p)
3999 goto out;
4000
4001 *p++ = cpu_to_be32(gdev->gd_layout_type);
4002
4003 /* If maxcount is 0 then just update notifications */
4004 if (gdev->gd_maxcount != 0) {
4005 nfserr = ops->encode_getdeviceinfo(xdr, gdev);
4006 if (nfserr) {
4007 /*
4008 * We don't bother to burden the layout drivers with
4009 * enforcing gd_maxcount, just tell the client to
4010 * come back with a bigger buffer if it's not enough.
4011 */
4012 if (xdr->buf->len + 4 > gdev->gd_maxcount)
4013 goto toosmall;
4014 goto out;
4015 }
4016 }
4017
4018 nfserr = nfserr_resource;
4019 if (gdev->gd_notify_types) {
4020 p = xdr_reserve_space(xdr, 4 + 4);
4021 if (!p)
4022 goto out;
4023 *p++ = cpu_to_be32(1); /* bitmap length */
4024 *p++ = cpu_to_be32(gdev->gd_notify_types);
4025 } else {
4026 p = xdr_reserve_space(xdr, 4);
4027 if (!p)
4028 goto out;
4029 *p++ = 0;
4030 }
4031
4032 nfserr = 0;
4033out:
4034 kfree(gdev->gd_device);
4035 dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
4036 return nfserr;
4037
4038toosmall:
4039 dprintk("%s: maxcount too small\n", __func__);
4040 needed_len = xdr->buf->len + 4 /* notifications */;
4041 xdr_truncate_encode(xdr, starting_len);
4042 p = xdr_reserve_space(xdr, 4);
4043 if (!p) {
4044 nfserr = nfserr_resource;
4045 } else {
4046 *p++ = cpu_to_be32(needed_len);
4047 nfserr = nfserr_toosmall;
4048 }
4049 goto out;
4050}
4051
4052static __be32
4053nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
4054 struct nfsd4_layoutget *lgp)
4055{
4056 struct xdr_stream *xdr = &resp->xdr;
4057 const struct nfsd4_layout_ops *ops =
4058 nfsd4_layout_ops[lgp->lg_layout_type];
4059 __be32 *p;
4060
4061 dprintk("%s: err %d\n", __func__, nfserr);
4062 if (nfserr)
4063 goto out;
4064
4065 nfserr = nfserr_resource;
4066 p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
4067 if (!p)
4068 goto out;
4069
4070 *p++ = cpu_to_be32(1); /* we always set return-on-close */
4071 *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
4072 p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
4073 sizeof(stateid_opaque_t));
4074
4075 *p++ = cpu_to_be32(1); /* we always return a single layout */
4076 p = xdr_encode_hyper(p, lgp->lg_seg.offset);
4077 p = xdr_encode_hyper(p, lgp->lg_seg.length);
4078 *p++ = cpu_to_be32(lgp->lg_seg.iomode);
4079 *p++ = cpu_to_be32(lgp->lg_layout_type);
4080
4081 nfserr = ops->encode_layoutget(xdr, lgp);
4082out:
4083 kfree(lgp->lg_content);
4084 return nfserr;
4085}
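
The 36-byte reservation above covers every fixed-size field of LAYOUTGET4resok except the stateid's opaque part; a sketch of the arithmetic, assuming stateid_opaque_t is the usual 12 bytes:

	/*
	 *   4   logr_return_on_close
	 *   4   stateid seqid
	 *  12   stateid opaque (sizeof(stateid_opaque_t))
	 *   4   layout array count (always 1 here)
	 *   8   lo_offset
	 *   8   lo_length
	 *   4   lo_iomode
	 *   4   layout content type
	 *  ----
	 *  48  = 36 + sizeof(stateid_opaque_t); the opaque layout body
	 *	itself is appended by the layout driver's encode_layoutget.
	 */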
4086
4087static __be32
4088nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
4089 struct nfsd4_layoutcommit *lcp)
4090{
4091 struct xdr_stream *xdr = &resp->xdr;
4092 __be32 *p;
4093
4094 if (nfserr)
4095 return nfserr;
4096
4097 p = xdr_reserve_space(xdr, 4);
4098 if (!p)
4099 return nfserr_resource;
4100 *p++ = cpu_to_be32(lcp->lc_size_chg);
4101 if (lcp->lc_size_chg) {
4102 p = xdr_reserve_space(xdr, 8);
4103 if (!p)
4104 return nfserr_resource;
4105 p = xdr_encode_hyper(p, lcp->lc_newsize);
4106 }
4107
4108 return nfs_ok;
4109}
4110
4111static __be32
4112nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
4113 struct nfsd4_layoutreturn *lrp)
4114{
4115 struct xdr_stream *xdr = &resp->xdr;
4116 __be32 *p;
4117
4118 if (nfserr)
4119 return nfserr;
4120
4121 p = xdr_reserve_space(xdr, 4);
4122 if (!p)
4123 return nfserr_resource;
4124 *p++ = cpu_to_be32(lrp->lrs_present);
4125 if (lrp->lrs_present)
4126 nfsd4_encode_stateid(xdr, &lrp->lr_sid);
4127 return nfs_ok;
4128}
4129#endif /* CONFIG_NFSD_PNFS */
4130
3817static __be32 4131static __be32
3818nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, 4132nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3819 struct nfsd4_seek *seek) 4133 struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3890 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 4204 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 4205 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 4206 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
4207#ifdef CONFIG_NFSD_PNFS
4208 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
4209 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
4210 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
4211 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
4212 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
4213#else
3893 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 4214 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 4215 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 4216 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 4217 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 4218 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
4219#endif
3898 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, 4220 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3899 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 4221 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3900 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 4222 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
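
For context, nfsd4_encode_operation() dispatches through this table by opcode, roughly (a simplification, not a quote from the surrounding file):

	nfsd4_enc encoder = nfsd4_enc_ops[op->opnum];
	nfserr = encoder(resp, op->status, &op->u);

so with CONFIG_NFSD_PNFS unset the five pNFS opcodes still resolve, but to nfsd4_encode_noop.
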
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
21#include "cache.h" 21#include "cache.h"
22#include "state.h" 22#include "state.h"
23#include "netns.h" 23#include "netns.h"
24#include "pnfs.h"
24 25
25/* 26/*
26 * We have a single directory with several nodes in it. 27 * We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
1258 retval = nfsd4_init_slabs(); 1259 retval = nfsd4_init_slabs();
1259 if (retval) 1260 if (retval)
1260 goto out_unregister_pernet; 1261 goto out_unregister_pernet;
1261 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ 1262 retval = nfsd4_init_pnfs();
1262 if (retval) 1263 if (retval)
1263 goto out_free_slabs; 1264 goto out_free_slabs;
1265 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
1266 if (retval)
1267 goto out_exit_pnfs;
1264 nfsd_stat_init(); /* Statistics */ 1268 nfsd_stat_init(); /* Statistics */
1265 retval = nfsd_reply_cache_init(); 1269 retval = nfsd_reply_cache_init();
1266 if (retval) 1270 if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
1282out_free_stat: 1286out_free_stat:
1283 nfsd_stat_shutdown(); 1287 nfsd_stat_shutdown();
1284 nfsd_fault_inject_cleanup(); 1288 nfsd_fault_inject_cleanup();
1289out_exit_pnfs:
1290 nfsd4_exit_pnfs();
1285out_free_slabs: 1291out_free_slabs:
1286 nfsd4_free_slabs(); 1292 nfsd4_free_slabs();
1287out_unregister_pernet: 1293out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
1299 nfsd_stat_shutdown(); 1305 nfsd_stat_shutdown();
1300 nfsd_lockd_shutdown(); 1306 nfsd_lockd_shutdown();
1301 nfsd4_free_slabs(); 1307 nfsd4_free_slabs();
1308 nfsd4_exit_pnfs();
1302 nfsd_fault_inject_cleanup(); 1309 nfsd_fault_inject_cleanup();
1303 unregister_filesystem(&nfsd_fs_type); 1310 unregister_filesystem(&nfsd_fs_type);
1304 unregister_pernet_subsys(&nfsd_net_ops); 1311 unregister_pernet_subsys(&nfsd_net_ops);
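
Note how the unwind labels keep strict LIFO order with the init calls; reduced to the pattern this hunk preserves (same lines as the diff above, numbering stripped):

	retval = nfsd4_init_pnfs();
	if (retval)
		goto out_free_slabs;	/* undo only earlier steps */
	retval = nfsd_fault_inject_init();
	if (retval)
		goto out_exit_pnfs;	/* pnfs now needs teardown too */
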
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void);
325 325
326#define NFSD4_SUPPORTED_ATTRS_WORD2 0 326#define NFSD4_SUPPORTED_ATTRS_WORD2 0
327 327
328/* 4.1 */
329#ifdef CONFIG_NFSD_PNFS
330#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
331#define PNFSD_SUPPORTED_ATTRS_WORD2 \
332(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
333#else
334#define PNFSD_SUPPORTED_ATTRS_WORD1 0
335#define PNFSD_SUPPORTED_ATTRS_WORD2 0
336#endif /* CONFIG_NFSD_PNFS */
337
328#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ 338#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
329 NFSD4_SUPPORTED_ATTRS_WORD0 339 NFSD4_SUPPORTED_ATTRS_WORD0
330 340
331#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ 341#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
332 NFSD4_SUPPORTED_ATTRS_WORD1 342 (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
333 343
334#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 344#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
335 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 345 (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
346 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
336 347
348/* 4.2 */
337#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 349#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
338#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL 350#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
339#else 351#else
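
With CONFIG_NFSD_PNFS=y the 4.1 attribute words above therefore expand to

	NFSD4_1_SUPPORTED_ATTRS_WORD1
		= NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES
	NFSD4_1_SUPPORTED_ATTRS_WORD2
		= NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_LAYOUT_BLKSIZE |
		  FATTR4_WORD2_LAYOUT_TYPES | FATTR4_WORD2_SUPPATTR_EXCLCREAT

and collapse back to the plain 4.0 values when it is unset.
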
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..f22920442172 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
187 return fhp; 187 return fhp;
188} 188}
189 189
190static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
191{
192 if (fh1->fh_size != fh2->fh_size)
193 return false;
194 if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
195 return false;
196 return true;
197}
198
199static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
200{
201 if (fh1->fh_fsid_type != fh2->fh_fsid_type)
202 return false;
203 if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
204 return false;
205 return true;
206}
207
190#ifdef CONFIG_NFSD_V3 208#ifdef CONFIG_NFSD_V3
191/* 209/*
192 * The wcc data stored in current_fh should be cleared 210 * The wcc data stored in current_fh should be cleared
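
A hedged sketch of how fh_match() is meant to be used (fp and fh are assumed locals in a hash-bucket walk, mirroring find_file() in nfs4state.c):

	/* fh is the struct knfsd_fh * we are looking for */
	if (fh_match(&fp->fi_fhandle, fh)) {
		get_nfs4_file(fp);
		return fp;
	}
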
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program nfsd_program = {
119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { 119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
120 [0] = 1, 120 [0] = 1,
121 [1] = 1, 121 [1] = 1,
122 [2] = 1,
122}; 123};
123 124
124int nfsd_vers(int vers, enum vers_op change) 125int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..d4c4453674c6
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,86 @@
1#ifndef _FS_NFSD_PNFS_H
2#define _FS_NFSD_PNFS_H 1
3
4#ifdef CONFIG_NFSD_V4
5#include <linux/exportfs.h>
6#include <linux/nfsd/export.h>
7
8#include "state.h"
9#include "xdr4.h"
10
11struct xdr_stream;
12
13struct nfsd4_deviceid_map {
14 struct list_head hash;
15 u64 idx;
16 int fsid_type;
17 u32 fsid[];
18};
19
20struct nfsd4_layout_ops {
21 u32 notify_types;
22
23 __be32 (*proc_getdeviceinfo)(struct super_block *sb,
24 struct nfsd4_getdeviceinfo *gdevp);
25 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
26 struct nfsd4_getdeviceinfo *gdevp);
27
28 __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
29 struct nfsd4_layoutget *lgp);
30 __be32 (*encode_layoutget)(struct xdr_stream *,
31 struct nfsd4_layoutget *lgp);
32
33 __be32 (*proc_layoutcommit)(struct inode *inode,
34 struct nfsd4_layoutcommit *lcp);
35};
36
37extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
38extern const struct nfsd4_layout_ops bl_layout_ops;
39
40__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
41 struct nfsd4_compound_state *cstate, stateid_t *stateid,
42 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
43__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
44 struct nfs4_layout_stateid *ls);
45__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
46 struct nfsd4_compound_state *cstate,
47 struct nfsd4_layoutreturn *lrp);
48__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
49 struct nfsd4_compound_state *cstate,
50 struct nfsd4_layoutreturn *lrp);
51int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
52 u32 device_generation);
53struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
54#endif /* CONFIG_NFSD_V4 */
55
56#ifdef CONFIG_NFSD_PNFS
57void nfsd4_setup_layout_type(struct svc_export *exp);
58void nfsd4_return_all_client_layouts(struct nfs4_client *);
59void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
60 struct nfs4_file *fp);
61int nfsd4_init_pnfs(void);
62void nfsd4_exit_pnfs(void);
63#else
64struct nfs4_client;
65struct nfs4_file;
66
67static inline void nfsd4_setup_layout_type(struct svc_export *exp)
68{
69}
70
71static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
72{
73}
74static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
75 struct nfs4_file *fp)
76{
77}
78static inline void nfsd4_exit_pnfs(void)
79{
80}
81static inline int nfsd4_init_pnfs(void)
82{
83 return 0;
84}
85#endif /* CONFIG_NFSD_PNFS */
86#endif /* _FS_NFSD_PNFS_H */
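
A layout driver plugs in by filling one of these ops tables; the block layout driver added elsewhere in this series does so for its layout type. An illustrative initializer (member values are a sketch, not quoted from the driver):

const struct nfsd4_layout_ops bl_layout_ops = {
	.notify_types		= NOTIFY_DEVICEID4_DELETE |
				  NOTIFY_DEVICEID4_CHANGE,
	.proc_getdeviceinfo	= nfsd4_block_proc_getdeviceinfo,
	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
	.proc_layoutget		= nfsd4_block_proc_layoutget,
	.encode_layoutget	= nfsd4_block_encode_layoutget,
	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
};
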
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
92/* For a deleg stateid kept around only to process free_stateid's: */ 92/* For a deleg stateid kept around only to process free_stateid's: */
93#define NFS4_REVOKED_DELEG_STID 16 93#define NFS4_REVOKED_DELEG_STID 16
94#define NFS4_CLOSED_DELEG_STID 32 94#define NFS4_CLOSED_DELEG_STID 32
95#define NFS4_LAYOUT_STID 64
95 unsigned char sc_type; 96 unsigned char sc_type;
96 stateid_t sc_stateid; 97 stateid_t sc_stateid;
97 struct nfs4_client *sc_client; 98 struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
297 struct list_head cl_delegations; 298 struct list_head cl_delegations;
298 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ 299 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
299 struct list_head cl_lru; /* tail queue */ 300 struct list_head cl_lru; /* tail queue */
301#ifdef CONFIG_NFSD_PNFS
302 struct list_head cl_lo_states; /* outstanding layout states */
303#endif
300 struct xdr_netobj cl_name; /* id generated by client */ 304 struct xdr_netobj cl_name; /* id generated by client */
301 nfs4_verifier cl_verifier; /* generated by client */ 305 nfs4_verifier cl_verifier; /* generated by client */
302 time_t cl_time; /* time of last lease renewal */ 306 time_t cl_time; /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
493 atomic_t fi_access[2]; 497 atomic_t fi_access[2];
494 u32 fi_share_deny; 498 u32 fi_share_deny;
495 struct file *fi_deleg_file; 499 struct file *fi_deleg_file;
496 atomic_t fi_delegees; 500 int fi_delegees;
497 struct knfsd_fh fi_fhandle; 501 struct knfsd_fh fi_fhandle;
498 bool fi_had_conflict; 502 bool fi_had_conflict;
503#ifdef CONFIG_NFSD_PNFS
504 struct list_head fi_lo_states;
505 atomic_t fi_lo_recalls;
506#endif
499}; 507};
500 508
501/* 509/*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
528 return container_of(s, struct nfs4_ol_stateid, st_stid); 536 return container_of(s, struct nfs4_ol_stateid, st_stid);
529} 537}
530 538
539struct nfs4_layout_stateid {
540 struct nfs4_stid ls_stid;
541 struct list_head ls_perclnt;
542 struct list_head ls_perfile;
543 spinlock_t ls_lock;
544 struct list_head ls_layouts;
545 u32 ls_layout_type;
546 struct file *ls_file;
547 struct nfsd4_callback ls_recall;
548 stateid_t ls_recall_sid;
549 bool ls_recalled;
550};
551
552static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
553{
554 return container_of(s, struct nfs4_layout_stateid, ls_stid);
555}
556
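
layoutstateid() mirrors openlockstateid() above; a typical call site (hypothetical, but matching how typemask lookups are used):

	struct nfs4_stid *s;
	struct nfs4_layout_stateid *ls;

	/* nfsd4_lookup_stateid(..., NFS4_LAYOUT_STID, &s, nn) checked sc_type */
	ls = layoutstateid(s);
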
531/* flags for preprocess_seqid_op() */ 557/* flags for preprocess_seqid_op() */
532#define RD_STATE 0x00000010 558#define RD_STATE 0x00000010
533#define WR_STATE 0x00000020 559#define WR_STATE 0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
535enum nfsd4_cb_op { 561enum nfsd4_cb_op {
536 NFSPROC4_CLNT_CB_NULL = 0, 562 NFSPROC4_CLNT_CB_NULL = 0,
537 NFSPROC4_CLNT_CB_RECALL, 563 NFSPROC4_CLNT_CB_RECALL,
564 NFSPROC4_CLNT_CB_LAYOUT,
538 NFSPROC4_CLNT_CB_SEQUENCE, 565 NFSPROC4_CLNT_CB_SEQUENCE,
539}; 566};
540 567
@@ -545,6 +572,12 @@ struct nfsd_net;
545extern __be32 nfs4_preprocess_stateid_op(struct net *net, 572extern __be32 nfs4_preprocess_stateid_op(struct net *net,
546 struct nfsd4_compound_state *cstate, 573 struct nfsd4_compound_state *cstate,
547 stateid_t *stateid, int flags, struct file **filp); 574 stateid_t *stateid, int flags, struct file **filp);
575__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
576 stateid_t *stateid, unsigned char typemask,
577 struct nfs4_stid **s, struct nfsd_net *nn);
578struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
579 struct kmem_cache *slab);
580void nfs4_unhash_stid(struct nfs4_stid *s);
548void nfs4_put_stid(struct nfs4_stid *s); 581void nfs4_put_stid(struct nfs4_stid *s);
549void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); 582void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
550extern void nfs4_release_reclaim(struct nfsd_net *); 583extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
567 struct nfsd_net *nn); 600 struct nfsd_net *nn);
568extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 601extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
569 602
603struct nfs4_file *find_file(struct knfsd_fh *fh);
604void put_nfs4_file(struct nfs4_file *fi);
605static inline void get_nfs4_file(struct nfs4_file *fi)
606{
607 atomic_inc(&fi->fi_ref);
608}
609struct file *find_any_file(struct nfs4_file *f);
610
570/* grace period management */ 611/* grace period management */
571void nfsd4_end_grace(struct nfsd_net *nn); 612void nfsd4_end_grace(struct nfsd_net *nn);
572 613
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
1
2#include "state.h"
3
4#define CREATE_TRACE_POINTS
5#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM nfsd
6
7#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _NFSD_TRACE_H
9
10#include <linux/tracepoint.h>
11
12DECLARE_EVENT_CLASS(nfsd_stateid_class,
13 TP_PROTO(stateid_t *stp),
14 TP_ARGS(stp),
15 TP_STRUCT__entry(
16 __field(u32, cl_boot)
17 __field(u32, cl_id)
18 __field(u32, si_id)
19 __field(u32, si_generation)
20 ),
21 TP_fast_assign(
22 __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
23 __entry->cl_id = stp->si_opaque.so_clid.cl_id;
24 __entry->si_id = stp->si_opaque.so_id;
25 __entry->si_generation = stp->si_generation;
26 ),
27 TP_printk("client %08x:%08x stateid %08x:%08x",
28 __entry->cl_boot,
29 __entry->cl_id,
30 __entry->si_id,
31 __entry->si_generation)
32)
33
34#define DEFINE_STATEID_EVENT(name) \
35DEFINE_EVENT(nfsd_stateid_class, name, \
36 TP_PROTO(stateid_t *stp), \
37 TP_ARGS(stp))
38DEFINE_STATEID_EVENT(layoutstate_alloc);
39DEFINE_STATEID_EVENT(layoutstate_unhash);
40DEFINE_STATEID_EVENT(layoutstate_free);
41DEFINE_STATEID_EVENT(layout_get_lookup_fail);
42DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
43DEFINE_STATEID_EVENT(layout_return_lookup_fail);
44DEFINE_STATEID_EVENT(layout_recall);
45DEFINE_STATEID_EVENT(layout_recall_done);
46DEFINE_STATEID_EVENT(layout_recall_fail);
47DEFINE_STATEID_EVENT(layout_recall_release);
48
49#endif /* _NFSD_TRACE_H */
50
51#undef TRACE_INCLUDE_PATH
52#define TRACE_INCLUDE_PATH .
53#define TRACE_INCLUDE_FILE trace
54#include <trace/define_trace.h>
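
Each DEFINE_STATEID_EVENT above generates a trace_<name>() helper taking a stateid_t *, so the layout code can fire, for example:

	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
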
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
428 u32 rca_one_fs; 428 u32 rca_one_fs;
429}; 429};
430 430
431struct nfsd4_deviceid {
432 u64 fsid_idx;
433 u32 generation;
434 u32 pad;
435};
436
437struct nfsd4_layout_seg {
438 u32 iomode;
439 u64 offset;
440 u64 length;
441};
442
443struct nfsd4_getdeviceinfo {
444 struct nfsd4_deviceid gd_devid; /* request */
445 u32 gd_layout_type; /* request */
446 u32 gd_maxcount; /* request */
447 u32 gd_notify_types;/* request - response */
448 void *gd_device; /* response */
449};
450
451struct nfsd4_layoutget {
452 u64 lg_minlength; /* request */
453 u32 lg_signal; /* request */
454 u32 lg_layout_type; /* request */
455 u32 lg_maxcount; /* request */
456 stateid_t lg_sid; /* request/response */
457 struct nfsd4_layout_seg lg_seg; /* request/response */
458 void *lg_content; /* response */
459};
460
461struct nfsd4_layoutcommit {
462 stateid_t lc_sid; /* request */
463 struct nfsd4_layout_seg lc_seg; /* request */
464 u32 lc_reclaim; /* request */
465 u32 lc_newoffset; /* request */
466 u64 lc_last_wr; /* request */
467 struct timespec lc_mtime; /* request */
468 u32 lc_layout_type; /* request */
469 u32 lc_up_len; /* layout length */
470 void *lc_up_layout; /* decoded by callback */
471 u32 lc_size_chg; /* boolean for response */
472 u64 lc_newsize; /* response */
473};
474
475struct nfsd4_layoutreturn {
476 u32 lr_return_type; /* request */
477 u32 lr_layout_type; /* request */
478 struct nfsd4_layout_seg lr_seg; /* request */
479 u32 lr_reclaim; /* request */
480 u32 lrf_body_len; /* request */
481 void *lrf_body; /* request */
482 stateid_t lr_sid; /* request/response */
483 u32 lrs_present; /* response */
484};
485
431struct nfsd4_fallocate { 486struct nfsd4_fallocate {
432 /* request */ 487 /* request */
433 stateid_t falloc_stateid; 488 stateid_t falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
491 struct nfsd4_reclaim_complete reclaim_complete; 546 struct nfsd4_reclaim_complete reclaim_complete;
492 struct nfsd4_test_stateid test_stateid; 547 struct nfsd4_test_stateid test_stateid;
493 struct nfsd4_free_stateid free_stateid; 548 struct nfsd4_free_stateid free_stateid;
549 struct nfsd4_getdeviceinfo getdeviceinfo;
550 struct nfsd4_layoutget layoutget;
551 struct nfsd4_layoutcommit layoutcommit;
552 struct nfsd4_layoutreturn layoutreturn;
494 553
495 /* NFSv4.2 */ 554 /* NFSv4.2 */
496 struct nfsd4_fallocate allocate; 555 struct nfsd4_fallocate allocate;
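
Note the wire layout implied by struct nfsd4_deviceid: 8 + 4 + 4 bytes, exactly the 16-byte deviceid4 of RFC 5661, so it can be copied to and from the XDR buffer whole. A compile-time guard one could add (assuming the natural, padding-free layout):

	BUILD_BUG_ON(sizeof(struct nfsd4_deviceid) != 16);
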
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
22 cb_sequence_dec_sz + \ 22 cb_sequence_dec_sz + \
23 op_dec_sz) 23 op_dec_sz)
24#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
25 cb_sequence_enc_sz + \
26 1 + 3 + \
27 enc_nfs4_fh_sz + 4)
28#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
29 cb_sequence_dec_sz + \
30 op_dec_sz)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 3a03e0aea1fb..a8c728acb7a8 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
128 .fault = filemap_fault, 128 .fault = filemap_fault,
129 .map_pages = filemap_map_pages, 129 .map_pages = filemap_map_pages,
130 .page_mkwrite = nilfs_page_mkwrite, 130 .page_mkwrite = nilfs_page_mkwrite,
131 .remap_pages = generic_file_remap_pages,
132}; 131};
133 132
134static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) 133static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
172 inode->i_mode = S_IFREG; 172 inode->i_mode = S_IFREG;
173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
174 inode->i_mapping->a_ops = &empty_aops; 174 inode->i_mapping->a_ops = &empty_aops;
175 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
176 175
177 ii->i_flags = 0; 176 ii->i_flags = 0;
178 nilfs_bmap_init_gc(ii->i_bmap); 177 nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
429 429
430 inode->i_mode = S_IFREG; 430 inode->i_mode = S_IFREG;
431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask); 431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
432 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
433 432
434 inode->i_op = &def_mdt_iops; 433 inode->i_op = &def_mdt_iops;
435 inode->i_fop = &def_mdt_fops; 434 inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
457 struct nilfs_shadow_map *shadow) 456 struct nilfs_shadow_map *shadow)
458{ 457{
459 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 458 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
460 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
461 459
462 INIT_LIST_HEAD(&shadow->frozen_buffers); 460 INIT_LIST_HEAD(&shadow->frozen_buffers);
463 address_space_init_once(&shadow->frozen_data); 461 address_space_init_once(&shadow->frozen_data);
464 nilfs_mapping_init(&shadow->frozen_data, inode, bdi); 462 nilfs_mapping_init(&shadow->frozen_data, inode);
465 address_space_init_once(&shadow->frozen_btnodes); 463 address_space_init_once(&shadow->frozen_btnodes);
466 nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); 464 nilfs_mapping_init(&shadow->frozen_btnodes, inode);
467 mi->mi_shadow = shadow; 465 mi->mi_shadow = shadow;
468 return 0; 466 return 0;
469} 467}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
461 return nc; 461 return nc;
462} 462}
463 463
464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
465 struct backing_dev_info *bdi)
466{ 465{
467 mapping->host = inode; 466 mapping->host = inode;
468 mapping->flags = 0; 467 mapping->flags = 0;
469 mapping_set_gfp_mask(mapping, GFP_NOFS); 468 mapping_set_gfp_mask(mapping, GFP_NOFS);
470 mapping->private_data = NULL; 469 mapping->private_data = NULL;
471 mapping->backing_dev_info = bdi;
472 mapping->a_ops = &empty_aops; 470 mapping->a_ops = &empty_aops;
473} 471}
474 472
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
57void nilfs_copy_back_pages(struct address_space *, struct address_space *); 57void nilfs_copy_back_pages(struct address_space *, struct address_space *);
58void nilfs_clear_dirty_page(struct page *, bool); 58void nilfs_clear_dirty_page(struct page *, bool);
59void nilfs_clear_dirty_pages(struct address_space *, bool); 59void nilfs_clear_dirty_pages(struct address_space *, bool);
60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
61 struct backing_dev_info *bdi);
62unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 61unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
63unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 62unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
64 sector_t start_blk, 63 sector_t start_blk,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e5b3ec85b8f..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
166 ii->i_state = 0; 166 ii->i_state = 0;
167 ii->i_cno = 0; 167 ii->i_cno = 0;
168 ii->vfs_inode.i_version = 1; 168 ii->vfs_inode.i_version = 1;
169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); 169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
170 return &ii->vfs_inode; 170 return &ii->vfs_inode;
171} 171}
172 172
@@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1057{ 1057{
1058 struct the_nilfs *nilfs; 1058 struct the_nilfs *nilfs;
1059 struct nilfs_root *fsroot; 1059 struct nilfs_root *fsroot;
1060 struct backing_dev_info *bdi;
1061 __u64 cno; 1060 __u64 cno;
1062 int err; 1061 int err;
1063 1062
@@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1077 sb->s_time_gran = 1; 1076 sb->s_time_gran = 1;
1078 sb->s_max_links = NILFS_LINK_MAX; 1077 sb->s_max_links = NILFS_LINK_MAX;
1079 1078
1080 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 1079 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
1081 sb->s_bdi = bdi ? : &default_backing_dev_info;
1082 1080
1083 err = load_nilfs(nilfs, sb); 1081 err = load_nilfs(nilfs, sb);
1084 if (err) 1082 if (err)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..2a24249b30af 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,5 +1,6 @@
1config FSNOTIFY 1config FSNOTIFY
2 def_bool n 2 def_bool n
3 select SRCU
3 4
4source "fs/notify/dnotify/Kconfig" 5source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 6source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 30d3addfad75..51ceb8107284 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
140 } 140 }
141 141
142 if (S_ISDIR(path->dentry->d_inode->i_mode) && 142 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
143 (marks_ignored_mask & FS_ISDIR)) 143 !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
144 return false; 144 return false;
145 145
146 if (event_mask & marks_mask & ~marks_ignored_mask) 146 if (event_mask & marks_mask & ~marks_ignored_mask)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index bff8567aa42d..cf275500a665 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -487,20 +487,27 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
487 unsigned int flags, 487 unsigned int flags,
488 int *destroy) 488 int *destroy)
489{ 489{
490 __u32 oldmask; 490 __u32 oldmask = 0;
491 491
492 spin_lock(&fsn_mark->lock); 492 spin_lock(&fsn_mark->lock);
493 if (!(flags & FAN_MARK_IGNORED_MASK)) { 493 if (!(flags & FAN_MARK_IGNORED_MASK)) {
494 __u32 tmask = fsn_mark->mask & ~mask;
495
496 if (flags & FAN_MARK_ONDIR)
497 tmask &= ~FAN_ONDIR;
498
494 oldmask = fsn_mark->mask; 499 oldmask = fsn_mark->mask;
495 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask)); 500 fsnotify_set_mark_mask_locked(fsn_mark, tmask);
496 } else { 501 } else {
497 oldmask = fsn_mark->ignored_mask; 502 __u32 tmask = fsn_mark->ignored_mask & ~mask;
498 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask)); 503 if (flags & FAN_MARK_ONDIR)
504 tmask &= ~FAN_ONDIR;
505
506 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
499 } 507 }
508 *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
500 spin_unlock(&fsn_mark->lock); 509 spin_unlock(&fsn_mark->lock);
501 510
502 *destroy = !(oldmask & ~mask);
503
504 return mask & oldmask; 511 return mask & oldmask;
505} 512}
506 513
@@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
569 576
570 spin_lock(&fsn_mark->lock); 577 spin_lock(&fsn_mark->lock);
571 if (!(flags & FAN_MARK_IGNORED_MASK)) { 578 if (!(flags & FAN_MARK_IGNORED_MASK)) {
579 __u32 tmask = fsn_mark->mask | mask;
580
581 if (flags & FAN_MARK_ONDIR)
582 tmask |= FAN_ONDIR;
583
572 oldmask = fsn_mark->mask; 584 oldmask = fsn_mark->mask;
573 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); 585 fsnotify_set_mark_mask_locked(fsn_mark, tmask);
574 } else { 586 } else {
575 __u32 tmask = fsn_mark->ignored_mask | mask; 587 __u32 tmask = fsn_mark->ignored_mask | mask;
588 if (flags & FAN_MARK_ONDIR)
589 tmask |= FAN_ONDIR;
590
576 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); 591 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
577 if (flags & FAN_MARK_IGNORED_SURV_MODIFY) 592 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
578 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; 593 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
579 } 594 }
580
581 if (!(flags & FAN_MARK_ONDIR)) {
582 __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
583 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
584 }
585
586 spin_unlock(&fsn_mark->lock); 595 spin_unlock(&fsn_mark->lock);
587 596
588 return mask & ~oldmask; 597 return mask & ~oldmask;
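
With this change FAN_ONDIR travels with whichever mask the caller is updating, instead of being forced into the ignored mask. From userspace the switch is simply the event-mask bit (a sketch; fd is an already-initialized fanotify descriptor):

	/* also deliver open events for the directory itself */
	fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN | FAN_ONDIR,
		      AT_FDCWD, "/tmp");
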
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/backing-dev.h>
22#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
23#include <linux/gfp.h> 24#include <linux/gfp.h>
24#include <linux/pagemap.h> 25#include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2091 count = iov_length(iov, nr_segs); 2092 count = iov_length(iov, nr_segs);
2092 pos = *ppos; 2093 pos = *ppos;
2093 /* We can write back this queue in page reclaim. */ 2094 /* We can write back this queue in page reclaim. */
2094 current->backing_dev_info = mapping->backing_dev_info; 2095 current->backing_dev_info = inode_to_bdi(inode);
2095 written = 0; 2096 written = 0;
2096 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2097 if (err) 2098 if (err)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7e8282dcea2a..c58a1bcfda0f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle,
245 ret = posix_acl_equiv_mode(acl, &mode); 245 ret = posix_acl_equiv_mode(acl, &mode);
246 if (ret < 0) 246 if (ret < 0)
247 return ret; 247 return ret;
248 else {
249 if (ret == 0)
250 acl = NULL;
251 248
252 ret = ocfs2_acl_set_mode(inode, di_bh, 249 if (ret == 0)
253 handle, mode); 250 acl = NULL;
254 if (ret)
255 return ret;
256 251
257 } 252 ret = ocfs2_acl_set_mode(inode, di_bh,
253 handle, mode);
254 if (ret)
255 return ret;
258 } 256 }
259 break; 257 break;
260 case ACL_TYPE_DEFAULT: 258 case ACL_TYPE_DEFAULT:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fcae9ef1a328..044158bd22be 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6873,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6873 if (IS_ERR(handle)) { 6873 if (IS_ERR(handle)) {
6874 ret = PTR_ERR(handle); 6874 ret = PTR_ERR(handle);
6875 mlog_errno(ret); 6875 mlog_errno(ret);
6876 goto out_unlock; 6876 goto out;
6877 } 6877 }
6878 6878
6879 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 6879 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
@@ -6931,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6931 if (ret) { 6931 if (ret) {
6932 mlog_errno(ret); 6932 mlog_errno(ret);
6933 need_free = 1; 6933 need_free = 1;
6934 goto out_commit; 6934 goto out_unlock;
6935 } 6935 }
6936 6936
6937 page_end = PAGE_CACHE_SIZE; 6937 page_end = PAGE_CACHE_SIZE;
@@ -6964,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6964 if (ret) { 6964 if (ret) {
6965 mlog_errno(ret); 6965 mlog_errno(ret);
6966 need_free = 1; 6966 need_free = 1;
6967 goto out_commit; 6967 goto out_unlock;
6968 } 6968 }
6969 6969
6970 inode->i_blocks = ocfs2_inode_sector_count(inode); 6970 inode->i_blocks = ocfs2_inode_sector_count(inode);
6971 } 6971 }
6972 6972
6973out_unlock:
6974 if (pages)
6975 ocfs2_unlock_and_free_pages(pages, num_pages);
6976
6973out_commit: 6977out_commit:
6974 if (ret < 0 && did_quota) 6978 if (ret < 0 && did_quota)
6975 dquot_free_space_nodirty(inode, 6979 dquot_free_space_nodirty(inode,
@@ -6989,15 +6993,11 @@ out_commit:
6989 6993
6990 ocfs2_commit_trans(osb, handle); 6994 ocfs2_commit_trans(osb, handle);
6991 6995
6992out_unlock: 6996out:
6993 if (data_ac) 6997 if (data_ac)
6994 ocfs2_free_alloc_context(data_ac); 6998 ocfs2_free_alloc_context(data_ac);
6995 6999 if (pages)
6996out:
6997 if (pages) {
6998 ocfs2_unlock_and_free_pages(pages, num_pages);
6999 kfree(pages); 7000 kfree(pages);
7000 }
7001 7001
7002 return ret; 7002 return ret;
7003} 7003}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 46d93e941f3d..44db1808cdb5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -28,6 +28,7 @@
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h> 29#include <linux/mpage.h>
30#include <linux/quotaops.h> 30#include <linux/quotaops.h>
31#include <linux/blkdev.h>
31 32
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
33 34
@@ -47,6 +48,9 @@
47#include "ocfs2_trace.h" 48#include "ocfs2_trace.h"
48 49
49#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51#include "dir.h"
52#include "namei.h"
53#include "sysfile.h"
50 54
51static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, 55static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
52 struct buffer_head *bh_result, int create) 56 struct buffer_head *bh_result, int create)
@@ -506,18 +510,21 @@ bail:
506 * 510 *
507 * called like this: dio->get_blocks(dio->inode, fs_startblk, 511 * called like this: dio->get_blocks(dio->inode, fs_startblk,
508 * fs_count, map_bh, dio->rw == WRITE); 512 * fs_count, map_bh, dio->rw == WRITE);
509 *
510 * Note that we never bother to allocate blocks here, and thus ignore the
511 * create argument.
512 */ 513 */
513static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 514static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
514 struct buffer_head *bh_result, int create) 515 struct buffer_head *bh_result, int create)
515{ 516{
516 int ret; 517 int ret;
518 u32 cpos = 0;
519 int alloc_locked = 0;
517 u64 p_blkno, inode_blocks, contig_blocks; 520 u64 p_blkno, inode_blocks, contig_blocks;
518 unsigned int ext_flags; 521 unsigned int ext_flags;
519 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 522 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
520 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 523 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
524 unsigned long len = bh_result->b_size;
525 unsigned int clusters_to_alloc = 0;
526
527 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
521 528
522 /* This function won't even be called if the request isn't all 529 /* This function won't even be called if the request isn't all
523 * nicely aligned and of the right size, so there's no need 530 * nicely aligned and of the right size, so there's no need
@@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
539 /* We should already CoW the refcounted extent in case of create. */ 546 /* We should already CoW the refcounted extent in case of create. */
540 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); 547 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
541 548
549 /* allocate blocks if no p_blkno is found, and create == 1 */
550 if (!p_blkno && create) {
551 ret = ocfs2_inode_lock(inode, NULL, 1);
552 if (ret < 0) {
553 mlog_errno(ret);
554 goto bail;
555 }
556
557 alloc_locked = 1;
558
559 /* when filling a hole, the allocation can't be larger than
560 * the size of the hole */
561 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
562 if (clusters_to_alloc > contig_blocks)
563 clusters_to_alloc = contig_blocks;
564
565 /* allocate extent and insert them into the extent tree */
566 ret = ocfs2_extend_allocation(inode, cpos,
567 clusters_to_alloc, 0);
568 if (ret < 0) {
569 mlog_errno(ret);
570 goto bail;
571 }
572
573 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
574 &contig_blocks, &ext_flags);
575 if (ret < 0) {
576 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
577 (unsigned long long)iblock);
578 ret = -EIO;
579 goto bail;
580 }
581 }
582
542 /* 583 /*
543 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
544 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
556 contig_blocks = max_blocks; 597 contig_blocks = max_blocks;
557 bh_result->b_size = contig_blocks << blocksize_bits; 598 bh_result->b_size = contig_blocks << blocksize_bits;
558bail: 599bail:
600 if (alloc_locked)
601 ocfs2_inode_unlock(inode, 1);
559 return ret; 602 return ret;
560} 603}
561 604
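
A worked example of the hole-filling cap above: for a 1 MB request (len = 1048576) on a volume with 4 KB clusters, ocfs2_clusters_for_bytes() yields 256, which is then clamped to contig_blocks so the new allocation never overshoots the mapped hole.
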
@@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
597 return try_to_free_buffers(page); 640 return try_to_free_buffers(page);
598} 641}
599 642
643static int ocfs2_is_overwrite(struct ocfs2_super *osb,
644 struct inode *inode, loff_t offset)
645{
646 int ret = 0;
647 u32 v_cpos = 0;
648 u32 p_cpos = 0;
649 unsigned int num_clusters = 0;
650 unsigned int ext_flags = 0;
651
652 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
653 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
654 &num_clusters, &ext_flags);
655 if (ret < 0) {
656 mlog_errno(ret);
657 return ret;
658 }
659
660 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
661 return 1;
662
663 return 0;
664}
665
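
ocfs2_is_overwrite() reduces to a single cluster lookup: with s_clustersize_bits == 12 (4 KB clusters), offset 0x3000 maps to v_cpos = 0x3000 >> 12 = 3, and the result is 1 only if cluster 3 is both mapped (p_cpos != 0) and not flagged OCFS2_EXT_UNWRITTEN.
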
666static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
667 struct iov_iter *iter,
668 loff_t offset)
669{
670 ssize_t ret = 0;
671 ssize_t written = 0;
672 bool orphaned = false;
673 int is_overwrite = 0;
674 struct file *file = iocb->ki_filp;
675 struct inode *inode = file_inode(file)->i_mapping->host;
676 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
677 struct buffer_head *di_bh = NULL;
678 size_t count = iter->count;
679 journal_t *journal = osb->journal->j_journal;
680 u32 zero_len;
681 int cluster_align;
682 loff_t final_size = offset + count;
683 int append_write = offset >= i_size_read(inode) ? 1 : 0;
684 unsigned int num_clusters = 0;
685 unsigned int ext_flags = 0;
686
687 {
688 u64 o = offset;
689
690 zero_len = do_div(o, 1 << osb->s_clustersize_bits);
691 cluster_align = !zero_len;
692 }
693
694 /*
695 * When final_size > inode->i_size, i_size is only updated
696 * after the direct write completes; add the inode to the
697 * orphan dir first so a crash mid-write is cleaned up.
698 */
699 if (final_size > i_size_read(inode)) {
700 ret = ocfs2_add_inode_to_orphan(osb, inode);
701 if (ret < 0) {
702 mlog_errno(ret);
703 goto out;
704 }
705 orphaned = true;
706 }
707
708 if (append_write) {
709 ret = ocfs2_inode_lock(inode, &di_bh, 1);
710 if (ret < 0) {
711 mlog_errno(ret);
712 goto clean_orphan;
713 }
714
715 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
716 ret = ocfs2_zero_extend(inode, di_bh, offset);
717 else
718 ret = ocfs2_extend_no_holes(inode, di_bh, offset,
719 offset);
720 if (ret < 0) {
721 mlog_errno(ret);
722 ocfs2_inode_unlock(inode, 1);
723 brelse(di_bh);
724 goto clean_orphan;
725 }
726
727 is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
728 if (is_overwrite < 0) {
729 mlog_errno(is_overwrite);
730 ocfs2_inode_unlock(inode, 1);
731 brelse(di_bh);
732 goto clean_orphan;
733 }
734
735 ocfs2_inode_unlock(inode, 1);
736 brelse(di_bh);
737 di_bh = NULL;
738 }
739
740 written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
741 iter, offset,
742 ocfs2_direct_IO_get_blocks,
743 ocfs2_dio_end_io, NULL, 0);
744 if (unlikely(written < 0)) {
745 loff_t i_size = i_size_read(inode);
746
747 if (offset + count > i_size) {
748 ret = ocfs2_inode_lock(inode, &di_bh, 1);
749 if (ret < 0) {
750 mlog_errno(ret);
751 goto clean_orphan;
752 }
753
754 if (i_size == i_size_read(inode)) {
755 ret = ocfs2_truncate_file(inode, di_bh,
756 i_size);
757 if (ret < 0) {
758 if (ret != -ENOSPC)
759 mlog_errno(ret);
760
761 ocfs2_inode_unlock(inode, 1);
762 brelse(di_bh);
763 goto clean_orphan;
764 }
765 }
766
767 ocfs2_inode_unlock(inode, 1);
768 brelse(di_bh);
769
770 ret = jbd2_journal_force_commit(journal);
771 if (ret < 0)
772 mlog_errno(ret);
773 }
774 } else if (written > 0 && append_write && !is_overwrite &&
775 !cluster_align) {
776 u32 p_cpos = 0;
777 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
778
779 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
780 &num_clusters, &ext_flags);
781 if (ret < 0) {
782 mlog_errno(ret);
783 goto clean_orphan;
784 }
785
786 BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
787
788 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
789 p_cpos << (osb->s_clustersize_bits - 9),
790 zero_len >> 9, GFP_KERNEL, false);
791 if (ret < 0)
792 mlog_errno(ret);
793 }
794
795clean_orphan:
796 if (orphaned) {
797 int tmp_ret;
798 int update_isize = written > 0 ? 1 : 0;
799 loff_t end = update_isize ? offset + written : 0;
800
801 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
802 update_isize, end);
803 if (tmp_ret < 0) {
804 ret = tmp_ret;
805 goto out;
806 }
807
808 tmp_ret = jbd2_journal_force_commit(journal);
809 if (tmp_ret < 0) {
810 ret = tmp_ret;
811 mlog_errno(tmp_ret);
812 }
813 }
814
815out:
816 if (ret >= 0)
817 ret = written;
818 return ret;
819}
820
600static ssize_t ocfs2_direct_IO(int rw, 821static ssize_t ocfs2_direct_IO(int rw,
601 struct kiocb *iocb, 822 struct kiocb *iocb,
602 struct iov_iter *iter, 823 struct iov_iter *iter,
@@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw,
604{ 825{
605 struct file *file = iocb->ki_filp; 826 struct file *file = iocb->ki_filp;
606 struct inode *inode = file_inode(file)->i_mapping->host; 827 struct inode *inode = file_inode(file)->i_mapping->host;
828 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
829 int full_coherency = !(osb->s_mount_opt &
830 OCFS2_MOUNT_COHERENCY_BUFFERED);
607 831
608 /* 832 /*
609 * Fallback to buffered I/O if we see an inode without 833 * Fallback to buffered I/O if we see an inode without
@@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw,
612 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 836 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
613 return 0; 837 return 0;
614 838
615 /* Fallback to buffered I/O if we are appending. */ 839 /* Fallback to buffered I/O if we are appending and
616 if (i_size_read(inode) <= offset) 840 * concurrent O_DIRECT writes are allowed.
841 */
842 if (i_size_read(inode) <= offset && !full_coherency)
617 return 0; 843 return 0;
618 844
619 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 845 if (rw == READ)
846 return __blockdev_direct_IO(rw, iocb, inode,
847 inode->i_sb->s_bdev,
620 iter, offset, 848 iter, offset,
621 ocfs2_direct_IO_get_blocks, 849 ocfs2_direct_IO_get_blocks,
622 ocfs2_dio_end_io, NULL, 0); 850 ocfs2_dio_end_io, NULL, 0);
851 else
852 return ocfs2_direct_IO_write(iocb, iter, offset);
623} 853}
624 854
625static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 855static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2e355e0f8335..56c403a563bc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes)
1016 1016
1017 memset(map, 0, bytes); 1017 memset(map, 0, bytes);
1018 for (node = 0; node < O2NM_MAX_NODES; ++node) { 1018 for (node = 0; node < O2NM_MAX_NODES; ++node) {
1019 o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); 1019 if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
1020 continue;
1020 if (!ret) { 1021 if (!ret) {
1021 set_bit(node, map); 1022 set_bit(node, map);
1022 sc_put(sc); 1023 sc_put(sc);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index dc024367110a..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -107,12 +107,12 @@ struct o2net_node {
107 struct list_head nn_status_list; 107 struct list_head nn_status_list;
108 108
109 /* connects are attempted from when heartbeat comes up until either hb 109 /* connects are attempted from when heartbeat comes up until either hb
110 * goes down, the node is unconfigured, no connect attempts succeed 110 * goes down, the node is unconfigured, or a connect succeeds.
111 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work 111 * connect_work is queued from set_nn_state both from hb up and from
112 * is queued from set_nn_state both from hb up and from itself if a 112 * itself if a connect attempt fails and so can be self-arming.
113 * connect attempt fails and so can be self-arming. shutdown is 113 * shutdown is careful to first mark the nn such that no connects will
114 * careful to first mark the nn such that no connects will be attempted 114 * be attempted before canceling delayed connect work and flushing the
115 * before canceling delayed connect work and flushing the queue. */ 115 * queue. */
116 struct delayed_work nn_connect_work; 116 struct delayed_work nn_connect_work;
117 unsigned long nn_last_connect_attempt; 117 unsigned long nn_last_connect_attempt;
118 118
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 319e786175af..b08050bd3f2e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3456,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
3456 int blocksize = dir->i_sb->s_blocksize; 3456 int blocksize = dir->i_sb->s_blocksize;
3457 3457
3458 status = ocfs2_read_dir_block(dir, 0, &bh, 0); 3458 status = ocfs2_read_dir_block(dir, 0, &bh, 0);
3459 if (status) { 3459 if (status)
3460 mlog_errno(status);
3461 goto bail; 3460 goto bail;
3462 }
3463 3461
3464 rec_len = OCFS2_DIR_REC_LEN(namelen); 3462 rec_len = OCFS2_DIR_REC_LEN(namelen);
3465 offset = 0; 3463 offset = 0;
@@ -3480,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
3480 status = ocfs2_read_dir_block(dir, 3478 status = ocfs2_read_dir_block(dir,
3481 offset >> sb->s_blocksize_bits, 3479 offset >> sb->s_blocksize_bits,
3482 &bh, 0); 3480 &bh, 0);
3483 if (status) { 3481 if (status)
3484 mlog_errno(status);
3485 goto bail; 3482 goto bail;
3486 } 3483
3487 /* move to next block */ 3484 /* move to next block */
3488 de = (struct ocfs2_dir_entry *) bh->b_data; 3485 de = (struct ocfs2_dir_entry *) bh->b_data;
3489 } 3486 }
@@ -3513,7 +3510,6 @@ next:
3513 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); 3510 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
3514 } 3511 }
3515 3512
3516 status = 0;
3517bail: 3513bail:
3518 brelse(bh); 3514 brelse(bh);
3519 if (status) 3515 if (status)
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index b46278f9ae44..fd6bbbbd7d78 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
385 head = &res->granted; 385 head = &res->granted;
386 386
387 list_for_each_entry(lock, head, list) { 387 list_for_each_entry(lock, head, list) {
388 if (lock->ml.cookie == cookie) 388 /* if the lock is found but an unlock is pending, ignore the bast */
389 if (lock->ml.cookie == cookie) {
390 if (lock->unlock_pending)
391 break;
389 goto do_ast; 392 goto do_ast;
393 }
390 } 394 }
391 395
392 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " 396 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 149eb556b8c6..825136070d2c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
406 } 406 }
407 spin_unlock(&dlm->spinlock); 407 spin_unlock(&dlm->spinlock);
408 408
409 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); 409 out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
410 410
411 return out; 411 return out;
412} 412}
@@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
464 spin_unlock(&dlm->master_lock); 464 spin_unlock(&dlm->master_lock);
465 465
466 out += snprintf(buf + out, len - out, 466 out += snprintf(buf + out, len - out,
467 "Total: %ld, Longest: %ld\n", total, longest); 467 "Total: %lu, Longest: %lu\n", total, longest);
468 return out; 468 return out;
469} 469}
470 470
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 50a59d2337b2..7df88a6dd626 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
674 spin_unlock(&dlm->spinlock); 674 spin_unlock(&dlm->spinlock);
675} 675}
676 676
677int dlm_joined(struct dlm_ctxt *dlm)
678{
679 int ret = 0;
680
681 spin_lock(&dlm_domain_lock);
682
683 if (dlm->dlm_state == DLM_CTXT_JOINED)
684 ret = 1;
685
686 spin_unlock(&dlm_domain_lock);
687
688 return ret;
689}
690
691int dlm_shutting_down(struct dlm_ctxt *dlm) 677int dlm_shutting_down(struct dlm_ctxt *dlm)
692{ 678{
693 int ret = 0; 679 int ret = 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 2f7f60bfeb3b..fd6122a38dbd 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,6 @@
28extern spinlock_t dlm_domain_lock; 28extern spinlock_t dlm_domain_lock;
29extern struct list_head dlm_domains; 29extern struct list_head dlm_domains;
30 30
31int dlm_joined(struct dlm_ctxt *dlm);
32int dlm_shutting_down(struct dlm_ctxt *dlm); 31int dlm_shutting_down(struct dlm_ctxt *dlm);
33void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, 32void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
34 int node_num); 33 int node_num);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index cecd875653e4..ce12e0b1a31f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1070 dead_node, dlm->name); 1070 dead_node, dlm->name);
1071 list_del_init(&lock->list); 1071 list_del_init(&lock->list);
1072 dlm_lock_put(lock); 1072 dlm_lock_put(lock);
1073 /* Can't schedule DLM_UNLOCK_FREE_LOCK
1074 * - do manually */
1075 dlm_lock_put(lock);
1073 break; 1076 break;
1074 } 1077 }
1075 } 1078 }
@@ -2346,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2346 dead_node, dlm->name); 2349 dead_node, dlm->name);
2347 list_del_init(&lock->list); 2350 list_del_init(&lock->list);
2348 dlm_lock_put(lock); 2351 dlm_lock_put(lock);
2352 /* Can't schedule
2353 * DLM_UNLOCK_FREE_LOCK
2354 * - do manually */
2355 dlm_lock_put(lock);
2349 break; 2356 break;
2350 } 2357 }
2351 } 2358 }
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 57c40e34f56f..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -390,12 +390,6 @@ clear_fields:
390 ip->ip_conn = NULL; 390 ip->ip_conn = NULL;
391} 391}
392 392
393static struct backing_dev_info dlmfs_backing_dev_info = {
394 .name = "ocfs2-dlmfs",
395 .ra_pages = 0, /* No readahead */
396 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
397};
398
399static struct inode *dlmfs_get_root_inode(struct super_block *sb) 393static struct inode *dlmfs_get_root_inode(struct super_block *sb)
400{ 394{
401 struct inode *inode = new_inode(sb); 395 struct inode *inode = new_inode(sb);
@@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
404 if (inode) { 398 if (inode) {
405 inode->i_ino = get_next_ino(); 399 inode->i_ino = get_next_ino();
406 inode_init_owner(inode, NULL, mode); 400 inode_init_owner(inode, NULL, mode);
407 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
408 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 401 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
409 inc_nlink(inode); 402 inc_nlink(inode);
410 403
@@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
428 421
429 inode->i_ino = get_next_ino(); 422 inode->i_ino = get_next_ino();
430 inode_init_owner(inode, parent, mode); 423 inode_init_owner(inode, parent, mode);
431 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 424 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
433 425
434 ip = DLMFS_I(inode); 426 ip = DLMFS_I(inode);
@@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void)
643 int status; 635 int status;
644 int cleanup_inode = 0, cleanup_worker = 0; 636 int cleanup_inode = 0, cleanup_worker = 0;
645 637
646 status = bdi_init(&dlmfs_backing_dev_info);
647 if (status)
648 return status;
649
650 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", 638 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
651 sizeof(struct dlmfs_inode_private), 639 sizeof(struct dlmfs_inode_private),
652 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 640 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -673,7 +661,6 @@ bail:
673 kmem_cache_destroy(dlmfs_inode_cache); 661 kmem_cache_destroy(dlmfs_inode_cache);
674 if (cleanup_worker) 662 if (cleanup_worker)
675 destroy_workqueue(user_dlm_worker); 663 destroy_workqueue(user_dlm_worker);
676 bdi_destroy(&dlmfs_backing_dev_info);
677 } else 664 } else
678 printk("OCFS2 User DLM kernel interface loaded\n"); 665 printk("OCFS2 User DLM kernel interface loaded\n");
679 return status; 666 return status;
@@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void)
693 rcu_barrier(); 680 rcu_barrier();
694 kmem_cache_destroy(dlmfs_inode_cache); 681 kmem_cache_destroy(dlmfs_inode_cache);
695 682
696 bdi_destroy(&dlmfs_backing_dev_info);
697} 683}
698 684
699MODULE_AUTHOR("Oracle"); 685MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c423af04c69..11849a44dc5a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3750,6 +3750,9 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3750 break; 3750 break;
3751 spin_unlock(&dentry_attach_lock); 3751 spin_unlock(&dentry_attach_lock);
3752 3752
3753 if (S_ISDIR(dl->dl_inode->i_mode))
3754 shrink_dcache_parent(dentry);
3755
3753 mlog(0, "d_delete(%pd);\n", dentry); 3756 mlog(0, "d_delete(%pd);\n", dentry);
3754 3757
3755 /* 3758 /*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693dd0f6..46e0d4e857c7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -295,7 +295,7 @@ out:
295 return ret; 295 return ret;
296} 296}
297 297
298static int ocfs2_set_inode_size(handle_t *handle, 298int ocfs2_set_inode_size(handle_t *handle,
299 struct inode *inode, 299 struct inode *inode,
300 struct buffer_head *fe_bh, 300 struct buffer_head *fe_bh,
301 u64 new_i_size) 301 u64 new_i_size)
@@ -441,7 +441,7 @@ out:
441 return status; 441 return status;
442} 442}
443 443
444static int ocfs2_truncate_file(struct inode *inode, 444int ocfs2_truncate_file(struct inode *inode,
445 struct buffer_head *di_bh, 445 struct buffer_head *di_bh,
446 u64 new_i_size) 446 u64 new_i_size)
447{ 447{
@@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
569 handle_t *handle = NULL; 569 handle_t *handle = NULL;
570 struct ocfs2_alloc_context *data_ac = NULL; 570 struct ocfs2_alloc_context *data_ac = NULL;
571 struct ocfs2_alloc_context *meta_ac = NULL; 571 struct ocfs2_alloc_context *meta_ac = NULL;
572 enum ocfs2_alloc_restarted why; 572 enum ocfs2_alloc_restarted why = RESTART_NONE;
573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
574 struct ocfs2_extent_tree et; 574 struct ocfs2_extent_tree et;
575 int did_quota = 0; 575 int did_quota = 0;
@@ -709,6 +709,13 @@ leave:
709 return status; 709 return status;
710} 710}
711 711
712int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
713 u32 clusters_to_add, int mark_unwritten)
714{
715 return __ocfs2_extend_allocation(inode, logical_start,
716 clusters_to_add, mark_unwritten);
717}
718
712/* 719/*
713 * While a write will already be ordering the data, a truncate will not. 720 * While a write will already be ordering the data, a truncate will not.
714 * Thus, we need to explicitly order the zeroed pages. 721 * Thus, we need to explicitly order the zeroed pages.
@@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2109 struct dentry *dentry = file->f_path.dentry; 2116 struct dentry *dentry = file->f_path.dentry;
2110 struct inode *inode = dentry->d_inode; 2117 struct inode *inode = dentry->d_inode;
2111 loff_t saved_pos = 0, end; 2118 loff_t saved_pos = 0, end;
2119 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2120 int full_coherency = !(osb->s_mount_opt &
2121 OCFS2_MOUNT_COHERENCY_BUFFERED);
2112 2122
2113 /* 2123 /*
2114 * We start with a read level meta lock and only jump to an ex 2124 * We start with a read level meta lock and only jump to an ex
@@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2197 * one node could wind up truncating another 2207 * one node could wind up truncating another
2198 * node's writes. 2208 * node's writes.
2199 */ 2209 */
2200 if (end > i_size_read(inode)) { 2210 if (end > i_size_read(inode) && !full_coherency) {
2211 *direct_io = 0;
2212 break;
2213 }
2214
2215 /*
2216 * Fall back to the old way if the feature bit is not set.
2217 */
2218 if (end > i_size_read(inode) &&
2219 !ocfs2_supports_append_dio(osb)) {
2201 *direct_io = 0; 2220 *direct_io = 0;
2202 break; 2221 break;
2203 } 2222 }
@@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2210 */ 2229 */
2211 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 2230 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2212 if (ret == 1) { 2231 if (ret == 1) {
2213 *direct_io = 0; 2232 /*
2233 * Fall back to the old way if the feature bit is not set.
2234 * Otherwise try dio first and then complete the rest of the
2235 * request through buffered io.
2236 */
2237 if (!ocfs2_supports_append_dio(osb))
2238 *direct_io = 0;
2214 ret = 0; 2239 ret = 0;
2215 } else if (ret < 0) 2240 } else if (ret < 0)
2216 mlog_errno(ret); 2241 mlog_errno(ret);
@@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2243 u32 old_clusters; 2268 u32 old_clusters;
2244 struct file *file = iocb->ki_filp; 2269 struct file *file = iocb->ki_filp;
2245 struct inode *inode = file_inode(file); 2270 struct inode *inode = file_inode(file);
2271 struct address_space *mapping = file->f_mapping;
2246 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2247 int full_coherency = !(osb->s_mount_opt & 2273 int full_coherency = !(osb->s_mount_opt &
2248 OCFS2_MOUNT_COHERENCY_BUFFERED); 2274 OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2357,13 +2383,53 @@ relock:
2357 2383
2358 iov_iter_truncate(from, count); 2384 iov_iter_truncate(from, count);
2359 if (direct_io) { 2385 if (direct_io) {
2386 loff_t endbyte;
2387 ssize_t written_buffered;
2360 written = generic_file_direct_write(iocb, from, *ppos); 2388 written = generic_file_direct_write(iocb, from, *ppos);
2361 if (written < 0) { 2389 if (written < 0 || written == count) {
2362 ret = written; 2390 ret = written;
2363 goto out_dio; 2391 goto out_dio;
2364 } 2392 }
2393
2394 /*
2395 * Fall back to buffered io to complete the rest of the request.
2396 */
2397 *ppos += written;
2398 count -= written;
2399 written_buffered = generic_perform_write(file, from, *ppos);
2400 /*
2401 * If generic_perform_write() returned a synchronous error
2402 * then we want to return the number of bytes which were
2403 * direct-written, or the error code if that was zero. Note
2404 * that this differs from normal direct-io semantics, which
2405 * will return -EFOO even if some bytes were written.
2406 */
2407 if (written_buffered < 0) {
2408 ret = written_buffered;
2409 goto out_dio;
2410 }
2411
2412 iocb->ki_pos = *ppos + written_buffered;
2413 /* We need to ensure that the page cache pages are written to
2414 * disk and invalidated to preserve the expected O_DIRECT
2415 * semantics.
2416 */
2417 endbyte = *ppos + written_buffered - 1;
2418 ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
2419 endbyte);
2420 if (ret == 0) {
2421 written += written_buffered;
2422 invalidate_mapping_pages(mapping,
2423 *ppos >> PAGE_CACHE_SHIFT,
2424 endbyte >> PAGE_CACHE_SHIFT);
2425 } else {
2426 /*
2427 * We don't know how much we wrote, so just return
2428 * the number of bytes which were direct-written
2429 */
2430 }
2365 } else { 2431 } else {
2366 current->backing_dev_info = file->f_mapping->backing_dev_info; 2432 current->backing_dev_info = inode_to_bdi(inode);
2367 written = generic_perform_write(file, from, *ppos); 2433 written = generic_perform_write(file, from, *ppos);
2368 if (likely(written >= 0)) 2434 if (likely(written >= 0))
2369 iocb->ki_pos = *ppos + written; 2435 iocb->ki_pos = *ppos + written;
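Note: the new direct-io path in ocfs2_file_write_iter() above is the
usual direct-then-buffered fallback; condensed (same calls as the
hunk, locking and cleanup elided):

	written = generic_file_direct_write(iocb, from, *ppos);
	if (written < 0 || written == count)
		return written;			/* done, or hard error */

	/* complete the tail through the page cache */
	*ppos += written;
	count -= written;
	written_buffered = generic_perform_write(file, from, *ppos);
	if (written_buffered < 0)
		return written_buffered;

	/* flush and invalidate the buffered range to preserve the
	 * expected O_DIRECT semantics */
	endbyte = *ppos + written_buffered - 1;
	if (!filemap_write_and_wait_range(mapping, *ppos, endbyte)) {
		written += written_buffered;
		invalidate_mapping_pages(mapping, *ppos >> PAGE_CACHE_SHIFT,
					 endbyte >> PAGE_CACHE_SHIFT);
	}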
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..e8c62f22215c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
51 struct ocfs2_alloc_context *data_ac, 51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac, 52 struct ocfs2_alloc_context *meta_ac,
53 enum ocfs2_alloc_restarted *reason_ret); 53 enum ocfs2_alloc_restarted *reason_ret);
54int ocfs2_set_inode_size(handle_t *handle,
55 struct inode *inode,
56 struct buffer_head *fe_bh,
57 u64 new_i_size);
54int ocfs2_simple_size_update(struct inode *inode, 58int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh, 59 struct buffer_head *di_bh,
56 u64 new_i_size); 60 u64 new_i_size);
61int ocfs2_truncate_file(struct inode *inode,
62 struct buffer_head *di_bh,
63 u64 new_i_size);
57int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, 64int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
58 u64 new_i_size, u64 zero_to); 65 u64 new_i_size, u64 zero_to);
59int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, 66int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
60 loff_t zero_to); 67 loff_t zero_to);
68int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
69 u32 clusters_to_add, int mark_unwritten);
61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 70int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 71int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
63 struct kstat *stat); 72 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c8b25de9efbb..3025c0da6b8a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
648 648
649 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 649 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
650 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 650 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
651 orphan_dir_bh); 651 orphan_dir_bh, false);
652 if (status < 0) { 652 if (status < 0) {
653 mlog_errno(status); 653 mlog_errno(status);
654 goto bail_commit; 654 goto bail_commit;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..5e86b247c821 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,6 +81,8 @@ struct ocfs2_inode_info
81 tid_t i_sync_tid; 81 tid_t i_sync_tid;
82 tid_t i_datasync_tid; 82 tid_t i_datasync_tid;
83 83
84 wait_queue_head_t append_dio_wq;
85
84 struct dquot *i_dquot[MAXQUOTAS]; 86 struct dquot *i_dquot[MAXQUOTAS];
85}; 87};
86 88
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f502382180f..ff531928269e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -50,6 +50,8 @@
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "quota.h" 52#include "quota.h"
53#include "file.h"
54#include "namei.h"
53 55
54#include "buffer_head_io.h" 56#include "buffer_head_io.h"
55#include "ocfs2_trace.h" 57#include "ocfs2_trace.h"
@@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
69static int ocfs2_trylock_journal(struct ocfs2_super *osb, 71static int ocfs2_trylock_journal(struct ocfs2_super *osb,
70 int slot_num); 72 int slot_num);
71static int ocfs2_recover_orphans(struct ocfs2_super *osb, 73static int ocfs2_recover_orphans(struct ocfs2_super *osb,
72 int slot); 74 int slot,
75 enum ocfs2_orphan_reco_type orphan_reco_type);
73static int ocfs2_commit_thread(void *arg); 76static int ocfs2_commit_thread(void *arg);
74static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 77static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
75 int slot_num, 78 int slot_num,
76 struct ocfs2_dinode *la_dinode, 79 struct ocfs2_dinode *la_dinode,
77 struct ocfs2_dinode *tl_dinode, 80 struct ocfs2_dinode *tl_dinode,
78 struct ocfs2_quota_recovery *qrec); 81 struct ocfs2_quota_recovery *qrec,
82 enum ocfs2_orphan_reco_type orphan_reco_type);
79 83
80static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) 84static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
81{ 85{
@@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
149 return 0; 153 return 0;
150} 154}
151 155
152void ocfs2_queue_replay_slots(struct ocfs2_super *osb) 156void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
157 enum ocfs2_orphan_reco_type orphan_reco_type)
153{ 158{
154 struct ocfs2_replay_map *replay_map = osb->replay_map; 159 struct ocfs2_replay_map *replay_map = osb->replay_map;
155 int i; 160 int i;
@@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
163 for (i = 0; i < replay_map->rm_slots; i++) 168 for (i = 0; i < replay_map->rm_slots; i++)
164 if (replay_map->rm_replay_slots[i]) 169 if (replay_map->rm_replay_slots[i])
165 ocfs2_queue_recovery_completion(osb->journal, i, NULL, 170 ocfs2_queue_recovery_completion(osb->journal, i, NULL,
166 NULL, NULL); 171 NULL, NULL,
172 orphan_reco_type);
167 replay_map->rm_state = REPLAY_DONE; 173 replay_map->rm_state = REPLAY_DONE;
168} 174}
169 175
@@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item {
1174 struct ocfs2_dinode *lri_la_dinode; 1180 struct ocfs2_dinode *lri_la_dinode;
1175 struct ocfs2_dinode *lri_tl_dinode; 1181 struct ocfs2_dinode *lri_tl_dinode;
1176 struct ocfs2_quota_recovery *lri_qrec; 1182 struct ocfs2_quota_recovery *lri_qrec;
1183 enum ocfs2_orphan_reco_type lri_orphan_reco_type;
1177}; 1184};
1178 1185
1179/* Does the second half of the recovery process. By this point, the 1186/* Does the second half of the recovery process. By this point, the
@@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1195 struct ocfs2_dinode *la_dinode, *tl_dinode; 1202 struct ocfs2_dinode *la_dinode, *tl_dinode;
1196 struct ocfs2_la_recovery_item *item, *n; 1203 struct ocfs2_la_recovery_item *item, *n;
1197 struct ocfs2_quota_recovery *qrec; 1204 struct ocfs2_quota_recovery *qrec;
1205 enum ocfs2_orphan_reco_type orphan_reco_type;
1198 LIST_HEAD(tmp_la_list); 1206 LIST_HEAD(tmp_la_list);
1199 1207
1200 trace_ocfs2_complete_recovery( 1208 trace_ocfs2_complete_recovery(
@@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1212 la_dinode = item->lri_la_dinode; 1220 la_dinode = item->lri_la_dinode;
1213 tl_dinode = item->lri_tl_dinode; 1221 tl_dinode = item->lri_tl_dinode;
1214 qrec = item->lri_qrec; 1222 qrec = item->lri_qrec;
1223 orphan_reco_type = item->lri_orphan_reco_type;
1215 1224
1216 trace_ocfs2_complete_recovery_slot(item->lri_slot, 1225 trace_ocfs2_complete_recovery_slot(item->lri_slot,
1217 la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, 1226 la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
1236 kfree(tl_dinode); 1245 kfree(tl_dinode);
1237 } 1246 }
1238 1247
1239 ret = ocfs2_recover_orphans(osb, item->lri_slot); 1248 ret = ocfs2_recover_orphans(osb, item->lri_slot,
1249 orphan_reco_type);
1240 if (ret < 0) 1250 if (ret < 0)
1241 mlog_errno(ret); 1251 mlog_errno(ret);
1242 1252
@@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1261 int slot_num, 1271 int slot_num,
1262 struct ocfs2_dinode *la_dinode, 1272 struct ocfs2_dinode *la_dinode,
1263 struct ocfs2_dinode *tl_dinode, 1273 struct ocfs2_dinode *tl_dinode,
1264 struct ocfs2_quota_recovery *qrec) 1274 struct ocfs2_quota_recovery *qrec,
1275 enum ocfs2_orphan_reco_type orphan_reco_type)
1265{ 1276{
1266 struct ocfs2_la_recovery_item *item; 1277 struct ocfs2_la_recovery_item *item;
1267 1278
@@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1285 item->lri_slot = slot_num; 1296 item->lri_slot = slot_num;
1286 item->lri_tl_dinode = tl_dinode; 1297 item->lri_tl_dinode = tl_dinode;
1287 item->lri_qrec = qrec; 1298 item->lri_qrec = qrec;
1299 item->lri_orphan_reco_type = orphan_reco_type;
1288 1300
1289 spin_lock(&journal->j_lock); 1301 spin_lock(&journal->j_lock);
1290 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1302 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1304 /* No need to queue up our truncate_log as regular cleanup will catch 1316 /* No need to queue up our truncate_log as regular cleanup will catch
1305 * that */ 1317 * that */
1306 ocfs2_queue_recovery_completion(journal, osb->slot_num, 1318 ocfs2_queue_recovery_completion(journal, osb->slot_num,
1307 osb->local_alloc_copy, NULL, NULL); 1319 osb->local_alloc_copy, NULL, NULL,
1320 ORPHAN_NEED_TRUNCATE);
1308 ocfs2_schedule_truncate_log_flush(osb, 0); 1321 ocfs2_schedule_truncate_log_flush(osb, 0);
1309 1322
1310 osb->local_alloc_copy = NULL; 1323 osb->local_alloc_copy = NULL;
@@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1312 1325
1313 /* queue to recover orphan slots for all offline slots */ 1326 /* queue to recover orphan slots for all offline slots */
1314 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1327 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1315 ocfs2_queue_replay_slots(osb); 1328 ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
1316 ocfs2_free_replay_slots(osb); 1329 ocfs2_free_replay_slots(osb);
1317} 1330}
1318 1331
@@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
1323 osb->slot_num, 1336 osb->slot_num,
1324 NULL, 1337 NULL,
1325 NULL, 1338 NULL,
1326 osb->quota_rec); 1339 osb->quota_rec,
1340 ORPHAN_NEED_TRUNCATE);
1327 osb->quota_rec = NULL; 1341 osb->quota_rec = NULL;
1328 } 1342 }
1329} 1343}
@@ -1360,7 +1374,7 @@ restart:
1360 1374
1361 /* queue recovery for our own slot */ 1375 /* queue recovery for our own slot */
1362 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 1376 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1363 NULL, NULL); 1377 NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
1364 1378
1365 spin_lock(&osb->osb_lock); 1379 spin_lock(&osb->osb_lock);
1366 while (rm->rm_used) { 1380 while (rm->rm_used) {
@@ -1419,13 +1433,14 @@ skip_recovery:
1419 continue; 1433 continue;
1420 } 1434 }
1421 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], 1435 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
1422 NULL, NULL, qrec); 1436 NULL, NULL, qrec,
1437 ORPHAN_NEED_TRUNCATE);
1423 } 1438 }
1424 1439
1425 ocfs2_super_unlock(osb, 1); 1440 ocfs2_super_unlock(osb, 1);
1426 1441
1427 /* queue recovery for offline slots */ 1442 /* queue recovery for offline slots */
1428 ocfs2_queue_replay_slots(osb); 1443 ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
1429 1444
1430bail: 1445bail:
1431 mutex_lock(&osb->recovery_lock); 1446 mutex_lock(&osb->recovery_lock);
@@ -1447,7 +1462,6 @@ bail:
1447 * requires that we call do_exit(). And it isn't exported, but 1462 * requires that we call do_exit(). And it isn't exported, but
1448 * complete_and_exit() seems to be a minimal wrapper around it. */ 1463 * complete_and_exit() seems to be a minimal wrapper around it. */
1449 complete_and_exit(NULL, status); 1464 complete_and_exit(NULL, status);
1450 return status;
1451} 1465}
1452 1466
1453void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) 1467void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
@@ -1712,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1712 1726
1713 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1727 /* This will kfree the memory pointed to by la_copy and tl_copy */
1714 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1728 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1715 tl_copy, NULL); 1729 tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
1716 1730
1717 status = 0; 1731 status = 0;
1718done: 1732done:
@@ -1902,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1902 1916
1903 for (i = 0; i < osb->max_slots; i++) 1917 for (i = 0; i < osb->max_slots; i++)
1904 ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, 1918 ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
1905 NULL); 1919 NULL, ORPHAN_NO_NEED_TRUNCATE);
1906 /* 1920 /*
1907 * We queued a recovery on orphan slots, increment the sequence 1921 * We queued a recovery on orphan slots, increment the sequence
1908 * number and update LVB so other node will skip the scan for a while 1922 * number and update LVB so other node will skip the scan for a while
@@ -2001,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
2001 if (IS_ERR(iter)) 2015 if (IS_ERR(iter))
2002 return 0; 2016 return 0;
2003 2017
2018 /* Skip inodes which have already been added to the recovery list,
2019 * since dio may happen concurrently with unlink/rename */
2020 if (OCFS2_I(iter)->ip_next_orphan) {
2021 iput(iter);
2022 return 0;
2023 }
2024
2004 trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); 2025 trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
2005 /* No locking is required for the next_orphan queue as there 2026 /* No locking is required for the next_orphan queue as there
2006 * is only ever a single process doing orphan recovery. */ 2027 * is only ever a single process doing orphan recovery. */
@@ -2109,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
2109 * advertising our state to ocfs2_delete_inode(). 2130 * advertising our state to ocfs2_delete_inode().
2110 */ 2131 */
2111static int ocfs2_recover_orphans(struct ocfs2_super *osb, 2132static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2112 int slot) 2133 int slot,
2134 enum ocfs2_orphan_reco_type orphan_reco_type)
2113{ 2135{
2114 int ret = 0; 2136 int ret = 0;
2115 struct inode *inode = NULL; 2137 struct inode *inode = NULL;
@@ -2133,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2133 (unsigned long long)oi->ip_blkno); 2155 (unsigned long long)oi->ip_blkno);
2134 2156
2135 iter = oi->ip_next_orphan; 2157 iter = oi->ip_next_orphan;
2158 oi->ip_next_orphan = NULL;
2159
2160 /*
2161 * We need to take and drop the inode lock to force
2162 * the inode to be re-read from disk.
2163 */
2164 ret = ocfs2_inode_lock(inode, NULL, 0);
2165 if (ret) {
2166 mlog_errno(ret);
2167 goto next;
2168 }
2169 ocfs2_inode_unlock(inode, 0);
2170
2171 if (inode->i_nlink == 0) {
2172 spin_lock(&oi->ip_lock);
2173 /* Set the proper information to get us going into
2174 * ocfs2_delete_inode. */
2175 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2176 spin_unlock(&oi->ip_lock);
2177 } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
2178 struct buffer_head *di_bh = NULL;
2179
2180 ret = ocfs2_rw_lock(inode, 1);
2181 if (ret) {
2182 mlog_errno(ret);
2183 goto next;
2184 }
2185
2186 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2187 if (ret < 0) {
2188 ocfs2_rw_unlock(inode, 1);
2189 mlog_errno(ret);
2190 goto next;
2191 }
2192
2193 ret = ocfs2_truncate_file(inode, di_bh,
2194 i_size_read(inode));
2195 ocfs2_inode_unlock(inode, 1);
2196 ocfs2_rw_unlock(inode, 1);
2197 brelse(di_bh);
2198 if (ret < 0) {
2199 if (ret != -ENOSPC)
2200 mlog_errno(ret);
2201 goto next;
2202 }
2203
2204 ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
2205 if (ret)
2206 mlog_errno(ret);
2136 2207
2137 spin_lock(&oi->ip_lock); 2208 wake_up(&OCFS2_I(inode)->append_dio_wq);
2138 /* Set the proper information to get us going into 2209 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
2139 * ocfs2_delete_inode. */
2140 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2141 spin_unlock(&oi->ip_lock);
2142 2210
2211next:
2143 iput(inode); 2212 iput(inode);
2144 2213
2145 inode = iter; 2214 inode = iter;
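Note: orphan recovery now distinguishes the two cases via
ocfs2_orphan_reco_type; the handshake with an appending writer set up
above reduces to (both sides taken from this series):

	/* writer side (ocfs2_add_inode_to_orphan(), namei.c below):
	 * park until recovery clears OCFS2_DIO_ORPHANED_FL */
	wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
			ocfs2_dio_orphan_recovered(inode),
			msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));

	/* recovery side (ocfs2_recover_orphans() above): truncate the
	 * half-written tail, drop the orphan entry, then */
	wake_up(&OCFS2_I(inode)->append_dio_wq);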
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7f8cde94abfe..f4cd3c3e9fb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
472 * orphan dir index leaf */ 472 * orphan dir index leaf */
473#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) 473#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
474 474
475/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
476 * orphan dir index root + orphan dir index leaf */
477#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
478#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
479
475/* dinode update, old dir dinode update, new dir dinode update, old 480/* dinode update, old dir dinode update, new dir dinode update, old
476 * dir dir entry, new dir dir entry, dir entry update for renaming 481 * dir dir entry, new dir dir entry, dir entry update for renaming
477 * directory + target unlink + 3 x dir index leaves */ 482 * directory + target unlink + 3 x dir index leaves */
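Note: the new credit macros cover the block updates listed in the
comment; a usage sketch (mirrors ocfs2_add_inode_to_orphan() in the
namei.c diff below):

	handle_t *handle = ocfs2_start_trans(osb,
				OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... journaled orphan-dir and dinode updates ... */
	ocfs2_commit_trans(osb, handle);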
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 10d66c75cecb..9581d190f6e1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ out:
173static const struct vm_operations_struct ocfs2_file_vm_ops = { 173static const struct vm_operations_struct ocfs2_file_vm_ops = {
174 .fault = ocfs2_fault, 174 .fault = ocfs2_fault,
175 .page_mkwrite = ocfs2_page_mkwrite, 175 .page_mkwrite = ocfs2_page_mkwrite,
176 .remap_pages = generic_file_remap_pages,
177}; 176};
178 177
179int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 178int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 914c121ec890..b5c3a5ea3ee6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
79 struct inode **ret_orphan_dir, 79 struct inode **ret_orphan_dir,
80 u64 blkno, 80 u64 blkno,
81 char *name, 81 char *name,
82 struct ocfs2_dir_lookup_result *lookup); 82 struct ocfs2_dir_lookup_result *lookup,
83 bool dio);
83 84
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 85static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 86 handle_t *handle,
@@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
87 struct buffer_head *fe_bh, 88 struct buffer_head *fe_bh,
88 char *name, 89 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 90 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 91 struct inode *orphan_dir_inode,
92 bool dio);
91 93
92static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 94static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
93 handle_t *handle, 95 handle_t *handle,
@@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
104static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); 106static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
105/* An orphan dir name is an 8 byte value, printed as a hex string */ 107/* An orphan dir name is an 8 byte value, printed as a hex string */
106#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 108#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
109#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
110#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
107 111
108static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, 112static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
109 unsigned int flags) 113 unsigned int flags)
@@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir,
952 if (ocfs2_inode_is_unlinkable(inode)) { 956 if (ocfs2_inode_is_unlinkable(inode)) {
953 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 957 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
954 OCFS2_I(inode)->ip_blkno, 958 OCFS2_I(inode)->ip_blkno,
955 orphan_name, &orphan_insert); 959 orphan_name, &orphan_insert,
960 false);
956 if (status < 0) { 961 if (status < 0) {
957 mlog_errno(status); 962 mlog_errno(status);
958 goto leave; 963 goto leave;
@@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir,
1004 1009
1005 if (is_unlinkable) { 1010 if (is_unlinkable) {
1006 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, 1011 status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
1007 orphan_name, &orphan_insert, orphan_dir); 1012 orphan_name, &orphan_insert, orphan_dir, false);
1008 if (status < 0) 1013 if (status < 0)
1009 mlog_errno(status); 1014 mlog_errno(status);
1010 } 1015 }
@@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir,
1440 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1445 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1441 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1446 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1442 OCFS2_I(new_inode)->ip_blkno, 1447 OCFS2_I(new_inode)->ip_blkno,
1443 orphan_name, &orphan_insert); 1448 orphan_name, &orphan_insert,
1449 false);
1444 if (status < 0) { 1450 if (status < 0) {
1445 mlog_errno(status); 1451 mlog_errno(status);
1446 goto bail; 1452 goto bail;
@@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir,
1507 if (should_add_orphan) { 1513 if (should_add_orphan) {
1508 status = ocfs2_orphan_add(osb, handle, new_inode, 1514 status = ocfs2_orphan_add(osb, handle, new_inode,
1509 newfe_bh, orphan_name, 1515 newfe_bh, orphan_name,
1510 &orphan_insert, orphan_dir); 1516 &orphan_insert, orphan_dir, false);
1511 if (status < 0) { 1517 if (status < 0) {
1512 mlog_errno(status); 1518 mlog_errno(status);
1513 goto bail; 1519 goto bail;
@@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2088 struct buffer_head *orphan_dir_bh, 2094 struct buffer_head *orphan_dir_bh,
2089 u64 blkno, 2095 u64 blkno,
2090 char *name, 2096 char *name,
2091 struct ocfs2_dir_lookup_result *lookup) 2097 struct ocfs2_dir_lookup_result *lookup,
2098 bool dio)
2092{ 2099{
2093 int ret; 2100 int ret;
2094 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); 2101 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
2102 int namelen = dio ?
2103 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2104 OCFS2_ORPHAN_NAMELEN;
2105
2106 if (dio) {
2107 ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2108 OCFS2_DIO_ORPHAN_PREFIX);
2109 if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2110 ret = -EINVAL;
2111 mlog_errno(ret);
2112 return ret;
2113 }
2095 2114
2096 ret = ocfs2_blkno_stringify(blkno, name); 2115 ret = ocfs2_blkno_stringify(blkno,
2116 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2117 } else
2118 ret = ocfs2_blkno_stringify(blkno, name);
2097 if (ret < 0) { 2119 if (ret < 0) {
2098 mlog_errno(ret); 2120 mlog_errno(ret);
2099 return ret; 2121 return ret;
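Note: for illustration, the entry name an append-dio orphan gets; a
sketch assuming ocfs2_blkno_stringify() prints the block number as
OCFS2_ORPHAN_NAMELEN (16) lower-case hex digits, as the namelen
arithmetic above implies:

	/* e.g. blkno 0x1234 -> "dio-0000000000001234" */
	char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];

	snprintf(name, sizeof(name), "%s%016llx", OCFS2_DIO_ORPHAN_PREFIX,
		 (unsigned long long)blkno);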
@@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2101 2123
2102 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 2124 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2103 orphan_dir_bh, name, 2125 orphan_dir_bh, name,
2104 OCFS2_ORPHAN_NAMELEN, lookup); 2126 namelen, lookup);
2105 if (ret < 0) { 2127 if (ret < 0) {
2106 mlog_errno(ret); 2128 mlog_errno(ret);
2107 return ret; 2129 return ret;
@@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2128 struct inode **ret_orphan_dir, 2150 struct inode **ret_orphan_dir,
2129 u64 blkno, 2151 u64 blkno,
2130 char *name, 2152 char *name,
2131 struct ocfs2_dir_lookup_result *lookup) 2153 struct ocfs2_dir_lookup_result *lookup,
2154 bool dio)
2132{ 2155{
2133 struct inode *orphan_dir_inode = NULL; 2156 struct inode *orphan_dir_inode = NULL;
2134 struct buffer_head *orphan_dir_bh = NULL; 2157 struct buffer_head *orphan_dir_bh = NULL;
@@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2142 } 2165 }
2143 2166
2144 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, 2167 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
2145 blkno, name, lookup); 2168 blkno, name, lookup, dio);
2146 if (ret < 0) { 2169 if (ret < 0) {
2147 mlog_errno(ret); 2170 mlog_errno(ret);
2148 goto out; 2171 goto out;
@@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2170 struct buffer_head *fe_bh, 2193 struct buffer_head *fe_bh,
2171 char *name, 2194 char *name,
2172 struct ocfs2_dir_lookup_result *lookup, 2195 struct ocfs2_dir_lookup_result *lookup,
2173 struct inode *orphan_dir_inode) 2196 struct inode *orphan_dir_inode,
2197 bool dio)
2174{ 2198{
2175 struct buffer_head *orphan_dir_bh = NULL; 2199 struct buffer_head *orphan_dir_bh = NULL;
2176 int status = 0; 2200 int status = 0;
2177 struct ocfs2_dinode *orphan_fe; 2201 struct ocfs2_dinode *orphan_fe;
2178 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 2202 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
2203 int namelen = dio ?
2204 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2205 OCFS2_ORPHAN_NAMELEN;
2179 2206
2180 trace_ocfs2_orphan_add_begin( 2207 trace_ocfs2_orphan_add_begin(
2181 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2208 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2219 ocfs2_journal_dirty(handle, orphan_dir_bh); 2246 ocfs2_journal_dirty(handle, orphan_dir_bh);
2220 2247
2221 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2248 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2222 OCFS2_ORPHAN_NAMELEN, inode, 2249 namelen, inode,
2223 OCFS2_I(inode)->ip_blkno, 2250 OCFS2_I(inode)->ip_blkno,
2224 orphan_dir_bh, lookup); 2251 orphan_dir_bh, lookup);
2225 if (status < 0) { 2252 if (status < 0) {
@@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2227 goto rollback; 2254 goto rollback;
2228 } 2255 }
2229 2256
2230 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); 2257 if (dio) {
2231 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2258 /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
2259 * slot.
2260 */
2261 fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2262 fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
2263 } else {
2264 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2265 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2232 2266
2233 /* Record which orphan dir our inode now resides 2267 /* Record which orphan dir our inode now resides
2234 * in. delete_inode will use this to determine which orphan 2268 * in. delete_inode will use this to determine which orphan
2235 * dir to lock. */ 2269 * dir to lock. */
2236 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 2270 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2271 }
2237 2272
2238 ocfs2_journal_dirty(handle, fe_bh); 2273 ocfs2_journal_dirty(handle, fe_bh);
2239 2274
@@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2258 handle_t *handle, 2293 handle_t *handle,
2259 struct inode *orphan_dir_inode, 2294 struct inode *orphan_dir_inode,
2260 struct inode *inode, 2295 struct inode *inode,
2261 struct buffer_head *orphan_dir_bh) 2296 struct buffer_head *orphan_dir_bh,
2297 bool dio)
2262{ 2298{
2263 char name[OCFS2_ORPHAN_NAMELEN + 1]; 2299 const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
2300 char name[namelen + 1];
2264 struct ocfs2_dinode *orphan_fe; 2301 struct ocfs2_dinode *orphan_fe;
2265 int status = 0; 2302 int status = 0;
2266 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2303 struct ocfs2_dir_lookup_result lookup = { NULL, };
2267 2304
2268 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2305 if (dio) {
2306 status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2307 OCFS2_DIO_ORPHAN_PREFIX);
2308 if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2309 status = -EINVAL;
2310 mlog_errno(status);
2311 return status;
2312 }
2313
2314 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
2315 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2316 } else
2317 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2269 if (status < 0) { 2318 if (status < 0) {
2270 mlog_errno(status); 2319 mlog_errno(status);
2271 goto leave; 2320 goto leave;
@@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2273 2322
2274 trace_ocfs2_orphan_del( 2323 trace_ocfs2_orphan_del(
2275 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2276 name, OCFS2_ORPHAN_NAMELEN); 2325 name, namelen);
2277 2326
2278 /* find its spot in the orphan directory */ 2327 /* find its spot in the orphan directory */
2279 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, 2328 status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
2280 &lookup); 2329 &lookup);
2281 if (status) { 2330 if (status) {
2282 mlog_errno(status); 2331 mlog_errno(status);
@@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2376 } 2425 }
2377 2426
2378 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, 2427 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2379 di_blkno, orphan_name, orphan_insert); 2428 di_blkno, orphan_name, orphan_insert,
2429 false);
2380 if (ret < 0) { 2430 if (ret < 0) {
2381 mlog_errno(ret); 2431 mlog_errno(ret);
2382 goto out; 2432 goto out;
@@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2482 2532
2483 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2533 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2484 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, 2534 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2485 &orphan_insert, orphan_dir); 2535 &orphan_insert, orphan_dir, false);
2486 if (status < 0) { 2536 if (status < 0) {
2487 mlog_errno(status); 2537 mlog_errno(status);
2488 goto leave; 2538 goto leave;
@@ -2527,6 +2577,186 @@ leave:
2527 return status; 2577 return status;
2528} 2578}
2529 2579
2580static int ocfs2_dio_orphan_recovered(struct inode *inode)
2581{
2582 int ret;
2583 struct buffer_head *di_bh = NULL;
2584 struct ocfs2_dinode *di = NULL;
2585
2586 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2587 if (ret < 0) {
2588 mlog_errno(ret);
2589 return 0;
2590 }
2591
2592 di = (struct ocfs2_dinode *) di_bh->b_data;
2593 ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
2594 ocfs2_inode_unlock(inode, 1);
2595 brelse(di_bh);
2596
2597 return ret;
2598}
2599
2600#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
2601int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2602 struct inode *inode)
2603{
2604 char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
2605 struct inode *orphan_dir_inode = NULL;
2606 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2607 struct buffer_head *di_bh = NULL;
2608 int status = 0;
2609 handle_t *handle = NULL;
2610 struct ocfs2_dinode *di = NULL;
2611
2612restart:
2613 status = ocfs2_inode_lock(inode, &di_bh, 1);
2614 if (status < 0) {
2615 mlog_errno(status);
2616 goto bail;
2617 }
2618
2619 di = (struct ocfs2_dinode *) di_bh->b_data;
2620 /*
2621 * Another append dio crashed?
2622 * If so, wait for recovery first.
2623 */
2624 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2625 ocfs2_inode_unlock(inode, 1);
2626 brelse(di_bh);
2627 wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
2628 ocfs2_dio_orphan_recovered(inode),
2629 msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
2630 goto restart;
2631 }
2632
2633 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
2634 OCFS2_I(inode)->ip_blkno,
2635 orphan_name,
2636 &orphan_insert,
2637 true);
2638 if (status < 0) {
2639 mlog_errno(status);
2640 goto bail_unlock_inode;
2641 }
2642
2643 handle = ocfs2_start_trans(osb,
2644 OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
2645 if (IS_ERR(handle)) {
2646 status = PTR_ERR(handle);
2647 goto bail_unlock_orphan;
2648 }
2649
2650 status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
2651 &orphan_insert, orphan_dir_inode, true);
2652 if (status)
2653 mlog_errno(status);
2654
2655 ocfs2_commit_trans(osb, handle);
2656
2657bail_unlock_orphan:
2658 ocfs2_inode_unlock(orphan_dir_inode, 1);
2659 mutex_unlock(&orphan_dir_inode->i_mutex);
2660 iput(orphan_dir_inode);
2661
2662 ocfs2_free_dir_lookup_result(&orphan_insert);
2663
2664bail_unlock_inode:
2665 ocfs2_inode_unlock(inode, 1);
2666 brelse(di_bh);
2667
2668bail:
2669 return status;
2670}
2671
2672int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2673 struct inode *inode, int update_isize,
2674 loff_t end)
2675{
2676 struct inode *orphan_dir_inode = NULL;
2677 struct buffer_head *orphan_dir_bh = NULL;
2678 struct buffer_head *di_bh = NULL;
2679 struct ocfs2_dinode *di = NULL;
2680 handle_t *handle = NULL;
2681 int status = 0;
2682
2683 status = ocfs2_inode_lock(inode, &di_bh, 1);
2684 if (status < 0) {
2685 mlog_errno(status);
2686 goto bail;
2687 }
2688 di = (struct ocfs2_dinode *) di_bh->b_data;
2689
2690 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2691 ORPHAN_DIR_SYSTEM_INODE,
2692 le16_to_cpu(di->i_dio_orphaned_slot));
2693 if (!orphan_dir_inode) {
2694 status = -ENOENT;
2695 mlog_errno(status);
2696 goto bail_unlock_inode;
2697 }
2698
2699 mutex_lock(&orphan_dir_inode->i_mutex);
2700 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2701 if (status < 0) {
2702 mutex_unlock(&orphan_dir_inode->i_mutex);
2703 iput(orphan_dir_inode);
2704 mlog_errno(status);
2705 goto bail_unlock_inode;
2706 }
2707
2708 handle = ocfs2_start_trans(osb,
2709 OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
2710 if (IS_ERR(handle)) {
2711 status = PTR_ERR(handle);
2712 goto bail_unlock_orphan;
2713 }
2714
2715 BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
2716
2717 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
2718 inode, orphan_dir_bh, true);
2719 if (status < 0) {
2720 mlog_errno(status);
2721 goto bail_commit;
2722 }
2723
2724 status = ocfs2_journal_access_di(handle,
2725 INODE_CACHE(inode),
2726 di_bh,
2727 OCFS2_JOURNAL_ACCESS_WRITE);
2728 if (status < 0) {
2729 mlog_errno(status);
2730 goto bail_commit;
2731 }
2732
2733 di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2734 di->i_dio_orphaned_slot = 0;
2735
2736 if (update_isize) {
2737 status = ocfs2_set_inode_size(handle, inode, di_bh, end);
2738 if (status)
2739 mlog_errno(status);
2740 } else
2741 ocfs2_journal_dirty(handle, di_bh);
2742
2743bail_commit:
2744 ocfs2_commit_trans(osb, handle);
2745
2746bail_unlock_orphan:
2747 ocfs2_inode_unlock(orphan_dir_inode, 1);
2748 mutex_unlock(&orphan_dir_inode->i_mutex);
2749 brelse(orphan_dir_bh);
2750 iput(orphan_dir_inode);
2751
2752bail_unlock_inode:
2753 ocfs2_inode_unlock(inode, 1);
2754 brelse(di_bh);
2755
2756bail:
2757 return status;
2758}
2759
2530int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 2760int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2531 struct inode *inode, 2761 struct inode *inode,
2532 struct dentry *dentry) 2762 struct dentry *dentry)
@@ -2615,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2615 } 2845 }
2616 2846
2617 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 2847 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
2618 orphan_dir_bh); 2848 orphan_dir_bh, false);
2619 if (status < 0) { 2849 if (status < 0) {
2620 mlog_errno(status); 2850 mlog_errno(status);
2621 goto out_commit; 2851 goto out_commit;
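Note: putting the two new entry points together, the intended
append-dio sequence is roughly the following (hypothetical caller;
the helpers are the ones added above):

	/* orphan the inode before extending it with O_DIRECT, so a
	 * crash mid-write leaves something recovery can truncate */
	ret = ocfs2_add_inode_to_orphan(osb, inode);
	if (ret < 0)
		return ret;

	written = generic_file_direct_write(iocb, from, *ppos);

	/* un-orphan; on success also push i_size out to the new end */
	ret = ocfs2_del_inode_from_orphan(osb, inode,
					  written > 0 /* update_isize */,
					  *ppos + written);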
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e5d059d4f115..5ddecce172fa 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
34 handle_t *handle, 34 handle_t *handle,
35 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
36 struct inode *inode, 36 struct inode *inode,
37 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh,
38 bool dio);
38int ocfs2_create_inode_in_orphan(struct inode *dir, 39int ocfs2_create_inode_in_orphan(struct inode *dir,
39 int mode, 40 int mode,
40 struct inode **new_inode); 41 struct inode **new_inode);
42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
43 struct inode *inode);
44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
45 struct inode *inode, int update_isize,
46 loff_t end);
41int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
42 struct inode *new_inode, 48 struct inode *new_inode,
43 struct dentry *new_dentry); 49 struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7d6b7d090452..8490c64d34fe 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,11 @@ struct ocfs2_lock_res {
209#endif 209#endif
210}; 210};
211 211
212enum ocfs2_orphan_reco_type {
213 ORPHAN_NO_NEED_TRUNCATE = 0,
214 ORPHAN_NEED_TRUNCATE,
215};
216
212enum ocfs2_orphan_scan_state { 217enum ocfs2_orphan_scan_state {
213 ORPHAN_SCAN_ACTIVE, 218 ORPHAN_SCAN_ACTIVE,
214 ORPHAN_SCAN_INACTIVE 219 ORPHAN_SCAN_INACTIVE
@@ -279,6 +284,8 @@ enum ocfs2_mount_options
279 writes */ 284 writes */
280 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ 285 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
281 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
287
288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
282}; 289};
283 290
284#define OCFS2_OSB_SOFT_RO 0x0001 291#define OCFS2_OSB_SOFT_RO 0x0001
@@ -493,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
493 return 0; 500 return 0;
494} 501}
495 502
503static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
504{
505 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
506 return 1;
507 return 0;
508}
509
510
496static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) 511static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
497{ 512{
498 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) 513 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -724,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
724 return clusters; 739 return clusters;
725} 740}
726 741
742static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
743 u64 bytes)
744{
745 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
746 unsigned int clusters;
747
748 clusters = (unsigned int)(bytes >> cl_bits);
749 return clusters;
750}
751
727static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, 752static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
728 u64 bytes) 753 u64 bytes)
729{ 754{
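Note: unlike the existing ocfs2_clusters_for_bytes(), which rounds up,
the new helper truncates; a worked example (sb hypothetical):

	/* with s_clustersize_bits == 20, i.e. 1 MiB clusters:
	 *   ocfs2_bytes_to_clusters(sb, (5ULL << 20) + 1)  == 5
	 *   ocfs2_clusters_for_bytes(sb, (5ULL << 20) + 1) == 6
	 */
	unsigned int c = ocfs2_bytes_to_clusters(sb, (5ULL << 20) + 1);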
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a10d5d..20e37a3ed26f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -105,7 +105,8 @@
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) 105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
109 | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
109 110
110/* 111/*
111 * Heartbeat-only devices are missing journals and other files. The 112 * Heartbeat-only devices are missing journals and other files. The
@@ -199,6 +200,11 @@
199#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 200#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
200#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 201#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
201 202
203/*
204 * Append Direct IO support
205 */
206#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
207
202/* The byte offset of the first backup block will be 1G. 208/* The byte offset of the first backup block will be 1G.
203 * The following will be 4G, 16G, 64G, 256G and 1T. 209 * The following will be 4G, 16G, 64G, 256G and 1T.
204 */ 210 */
@@ -229,6 +235,8 @@
229#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 235#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
230#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 236#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
231#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ 237#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
238#define OCFS2_DIO_ORPHANED_FL (0x00002000) /* On the orphan list specifically
239 * for dio */
232 240
233/* 241/*
234 * Flags on ocfs2_dinode.i_dyn_features 242 * Flags on ocfs2_dinode.i_dyn_features
@@ -729,7 +737,9 @@ struct ocfs2_dinode {
729 inode belongs to. Only valid 737 inode belongs to. Only valid
730 if allocated from a 738 if allocated from a
731 discontiguous block group */ 739 discontiguous block group */
732/*A0*/ __le64 i_reserved2[3]; 740/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */
741 __le16 i_reserved1[3];
742 __le64 i_reserved2[2];
733/*B8*/ union { 743/*B8*/ union {
734 __le64 i_pad1; /* Generic way to refer to this 744 __le64 i_pad1; /* Generic way to refer to this
735 64bit union */ 745 64bit union */
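Note: the reworked reserved area at offset 0xA0 stays 24 bytes, so the
on-disk dinode layout is unchanged; a compile-time check (sketch, to
be placed inside any function):

	/* old: 3 * sizeof(__le64)                        = 24
	 * new: 4 * sizeof(__le16) + 2 * sizeof(__le64)
	 *      = 8 + 16                                  = 24 */
	BUILD_BUG_ON(4 * sizeof(__le16) + 2 * sizeof(__le64) !=
		     3 * sizeof(__le64));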
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 1eae330193a6..b6d51333ad02 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -48,6 +48,7 @@ struct ocfs2_quota_recovery {
48/* In-memory structure with quota header information */ 48/* In-memory structure with quota header information */
49struct ocfs2_mem_dqinfo { 49struct ocfs2_mem_dqinfo {
50 unsigned int dqi_type; /* Quota type this structure describes */ 50 unsigned int dqi_type; /* Quota type this structure describes */
51 unsigned int dqi_flags; /* Flags OLQF_* */
51 unsigned int dqi_chunks; /* Number of chunks in local quota file */ 52 unsigned int dqi_chunks; /* Number of chunks in local quota file */
52 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ 53 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
53 unsigned int dqi_syncms; /* How often should we sync with other nodes */ 54 unsigned int dqi_syncms; /* How often should we sync with other nodes */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 10b653930ee2..3d0b63d34225 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
73 ol_dqblk_block_off(sb, c, off); 73 ol_dqblk_block_off(sb, c, off);
74} 74}
75 75
76/* Compute block number from given offset */
77static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
78{
79 return off >> sb->s_blocksize_bits;
80}
81
82static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) 76static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
83{ 77{
84 return off & ((1 << sb->s_blocksize_bits) - 1); 78 return off & ((1 << sb->s_blocksize_bits) - 1);
@@ -292,7 +286,7 @@ static void olq_update_info(struct buffer_head *bh, void *private)
292 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 286 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
293 OCFS2_LOCAL_INFO_OFF); 287 OCFS2_LOCAL_INFO_OFF);
294 spin_lock(&dq_data_lock); 288 spin_lock(&dq_data_lock);
295 ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 289 ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
296 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); 290 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
297 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); 291 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
298 spin_unlock(&dq_data_lock); 292 spin_unlock(&dq_data_lock);
@@ -701,8 +695,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
701 /* We don't need the lock and we have to acquire quota file locks 695 /* We don't need the lock and we have to acquire quota file locks
702 * which will later depend on this lock */ 696 * which will later depend on this lock */
703 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 697 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
704 info->dqi_maxblimit = 0x7fffffffffffffffLL; 698 info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
705 info->dqi_maxilimit = 0x7fffffffffffffffLL; 699 info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
706 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); 700 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
707 if (!oinfo) { 701 if (!oinfo) {
708 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" 702 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
@@ -737,13 +731,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
737 } 731 }
738 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 732 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
739 OCFS2_LOCAL_INFO_OFF); 733 OCFS2_LOCAL_INFO_OFF);
740 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 734 oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
741 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 735 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
742 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 736 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
743 oinfo->dqi_libh = bh; 737 oinfo->dqi_libh = bh;
744 738
745 /* Did we crash while using the local quota file? */ 739 /* Did we crash while using the local quota file? */
746 if (!(info->dqi_flags & OLQF_CLEAN)) { 740 if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
747 rec = OCFS2_SB(sb)->quota_rec; 741 rec = OCFS2_SB(sb)->quota_rec;
748 if (!rec) { 742 if (!rec) {
749 rec = ocfs2_alloc_quota_recovery(); 743 rec = ocfs2_alloc_quota_recovery();
@@ -772,7 +766,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
772 } 766 }
773 767
774 /* Now mark quota file as used */ 768 /* Now mark quota file as used */
775 info->dqi_flags &= ~OLQF_CLEAN; 769 oinfo->dqi_flags &= ~OLQF_CLEAN;
776 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); 770 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
777 if (status < 0) { 771 if (status < 0) {
778 mlog_errno(status); 772 mlog_errno(status);
@@ -857,7 +851,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
857 goto out; 851 goto out;
858 852
859 /* Mark local file as clean */ 853 /* Mark local file as clean */
860 info->dqi_flags |= OLQF_CLEAN; 854 oinfo->dqi_flags |= OLQF_CLEAN;
861 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 855 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
862 oinfo->dqi_libh, 856 oinfo->dqi_libh,
863 olq_update_info, 857 olq_update_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d81f6e2a97f5..ee541f92dab4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2428 get_bh(prev_bh); 2428 get_bh(prev_bh);
2429 } 2429 }
2430 2430
2431 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2432
2433 trace_ocfs2_calc_refcount_meta_credits_iterate( 2431 trace_ocfs2_calc_refcount_meta_credits_iterate(
2434 recs_add, (unsigned long long)cpos, clusters, 2432 recs_add, (unsigned long long)cpos, clusters,
2435 (unsigned long long)le64_to_cpu(rec.r_cpos), 2433 (unsigned long long)le64_to_cpu(rec.r_cpos),
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36c689c..6a348b0294ab 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -39,7 +39,7 @@
39#define OCFS2_CHECK_RESERVATIONS 39#define OCFS2_CHECK_RESERVATIONS
40#endif 40#endif
41 41
42DEFINE_SPINLOCK(resv_lock); 42static DEFINE_SPINLOCK(resv_lock);
43 43
44#define OCFS2_MIN_RESV_WINDOW_BITS 8 44#define OCFS2_MIN_RESV_WINDOW_BITS 8
45#define OCFS2_MAX_RESV_WINDOW_BITS 1024 45#define OCFS2_MAX_RESV_WINDOW_BITS 1024
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83723179e1ec..26675185b886 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -191,6 +191,7 @@ enum {
191 Opt_coherency_full, 191 Opt_coherency_full,
192 Opt_resv_level, 192 Opt_resv_level,
193 Opt_dir_resv_level, 193 Opt_dir_resv_level,
194 Opt_journal_async_commit,
194 Opt_err, 195 Opt_err,
195}; 196};
196 197
@@ -222,6 +223,7 @@ static const match_table_t tokens = {
222 {Opt_coherency_full, "coherency=full"}, 223 {Opt_coherency_full, "coherency=full"},
223 {Opt_resv_level, "resv_level=%u"}, 224 {Opt_resv_level, "resv_level=%u"},
224 {Opt_dir_resv_level, "dir_resv_level=%u"}, 225 {Opt_dir_resv_level, "dir_resv_level=%u"},
226 {Opt_journal_async_commit, "journal_async_commit"},
225 {Opt_err, NULL} 227 {Opt_err, NULL}
226}; 228};
227 229
@@ -1000,36 +1002,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
1000 } 1002 }
1001} 1003}
1002 1004
1003/* Handle quota on quotactl */
1004static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
1005{
1006 unsigned int feature[OCFS2_MAXQUOTAS] = {
1007 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1008 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
1009
1010 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1011 return -EINVAL;
1012
1013 return dquot_enable(sb_dqopt(sb)->files[type], type,
1014 format_id, DQUOT_LIMITS_ENABLED);
1015}
1016
1017/* Handle quota off quotactl */
1018static int ocfs2_quota_off(struct super_block *sb, int type)
1019{
1020 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
1021}
1022
1023static const struct quotactl_ops ocfs2_quotactl_ops = {
1024 .quota_on_meta = ocfs2_quota_on,
1025 .quota_off = ocfs2_quota_off,
1026 .quota_sync = dquot_quota_sync,
1027 .get_info = dquot_get_dqinfo,
1028 .set_info = dquot_set_dqinfo,
1029 .get_dqblk = dquot_get_dqblk,
1030 .set_dqblk = dquot_set_dqblk,
1031};
1032
1033static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 1005static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1034{ 1006{
1035 struct dentry *root; 1007 struct dentry *root;
@@ -1500,6 +1472,9 @@ static int ocfs2_parse_options(struct super_block *sb,
1500 option < OCFS2_MAX_RESV_LEVEL) 1472 option < OCFS2_MAX_RESV_LEVEL)
1501 mopt->dir_resv_level = option; 1473 mopt->dir_resv_level = option;
1502 break; 1474 break;
1475 case Opt_journal_async_commit:
1476 mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
1477 break;
1503 default: 1478 default:
1504 mlog(ML_ERROR, 1479 mlog(ML_ERROR,
1505 "Unrecognized mount option \"%s\" " 1480 "Unrecognized mount option \"%s\" "
@@ -1606,6 +1581,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1606 if (osb->osb_dir_resv_level != osb->osb_resv_level) 1581 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1607 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); 1582 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
1608 1583
1584 if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
1585 seq_printf(s, ",journal_async_commit");
1586
1609 return 0; 1587 return 0;
1610} 1588}
1611 1589
@@ -1768,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data)
1768 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1746 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1769 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1747 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1770 1748
1749 init_waitqueue_head(&oi->append_dio_wq);
1750
1771 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1751 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1772 &ocfs2_inode_caching_ops); 1752 &ocfs2_inode_caching_ops);
1773 1753
@@ -2079,7 +2059,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2079 sb->s_op = &ocfs2_sops; 2059 sb->s_op = &ocfs2_sops;
2080 sb->s_d_op = &ocfs2_dentry_ops; 2060 sb->s_d_op = &ocfs2_dentry_ops;
2081 sb->s_export_op = &ocfs2_export_ops; 2061 sb->s_export_op = &ocfs2_export_ops;
2082 sb->s_qcop = &ocfs2_quotactl_ops; 2062 sb->s_qcop = &dquot_quotactl_sysfile_ops;
2083 sb->dq_op = &ocfs2_quota_operations; 2063 sb->dq_op = &ocfs2_quota_operations;
2084 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 2064 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
2085 sb->s_xattr = ocfs2_xattr_handlers; 2065 sb->s_xattr = ocfs2_xattr_handlers;
@@ -2475,6 +2455,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2475 goto finally; 2455 goto finally;
2476 } 2456 }
2477 2457
2458 if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
2459 jbd2_journal_set_features(osb->journal->j_journal,
2460 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2461 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2462 else
2463 jbd2_journal_clear_features(osb->journal->j_journal,
2464 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2465 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2466
2478 if (dirty) { 2467 if (dirty) {
2479 /* recover my local alloc if we didn't unmount cleanly. */ 2468 /* recover my local alloc if we didn't unmount cleanly. */
2480 status = ocfs2_begin_local_alloc_recovery(osb, 2469 status = ocfs2_begin_local_alloc_recovery(osb,
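
Note on the ocfs2/super.c hunks above: the new journal_async_commit option maps directly onto jbd2's async-commit feature. JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT lets the commit block be submitted without waiting for the rest of the transaction to reach stable storage, relying on the paired JBD2_FEATURE_COMPAT_CHECKSUM to detect a torn transaction at recovery, which is why the two are set and cleared together in ocfs2_check_volume(). (Incidentally, the unchanged context at old line 1607 prints osb->osb_resv_level under the dir_resv_level= key; that looks like a pre-existing typo in ocfs2_show_options(), not something this patch introduces.) A minimal userspace illustration of the new option; device and mountpoint are placeholders:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* The option string is the one matched by the tokens table
             * added above; paths are illustrative only. */
            if (mount("/dev/sdb1", "/mnt/ocfs2", "ocfs2", 0,
                      "journal_async_commit") != 0)
                    perror("mount");
            return 0;
    }
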
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 662f8dee149f..85b190dc132f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5334,16 +5334,6 @@ out:
5334 return ret; 5334 return ret;
5335} 5335}
5336 5336
5337static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
5338 struct ocfs2_xattr_bucket *bucket,
5339 int offs)
5340{
5341 int block_off = offs >> inode->i_sb->s_blocksize_bits;
5342
5343 offs = offs % inode->i_sb->s_blocksize;
5344 return bucket_block(bucket, block_off) + offs;
5345}
5346
5347/* 5337/*
5348 * Truncate the specified xe_off entry in xattr bucket. 5338 * Truncate the specified xe_off entry in xattr bucket.
5349 * bucket is indicated by header_bh and len is the new length. 5339 * bucket is indicated by header_bh and len is the new length.
diff --git a/fs/open.c b/fs/open.c
index 813be037b412..33f9cbf2610b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
667{ 667{
668 /* NB: we're sure to have correct a_ops only after f_op->open */ 668 /* NB: we're sure to have correct a_ops only after f_op->open */
669 if (f->f_flags & O_DIRECT) { 669 if (f->f_flags & O_DIRECT) {
670 if (!f->f_mapping->a_ops || 670 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
671 ((!f->f_mapping->a_ops->direct_IO) &&
672 (!f->f_mapping->a_ops->get_xip_mem))) {
673 return -EINVAL; 671 return -EINVAL;
674 }
675 } 672 }
676 return 0; 673 return 0;
677} 674}
@@ -971,8 +968,14 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode)
971 */ 968 */
972struct file *filp_open(const char *filename, int flags, umode_t mode) 969struct file *filp_open(const char *filename, int flags, umode_t mode)
973{ 970{
974 struct filename name = {.name = filename}; 971 struct filename *name = getname_kernel(filename);
975 return file_open_name(&name, flags, mode); 972 struct file *file = ERR_CAST(name);
973
974 if (!IS_ERR(name)) {
975 file = file_open_name(name, flags, mode);
976 putname(name);
977 }
978 return file;
976} 979}
977EXPORT_SYMBOL(filp_open); 980EXPORT_SYMBOL(filp_open);
978 981
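
The filp_open() rewrite above exists because struct filename is no longer a thin wrapper around a string: routing through getname_kernel() means every struct filename is allocated and released by the same getname()/putname() pair, which the audit code relies on, whereas the old stack-local struct filename bypassed that. The caller-visible contract is unchanged; a sketch of a kernel-side caller, with an illustrative path:

    /* Illustrative fragment; only filp_open()'s internals changed. */
    struct file *filp = filp_open("/etc/example.conf", O_RDONLY, 0);

    if (IS_ERR(filp))
            return PTR_ERR(filp);
    /* ... use filp, e.g. via kernel_read() ... */
    fput(filp);
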
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bd117d065b82..1295a00ca316 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
81#include <linux/pid_namespace.h> 81#include <linux/pid_namespace.h>
82#include <linux/ptrace.h> 82#include <linux/ptrace.h>
83#include <linux/tracehook.h> 83#include <linux/tracehook.h>
84#include <linux/string_helpers.h>
84#include <linux/user_namespace.h> 85#include <linux/user_namespace.h>
85 86
86#include <asm/pgtable.h> 87#include <asm/pgtable.h>
@@ -89,39 +90,18 @@
89 90
90static inline void task_name(struct seq_file *m, struct task_struct *p) 91static inline void task_name(struct seq_file *m, struct task_struct *p)
91{ 92{
92 int i; 93 char *buf;
93 char *buf, *end;
94 char *name;
95 char tcomm[sizeof(p->comm)]; 94 char tcomm[sizeof(p->comm)];
96 95
97 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
98 97
99 seq_puts(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
100 end = m->buf + m->size;
101 buf = m->buf + m->count; 99 buf = m->buf + m->count;
102 name = tcomm; 100
103 i = sizeof(tcomm); 101 /* Ignore error for now */
104 while (i && (buf < end)) { 102 string_escape_str(tcomm, &buf, m->size - m->count,
105 unsigned char c = *name; 103 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
106 name++; 104
107 i--;
108 *buf = c;
109 if (!c)
110 break;
111 if (c == '\\') {
112 buf++;
113 if (buf < end)
114 *buf++ = c;
115 continue;
116 }
117 if (c == '\n') {
118 *buf++ = '\\';
119 if (buf < end)
120 *buf++ = 'n';
121 continue;
122 }
123 buf++;
124 }
125 m->count = buf - m->buf; 105 m->count = buf - m->buf;
126 seq_putc(m, '\n'); 106 seq_putc(m, '\n');
127} 107}
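
The hand-rolled loop above is replaced by string_escape_str() from <linux/string_helpers.h>. With ESCAPE_SPACE | ESCAPE_SPECIAL restricted by the only-list "\n\\", exactly two characters are rewritten: a newline becomes the two bytes '\' 'n' and a backslash is doubled. A userspace model of that behavior, assuming the only-list semantics described in lib/string_helpers.c:

    #include <stdio.h>

    /* Model of string_escape_str(tcomm, ..., ESCAPE_SPACE | ESCAPE_SPECIAL,
     * "\n\\"): only '\n' and '\\' are transformed, all else is copied. */
    static void escape_comm(const char *src, char *dst)
    {
            for (; *src; src++) {
                    if (*src == '\n') {
                            *dst++ = '\\';
                            *dst++ = 'n';
                    } else if (*src == '\\') {
                            *dst++ = '\\';
                            *dst++ = '\\';
                    } else {
                            *dst++ = *src;
                    }
            }
            *dst = '\0';
    }

    int main(void)
    {
            char out[64];   /* worst case doubles the input; comm is <= 16 bytes */

            escape_comm("evil\ncomm", out);
            printf("%s\n", out);    /* prints: evil\ncomm, with a literal backslash-n */
            return 0;
    }
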
@@ -336,12 +316,10 @@ static inline void task_context_switch_counts(struct seq_file *m,
336 316
337static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 317static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
338{ 318{
339 seq_puts(m, "Cpus_allowed:\t"); 319 seq_printf(m, "Cpus_allowed:\t%*pb\n",
340 seq_cpumask(m, &task->cpus_allowed); 320 cpumask_pr_args(&task->cpus_allowed));
341 seq_putc(m, '\n'); 321 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
342 seq_puts(m, "Cpus_allowed_list:\t"); 322 cpumask_pr_args(&task->cpus_allowed));
343 seq_cpumask_list(m, &task->cpus_allowed);
344 seq_putc(m, '\n');
345} 323}
346 324
347int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 325int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
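
%*pb and %*pbl are the vsnprintf() bitmap extensions added in this cycle; cpumask_pr_args(mask) expands to the pair "nr_cpu_ids, cpumask_bits(mask)", supplying the field width and pointer those specifiers expect. The same specifiers work anywhere kernel printf formatting does, for example:

    /* Sketch: hex-bitmap vs. range-list rendering of the online cpumask. */
    pr_info("online: %*pb\n",  cpumask_pr_args(cpu_online_mask));  /* e.g. "ff"  */
    pr_info("online: %*pbl\n", cpumask_pr_args(cpu_online_mask));  /* e.g. "0-7" */
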
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7fea13229f33..3309f59d421b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -122,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
122 struct kstat *stat) 122 struct kstat *stat)
123{ 123{
124 struct inode *inode = dentry->d_inode; 124 struct inode *inode = dentry->d_inode;
125 struct proc_dir_entry *de = PROC_I(inode)->pde; 125 struct proc_dir_entry *de = PDE(inode);
126 if (de && de->nlink) 126 if (de && de->nlink)
127 set_nlink(inode, de->nlink); 127 set_nlink(inode, de->nlink);
128 128
@@ -350,29 +350,12 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
350 if (ret) 350 if (ret)
351 return ret; 351 return ret;
352 352
353 if (S_ISDIR(dp->mode)) {
354 dp->proc_fops = &proc_dir_operations;
355 dp->proc_iops = &proc_dir_inode_operations;
356 dir->nlink++;
357 } else if (S_ISLNK(dp->mode)) {
358 dp->proc_iops = &proc_link_inode_operations;
359 } else if (S_ISREG(dp->mode)) {
360 BUG_ON(dp->proc_fops == NULL);
361 dp->proc_iops = &proc_file_inode_operations;
362 } else {
363 WARN_ON(1);
364 proc_free_inum(dp->low_ino);
365 return -EINVAL;
366 }
367
368 spin_lock(&proc_subdir_lock); 353 spin_lock(&proc_subdir_lock);
369 dp->parent = dir; 354 dp->parent = dir;
370 if (pde_subdir_insert(dir, dp) == false) { 355 if (pde_subdir_insert(dir, dp) == false) {
371 WARN(1, "proc_dir_entry '%s/%s' already registered\n", 356 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
372 dir->name, dp->name); 357 dir->name, dp->name);
373 spin_unlock(&proc_subdir_lock); 358 spin_unlock(&proc_subdir_lock);
374 if (S_ISDIR(dp->mode))
375 dir->nlink--;
376 proc_free_inum(dp->low_ino); 359 proc_free_inum(dp->low_ino);
377 return -EEXIST; 360 return -EEXIST;
378 } 361 }
@@ -431,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
431 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); 414 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
432 if (ent->data) { 415 if (ent->data) {
433 strcpy((char*)ent->data,dest); 416 strcpy((char*)ent->data,dest);
417 ent->proc_iops = &proc_link_inode_operations;
434 if (proc_register(parent, ent) < 0) { 418 if (proc_register(parent, ent) < 0) {
435 kfree(ent->data); 419 kfree(ent->data);
436 kfree(ent); 420 kfree(ent);
@@ -456,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
456 ent = __proc_create(&parent, name, S_IFDIR | mode, 2); 440 ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
457 if (ent) { 441 if (ent) {
458 ent->data = data; 442 ent->data = data;
443 ent->proc_fops = &proc_dir_operations;
444 ent->proc_iops = &proc_dir_inode_operations;
445 parent->nlink++;
459 if (proc_register(parent, ent) < 0) { 446 if (proc_register(parent, ent) < 0) {
460 kfree(ent); 447 kfree(ent);
448 parent->nlink--;
461 ent = NULL; 449 ent = NULL;
462 } 450 }
463 } 451 }
@@ -493,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
493 return NULL; 481 return NULL;
494 } 482 }
495 483
484 BUG_ON(proc_fops == NULL);
485
496 if ((mode & S_IALLUGO) == 0) 486 if ((mode & S_IALLUGO) == 0)
497 mode |= S_IRUGO; 487 mode |= S_IRUGO;
498 pde = __proc_create(&parent, name, mode, 1); 488 pde = __proc_create(&parent, name, mode, 1);
@@ -500,6 +490,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
500 goto out; 490 goto out;
501 pde->proc_fops = proc_fops; 491 pde->proc_fops = proc_fops;
502 pde->data = data; 492 pde->data = data;
493 pde->proc_iops = &proc_file_inode_operations;
503 if (proc_register(parent, pde) < 0) 494 if (proc_register(parent, pde) < 0)
504 goto out_free; 495 goto out_free;
505 return pde; 496 return pde;
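
After this refactor the inode and file operations are attached by the specific creator (proc_mkdir_data(), proc_symlink(), proc_create_data()) instead of being inferred from the mode bits inside proc_register(), and a NULL proc_fops now trips the BUG_ON() above rather than being accepted silently. The usual creation pattern is unchanged; a minimal sketch against the 3.x-era procfs API, with hypothetical names:

    #include <linux/module.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>

    static int example_show(struct seq_file *m, void *v)
    {
            seq_puts(m, "hello\n");
            return 0;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
            return single_open(file, example_show, NULL);
    }

    static const struct file_operations example_fops = {
            .owner   = THIS_MODULE,
            .open    = example_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

    static int __init example_init(void)
    {
            /* proc_create() forwards to proc_create_data(); the fops
             * pointer must not be NULL after this patch. */
            return proc_create("example", 0444, NULL, &example_fops) ? 0 : -ENOMEM;
    }
    module_init(example_init);
    MODULE_LICENSE("GPL");
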
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8420a2f80811..13a50a32652d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
40 put_pid(PROC_I(inode)->pid); 40 put_pid(PROC_I(inode)->pid);
41 41
42 /* Let go of any associated proc directory entry */ 42 /* Let go of any associated proc directory entry */
43 de = PROC_I(inode)->pde; 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46 head = PROC_I(inode)->sysctl; 46 head = PROC_I(inode)->sysctl;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
5#include <linux/ksm.h> 5#include <linux/ksm.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmzone.h> 7#include <linux/mmzone.h>
8#include <linux/huge_mm.h>
8#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 10#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon 122 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
122 * to make sure a given page is a thp, not a non-huge compound page. 123 * to make sure a given page is a thp, not a non-huge compound page.
123 */ 124 */
124 else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || 125 else if (PageTransCompound(page)) {
125 PageAnon(compound_head(page)))) 126 struct page *head = compound_head(page);
126 u |= 1 << KPF_THP; 127
128 if (PageLRU(head) || PageAnon(head))
129 u |= 1 << KPF_THP;
130 else if (is_huge_zero_page(head)) {
131 u |= 1 << KPF_ZERO_PAGE;
132 u |= 1 << KPF_THP;
133 }
134 } else if (is_zero_pfn(page_to_pfn(page)))
135 u |= 1 << KPF_ZERO_PAGE;
136
127 137
128 /* 138 /*
129 * Caveats on high order pages: page->_count will only be set 139 * Caveats on high order pages: page->_count will only be set
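
With this hunk /proc/kpageflags can distinguish the huge zero page (KPF_THP | KPF_ZERO_PAGE) from the ordinary 4k zero page (KPF_ZERO_PAGE alone). A small reader, assuming the bit values exported in include/uapi/linux/kernel-page-flags.h at this point (KPF_THP = 22, KPF_ZERO_PAGE = 24):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define KPF_THP        22
    #define KPF_ZERO_PAGE  24

    int main(int argc, char **argv)
    {
            uint64_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
            uint64_t flags;
            int fd = open("/proc/kpageflags", O_RDONLY);

            /* one u64 of flags per pfn */
            if (fd < 0 || pread(fd, &flags, sizeof(flags), pfn * 8) != sizeof(flags))
                    return 1;
            printf("pfn %llu: thp=%d zero=%d\n", (unsigned long long)pfn,
                   !!(flags & (1ULL << KPF_THP)),
                   !!(flags & (1ULL << KPF_ZERO_PAGE)));
            return 0;
    }
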
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 246eae84b13b..956b75d61809 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
21 21
22void task_mem(struct seq_file *m, struct mm_struct *mm) 22void task_mem(struct seq_file *m, struct mm_struct *mm)
23{ 23{
24 unsigned long data, text, lib, swap; 24 unsigned long data, text, lib, swap, ptes, pmds;
25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
26 26
27 /* 27 /*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
44 swap = get_mm_counter(mm, MM_SWAPENTS); 44 swap = get_mm_counter(mm, MM_SWAPENTS);
45 ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
46 pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
45 seq_printf(m, 47 seq_printf(m,
46 "VmPeak:\t%8lu kB\n" 48 "VmPeak:\t%8lu kB\n"
47 "VmSize:\t%8lu kB\n" 49 "VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmExe:\t%8lu kB\n" 56 "VmExe:\t%8lu kB\n"
55 "VmLib:\t%8lu kB\n" 57 "VmLib:\t%8lu kB\n"
56 "VmPTE:\t%8lu kB\n" 58 "VmPTE:\t%8lu kB\n"
59 "VmPMD:\t%8lu kB\n"
57 "VmSwap:\t%8lu kB\n", 60 "VmSwap:\t%8lu kB\n",
58 hiwater_vm << (PAGE_SHIFT-10), 61 hiwater_vm << (PAGE_SHIFT-10),
59 total_vm << (PAGE_SHIFT-10), 62 total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
63 total_rss << (PAGE_SHIFT-10), 66 total_rss << (PAGE_SHIFT-10),
64 data << (PAGE_SHIFT-10), 67 data << (PAGE_SHIFT-10),
65 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 68 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66 (PTRS_PER_PTE * sizeof(pte_t) * 69 ptes >> 10,
67 atomic_long_read(&mm->nr_ptes)) >> 10, 70 pmds >> 10,
68 swap << (PAGE_SHIFT-10)); 71 swap << (PAGE_SHIFT-10));
69} 72}
70 73
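
The new VmPMD line is computed the same way VmPTE always was, one page-table level up. A worked example, assuming x86-64 geometry (PTRS_PER_PMD = 512, sizeof(pmd_t) = 8):

    /* One PMD page accounts for 512 * 8 = 4096 bytes. A process whose
     * mm_nr_pmds(mm) is 3 therefore reports 3 * 4096 >> 10 = 12, shown
     * as "VmPMD:        12 kB" in /proc/<pid>/status. */
    unsigned long pmds = PTRS_PER_PMD * sizeof(pmd_t) * 3;   /* 12288 bytes */
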
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
433 436
434#ifdef CONFIG_PROC_PAGE_MONITOR 437#ifdef CONFIG_PROC_PAGE_MONITOR
435struct mem_size_stats { 438struct mem_size_stats {
436 struct vm_area_struct *vma;
437 unsigned long resident; 439 unsigned long resident;
438 unsigned long shared_clean; 440 unsigned long shared_clean;
439 unsigned long shared_dirty; 441 unsigned long shared_dirty;
@@ -443,7 +445,6 @@ struct mem_size_stats {
443 unsigned long anonymous; 445 unsigned long anonymous;
444 unsigned long anonymous_thp; 446 unsigned long anonymous_thp;
445 unsigned long swap; 447 unsigned long swap;
446 unsigned long nonlinear;
447 u64 pss; 448 u64 pss;
448}; 449};
449 450
@@ -483,8 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
483 struct mm_walk *walk) 484 struct mm_walk *walk)
484{ 485{
485 struct mem_size_stats *mss = walk->private; 486 struct mem_size_stats *mss = walk->private;
486 struct vm_area_struct *vma = mss->vma; 487 struct vm_area_struct *vma = walk->vma;
487 pgoff_t pgoff = linear_page_index(vma, addr);
488 struct page *page = NULL; 488 struct page *page = NULL;
489 489
490 if (pte_present(*pte)) { 490 if (pte_present(*pte)) {
@@ -496,17 +496,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
496 mss->swap += PAGE_SIZE; 496 mss->swap += PAGE_SIZE;
497 else if (is_migration_entry(swpent)) 497 else if (is_migration_entry(swpent))
498 page = migration_entry_to_page(swpent); 498 page = migration_entry_to_page(swpent);
499 } else if (pte_file(*pte)) {
500 if (pte_to_pgoff(*pte) != pgoff)
501 mss->nonlinear += PAGE_SIZE;
502 } 499 }
503 500
504 if (!page) 501 if (!page)
505 return; 502 return;
506
507 if (page->index != pgoff)
508 mss->nonlinear += PAGE_SIZE;
509
510 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); 503 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
511} 504}
512 505
@@ -515,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
515 struct mm_walk *walk) 508 struct mm_walk *walk)
516{ 509{
517 struct mem_size_stats *mss = walk->private; 510 struct mem_size_stats *mss = walk->private;
518 struct vm_area_struct *vma = mss->vma; 511 struct vm_area_struct *vma = walk->vma;
519 struct page *page; 512 struct page *page;
520 513
521 /* FOLL_DUMP will return -EFAULT on huge zero page */ 514 /* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -536,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
536static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 529static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
537 struct mm_walk *walk) 530 struct mm_walk *walk)
538{ 531{
539 struct mem_size_stats *mss = walk->private; 532 struct vm_area_struct *vma = walk->vma;
540 struct vm_area_struct *vma = mss->vma;
541 pte_t *pte; 533 pte_t *pte;
542 spinlock_t *ptl; 534 spinlock_t *ptl;
543 535
@@ -596,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
596 [ilog2(VM_ACCOUNT)] = "ac", 588 [ilog2(VM_ACCOUNT)] = "ac",
597 [ilog2(VM_NORESERVE)] = "nr", 589 [ilog2(VM_NORESERVE)] = "nr",
598 [ilog2(VM_HUGETLB)] = "ht", 590 [ilog2(VM_HUGETLB)] = "ht",
599 [ilog2(VM_NONLINEAR)] = "nl",
600 [ilog2(VM_ARCH_1)] = "ar", 591 [ilog2(VM_ARCH_1)] = "ar",
601 [ilog2(VM_DONTDUMP)] = "dd", 592 [ilog2(VM_DONTDUMP)] = "dd",
602#ifdef CONFIG_MEM_SOFT_DIRTY 593#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -630,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
630 }; 621 };
631 622
632 memset(&mss, 0, sizeof mss); 623 memset(&mss, 0, sizeof mss);
633 mss.vma = vma;
634 /* mmap_sem is held in m_start */ 624 /* mmap_sem is held in m_start */
635 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 625 walk_page_vma(vma, &smaps_walk);
636 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
637 626
638 show_map_vma(m, vma, is_pid); 627 show_map_vma(m, vma, is_pid);
639 628
@@ -668,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
668 (vma->vm_flags & VM_LOCKED) ? 657 (vma->vm_flags & VM_LOCKED) ?
669 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 658 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
670 659
671 if (vma->vm_flags & VM_NONLINEAR)
672 seq_printf(m, "Nonlinear: %8lu kB\n",
673 mss.nonlinear >> 10);
674
675 show_smap_vma_flags(m, vma); 660 show_smap_vma_flags(m, vma);
676 m_cache_vma(m, vma); 661 m_cache_vma(m, vma);
677 return 0; 662 return 0;
@@ -747,18 +732,18 @@ enum clear_refs_types {
747 CLEAR_REFS_ANON, 732 CLEAR_REFS_ANON,
748 CLEAR_REFS_MAPPED, 733 CLEAR_REFS_MAPPED,
749 CLEAR_REFS_SOFT_DIRTY, 734 CLEAR_REFS_SOFT_DIRTY,
735 CLEAR_REFS_MM_HIWATER_RSS,
750 CLEAR_REFS_LAST, 736 CLEAR_REFS_LAST,
751}; 737};
752 738
753struct clear_refs_private { 739struct clear_refs_private {
754 struct vm_area_struct *vma;
755 enum clear_refs_types type; 740 enum clear_refs_types type;
756}; 741};
757 742
743#ifdef CONFIG_MEM_SOFT_DIRTY
758static inline void clear_soft_dirty(struct vm_area_struct *vma, 744static inline void clear_soft_dirty(struct vm_area_struct *vma,
759 unsigned long addr, pte_t *pte) 745 unsigned long addr, pte_t *pte)
760{ 746{
761#ifdef CONFIG_MEM_SOFT_DIRTY
762 /* 747 /*
763 * The soft-dirty tracker uses #PF-s to catch writes 748 * The soft-dirty tracker uses #PF-s to catch writes
764 * to pages, so write-protect the pte as well. See the 749 * to pages, so write-protect the pte as well. See the
@@ -772,24 +757,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
772 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 757 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
773 } else if (is_swap_pte(ptent)) { 758 } else if (is_swap_pte(ptent)) {
774 ptent = pte_swp_clear_soft_dirty(ptent); 759 ptent = pte_swp_clear_soft_dirty(ptent);
775 } else if (pte_file(ptent)) {
776 ptent = pte_file_clear_soft_dirty(ptent);
777 } 760 }
778 761
779 set_pte_at(vma->vm_mm, addr, pte, ptent); 762 set_pte_at(vma->vm_mm, addr, pte, ptent);
780#endif
781} 763}
782 764
765static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
766 unsigned long addr, pmd_t *pmdp)
767{
768 pmd_t pmd = *pmdp;
769
770 pmd = pmd_wrprotect(pmd);
771 pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
772
773 if (vma->vm_flags & VM_SOFTDIRTY)
774 vma->vm_flags &= ~VM_SOFTDIRTY;
775
776 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
777}
778
779#else
780
781static inline void clear_soft_dirty(struct vm_area_struct *vma,
782 unsigned long addr, pte_t *pte)
783{
784}
785
786static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
787 unsigned long addr, pmd_t *pmdp)
788{
789}
790#endif
791
783static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 792static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
784 unsigned long end, struct mm_walk *walk) 793 unsigned long end, struct mm_walk *walk)
785{ 794{
786 struct clear_refs_private *cp = walk->private; 795 struct clear_refs_private *cp = walk->private;
787 struct vm_area_struct *vma = cp->vma; 796 struct vm_area_struct *vma = walk->vma;
788 pte_t *pte, ptent; 797 pte_t *pte, ptent;
789 spinlock_t *ptl; 798 spinlock_t *ptl;
790 struct page *page; 799 struct page *page;
791 800
792 split_huge_page_pmd(vma, addr, pmd); 801 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
802 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
803 clear_soft_dirty_pmd(vma, addr, pmd);
804 goto out;
805 }
806
807 page = pmd_page(*pmd);
808
809 /* Clear accessed and referenced bits. */
810 pmdp_test_and_clear_young(vma, addr, pmd);
811 ClearPageReferenced(page);
812out:
813 spin_unlock(ptl);
814 return 0;
815 }
816
793 if (pmd_trans_unstable(pmd)) 817 if (pmd_trans_unstable(pmd))
794 return 0; 818 return 0;
795 819
@@ -818,6 +842,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
818 return 0; 842 return 0;
819} 843}
820 844
845static int clear_refs_test_walk(unsigned long start, unsigned long end,
846 struct mm_walk *walk)
847{
848 struct clear_refs_private *cp = walk->private;
849 struct vm_area_struct *vma = walk->vma;
850
851 if (vma->vm_flags & VM_PFNMAP)
852 return 1;
853
854 /*
855 * Writing 1 to /proc/pid/clear_refs affects all pages.
856 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
857 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
858 * Writing 4 to /proc/pid/clear_refs affects all pages.
859 */
860 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
861 return 1;
862 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
863 return 1;
864 return 0;
865}
866
821static ssize_t clear_refs_write(struct file *file, const char __user *buf, 867static ssize_t clear_refs_write(struct file *file, const char __user *buf,
822 size_t count, loff_t *ppos) 868 size_t count, loff_t *ppos)
823{ 869{
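
clear_refs_test_walk() is written against the page-walker rework landing in the same series: walk->vma now points at the VMA being visited, and a ->test_walk() callback returning 1 skips that VMA, 0 descends into its page tables, and a negative value aborts the whole walk. A minimal sketch of that contract, with hypothetical names:

    static int skip_io_test(unsigned long start, unsigned long end,
                            struct mm_walk *walk)
    {
            /* 1 = skip this vma, 0 = walk it, <0 = abort the walk */
            if (walk->vma->vm_flags & (VM_PFNMAP | VM_IO))
                    return 1;
            return 0;
    }

    static struct mm_walk example_walk = {
            .pmd_entry = example_pmd_entry,   /* hypothetical pmd callback */
            .test_walk = skip_io_test,
    };
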
@@ -858,9 +904,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
858 }; 904 };
859 struct mm_walk clear_refs_walk = { 905 struct mm_walk clear_refs_walk = {
860 .pmd_entry = clear_refs_pte_range, 906 .pmd_entry = clear_refs_pte_range,
907 .test_walk = clear_refs_test_walk,
861 .mm = mm, 908 .mm = mm,
862 .private = &cp, 909 .private = &cp,
863 }; 910 };
911
912 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
913 /*
914 * Writing 5 to /proc/pid/clear_refs resets the peak
915 * resident set size to this mm's current rss value.
916 */
917 down_write(&mm->mmap_sem);
918 reset_mm_hiwater_rss(mm);
919 up_write(&mm->mmap_sem);
920 goto out_mm;
921 }
922
864 down_read(&mm->mmap_sem); 923 down_read(&mm->mmap_sem);
865 if (type == CLEAR_REFS_SOFT_DIRTY) { 924 if (type == CLEAR_REFS_SOFT_DIRTY) {
866 for (vma = mm->mmap; vma; vma = vma->vm_next) { 925 for (vma = mm->mmap; vma; vma = vma->vm_next) {
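
Writing "5" is the new CLEAR_REFS_MM_HIWATER_RSS case: it snaps the peak-RSS watermark (VmHWM in /proc/<pid>/status) down to the current RSS, letting a monitor measure per-phase peaks instead of a single lifetime maximum. For example:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/self/clear_refs", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, "5", 1);      /* VmHWM now equals current VmRSS */
            close(fd);
            return 0;
    }
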
@@ -877,32 +936,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
877 } 936 }
878 mmu_notifier_invalidate_range_start(mm, 0, -1); 937 mmu_notifier_invalidate_range_start(mm, 0, -1);
879 } 938 }
880 for (vma = mm->mmap; vma; vma = vma->vm_next) { 939 walk_page_range(0, ~0UL, &clear_refs_walk);
881 cp.vma = vma;
882 if (is_vm_hugetlb_page(vma))
883 continue;
884 /*
885 * Writing 1 to /proc/pid/clear_refs affects all pages.
886 *
887 * Writing 2 to /proc/pid/clear_refs only affects
888 * Anonymous pages.
889 *
890 * Writing 3 to /proc/pid/clear_refs only affects file
891 * mapped pages.
892 *
893 * Writing 4 to /proc/pid/clear_refs affects all pages.
894 */
895 if (type == CLEAR_REFS_ANON && vma->vm_file)
896 continue;
897 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
898 continue;
899 walk_page_range(vma->vm_start, vma->vm_end,
900 &clear_refs_walk);
901 }
902 if (type == CLEAR_REFS_SOFT_DIRTY) 940 if (type == CLEAR_REFS_SOFT_DIRTY)
903 mmu_notifier_invalidate_range_end(mm, 0, -1); 941 mmu_notifier_invalidate_range_end(mm, 0, -1);
904 flush_tlb_mm(mm); 942 flush_tlb_mm(mm);
905 up_read(&mm->mmap_sem); 943 up_read(&mm->mmap_sem);
944out_mm:
906 mmput(mm); 945 mmput(mm);
907 } 946 }
908 put_task_struct(task); 947 put_task_struct(task);
@@ -1066,15 +1105,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
1066static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1105static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1067 struct mm_walk *walk) 1106 struct mm_walk *walk)
1068{ 1107{
1069 struct vm_area_struct *vma; 1108 struct vm_area_struct *vma = walk->vma;
1070 struct pagemapread *pm = walk->private; 1109 struct pagemapread *pm = walk->private;
1071 spinlock_t *ptl; 1110 spinlock_t *ptl;
1072 pte_t *pte; 1111 pte_t *pte, *orig_pte;
1073 int err = 0; 1112 int err = 0;
1074 1113
1075 /* find the first VMA at or above 'addr' */ 1114 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1076 vma = find_vma(walk->mm, addr);
1077 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1078 int pmd_flags2; 1115 int pmd_flags2;
1079 1116
1080 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1117 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1100,51 +1137,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1100 if (pmd_trans_unstable(pmd)) 1137 if (pmd_trans_unstable(pmd))
1101 return 0; 1138 return 0;
1102 1139
1103 while (1) { 1140 /*
1104 /* End of address space hole, which we mark as non-present. */ 1141 * We can assume that @vma always points to a valid one and @end never
1105 unsigned long hole_end; 1142 * goes beyond vma->vm_end.
1106 1143 */
1107 if (vma) 1144 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1108 hole_end = min(end, vma->vm_start); 1145 for (; addr < end; pte++, addr += PAGE_SIZE) {
1109 else 1146 pagemap_entry_t pme;
1110 hole_end = end;
1111
1112 for (; addr < hole_end; addr += PAGE_SIZE) {
1113 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1114
1115 err = add_to_pagemap(addr, &pme, pm);
1116 if (err)
1117 return err;
1118 }
1119
1120 if (!vma || vma->vm_start >= end)
1121 break;
1122 /*
1123 * We can't possibly be in a hugetlb VMA. In general,
1124 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1125 * the pmd_entry can only be called on addresses in a
1126 * hugetlb if the walk starts in a non-hugetlb VMA and
1127 * spans a hugepage VMA. Since pagemap_read walks are
1128 * PMD-sized and PMD-aligned, this will never be true.
1129 */
1130 BUG_ON(is_vm_hugetlb_page(vma));
1131
1132 /* Addresses in the VMA. */
1133 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1134 pagemap_entry_t pme;
1135 pte = pte_offset_map(pmd, addr);
1136 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1137 pte_unmap(pte);
1138 err = add_to_pagemap(addr, &pme, pm);
1139 if (err)
1140 return err;
1141 }
1142 1147
1143 if (addr == end) 1148 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1149 err = add_to_pagemap(addr, &pme, pm);
1150 if (err)
1144 break; 1151 break;
1145
1146 vma = find_vma(walk->mm, addr);
1147 } 1152 }
1153 pte_unmap_unlock(orig_pte, ptl);
1148 1154
1149 cond_resched(); 1155 cond_resched();
1150 1156
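
The rewritten loop above is the canonical locked PTE scan: because ->test_walk() and walk->vma now guarantee the range lies inside one valid VMA, the hole-skipping and find_vma() re-lookups disappear, and the page-table lock is taken once via pte_offset_map_lock() for the whole PMD range. The skeleton, as an illustrative fragment (mm, pmd, addr and end come from the walker):

    pte_t *pte, *orig_pte;
    spinlock_t *ptl;

    orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    for (; addr < end; pte++, addr += PAGE_SIZE) {
            /* examine *pte; the page-table lock is held throughout */
    }
    pte_unmap_unlock(orig_pte, ptl);
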
@@ -1170,15 +1176,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1170 struct mm_walk *walk) 1176 struct mm_walk *walk)
1171{ 1177{
1172 struct pagemapread *pm = walk->private; 1178 struct pagemapread *pm = walk->private;
1173 struct vm_area_struct *vma; 1179 struct vm_area_struct *vma = walk->vma;
1174 int err = 0; 1180 int err = 0;
1175 int flags2; 1181 int flags2;
1176 pagemap_entry_t pme; 1182 pagemap_entry_t pme;
1177 1183
1178 vma = find_vma(walk->mm, addr); 1184 if (vma->vm_flags & VM_SOFTDIRTY)
1179 WARN_ON_ONCE(!vma);
1180
1181 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1182 flags2 = __PM_SOFT_DIRTY; 1185 flags2 = __PM_SOFT_DIRTY;
1183 else 1186 else
1184 flags2 = 0; 1187 flags2 = 0;
@@ -1338,7 +1341,6 @@ const struct file_operations proc_pagemap_operations = {
1338#ifdef CONFIG_NUMA 1341#ifdef CONFIG_NUMA
1339 1342
1340struct numa_maps { 1343struct numa_maps {
1341 struct vm_area_struct *vma;
1342 unsigned long pages; 1344 unsigned long pages;
1343 unsigned long anon; 1345 unsigned long anon;
1344 unsigned long active; 1346 unsigned long active;
@@ -1407,18 +1409,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1407static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1409static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1408 unsigned long end, struct mm_walk *walk) 1410 unsigned long end, struct mm_walk *walk)
1409{ 1411{
1410 struct numa_maps *md; 1412 struct numa_maps *md = walk->private;
1413 struct vm_area_struct *vma = walk->vma;
1411 spinlock_t *ptl; 1414 spinlock_t *ptl;
1412 pte_t *orig_pte; 1415 pte_t *orig_pte;
1413 pte_t *pte; 1416 pte_t *pte;
1414 1417
1415 md = walk->private; 1418 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1416
1417 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1418 pte_t huge_pte = *(pte_t *)pmd; 1419 pte_t huge_pte = *(pte_t *)pmd;
1419 struct page *page; 1420 struct page *page;
1420 1421
1421 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1422 page = can_gather_numa_stats(huge_pte, vma, addr);
1422 if (page) 1423 if (page)
1423 gather_stats(page, md, pte_dirty(huge_pte), 1424 gather_stats(page, md, pte_dirty(huge_pte),
1424 HPAGE_PMD_SIZE/PAGE_SIZE); 1425 HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1430,7 +1431,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1430 return 0; 1431 return 0;
1431 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1432 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1432 do { 1433 do {
1433 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1434 struct page *page = can_gather_numa_stats(*pte, vma, addr);
1434 if (!page) 1435 if (!page)
1435 continue; 1436 continue;
1436 gather_stats(page, md, pte_dirty(*pte), 1); 1437 gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1440,7 +1441,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1440 return 0; 1441 return 0;
1441} 1442}
1442#ifdef CONFIG_HUGETLB_PAGE 1443#ifdef CONFIG_HUGETLB_PAGE
1443static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1444static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1444 unsigned long addr, unsigned long end, struct mm_walk *walk) 1445 unsigned long addr, unsigned long end, struct mm_walk *walk)
1445{ 1446{
1446 struct numa_maps *md; 1447 struct numa_maps *md;
@@ -1459,7 +1460,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1459} 1460}
1460 1461
1461#else 1462#else
1462static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1463static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1463 unsigned long addr, unsigned long end, struct mm_walk *walk) 1464 unsigned long addr, unsigned long end, struct mm_walk *walk)
1464{ 1465{
1465 return 0; 1466 return 0;
@@ -1477,7 +1478,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1477 struct numa_maps *md = &numa_priv->md; 1478 struct numa_maps *md = &numa_priv->md;
1478 struct file *file = vma->vm_file; 1479 struct file *file = vma->vm_file;
1479 struct mm_struct *mm = vma->vm_mm; 1480 struct mm_struct *mm = vma->vm_mm;
1480 struct mm_walk walk = {}; 1481 struct mm_walk walk = {
1482 .hugetlb_entry = gather_hugetlb_stats,
1483 .pmd_entry = gather_pte_stats,
1484 .private = md,
1485 .mm = mm,
1486 };
1481 struct mempolicy *pol; 1487 struct mempolicy *pol;
1482 char buffer[64]; 1488 char buffer[64];
1483 int nid; 1489 int nid;
@@ -1488,13 +1494,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1488 /* Ensure we start with an empty set of numa_maps statistics. */ 1494 /* Ensure we start with an empty set of numa_maps statistics. */
1489 memset(md, 0, sizeof(*md)); 1495 memset(md, 0, sizeof(*md));
1490 1496
1491 md->vma = vma;
1492
1493 walk.hugetlb_entry = gather_hugetbl_stats;
1494 walk.pmd_entry = gather_pte_stats;
1495 walk.private = md;
1496 walk.mm = mm;
1497
1498 pol = __get_vma_policy(vma, vma->vm_start); 1497 pol = __get_vma_policy(vma, vma->vm_start);
1499 if (pol) { 1498 if (pol) {
1500 mpol_to_str(buffer, sizeof(buffer), pol); 1499 mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1528,7 +1527,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1528 if (is_vm_hugetlb_page(vma)) 1527 if (is_vm_hugetlb_page(vma))
1529 seq_puts(m, " huge"); 1528 seq_puts(m, " huge");
1530 1529
1531 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1530 /* mmap_sem is held by m_start */
1531 walk_page_vma(vma, &walk);
1532 1532
1533 if (!md->pages) 1533 if (!md->pages)
1534 goto out; 1534 goto out;
@@ -1557,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1557 for_each_node_state(nid, N_MEMORY) 1557 for_each_node_state(nid, N_MEMORY)
1558 if (md->node[nid]) 1558 if (md->node[nid])
1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1560
1561 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1560out: 1562out:
1561 seq_putc(m, '\n'); 1563 seq_putc(m, '\n');
1562 m_cache_vma(m, vma); 1564 m_cache_vma(m, vma);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a90d6d354199..4e61388ec03d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
546 nhdr_ptr = notes_section; 546 nhdr_ptr = notes_section;
547 while (nhdr_ptr->n_namesz != 0) { 547 while (nhdr_ptr->n_namesz != 0) {
548 sz = sizeof(Elf64_Nhdr) + 548 sz = sizeof(Elf64_Nhdr) +
549 ((nhdr_ptr->n_namesz + 3) & ~3) + 549 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
550 ((nhdr_ptr->n_descsz + 3) & ~3); 550 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
551 if ((real_sz + sz) > max_sz) { 551 if ((real_sz + sz) > max_sz) {
552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
@@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
732 nhdr_ptr = notes_section; 732 nhdr_ptr = notes_section;
733 while (nhdr_ptr->n_namesz != 0) { 733 while (nhdr_ptr->n_namesz != 0) {
734 sz = sizeof(Elf32_Nhdr) + 734 sz = sizeof(Elf32_Nhdr) +
735 ((nhdr_ptr->n_namesz + 3) & ~3) + 735 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
736 ((nhdr_ptr->n_descsz + 3) & ~3); 736 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
737 if ((real_sz + sz) > max_sz) { 737 if ((real_sz + sz) > max_sz) {
738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
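
The (u64) casts close an integer-overflow hole: n_namesz and n_descsz are 32-bit fields read from the crash dump, so a corrupted note with n_namesz = 0xfffffffe made (n_namesz + 3) & ~3 wrap to 0 in 32-bit arithmetic, letting the "Exceeded p_memsz" check pass with a bogus sz. A demonstration of the wrap:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t n_namesz = 0xfffffffeU;

            /* 32-bit: 0xfffffffe + 3 wraps to 1, and 1 & ~3 == 0 */
            printf("u32: %u\n", (n_namesz + 3) & ~3U);
            /* 64-bit: the true rounded size, 0x100000000 */
            printf("u64: %llu\n",
                   (unsigned long long)(((uint64_t)n_namesz + 3) & ~(uint64_t)3));
            return 0;
    }
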
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
44 { MS_SYNCHRONOUS, ",sync" }, 44 { MS_SYNCHRONOUS, ",sync" },
45 { MS_DIRSYNC, ",dirsync" }, 45 { MS_DIRSYNC, ",dirsync" },
46 { MS_MANDLOCK, ",mand" }, 46 { MS_MANDLOCK, ",mand" },
47 { MS_LAZYTIME, ",lazytime" },
47 { 0, NULL } 48 { 0, NULL }
48 }; 49 };
49 const struct proc_fs_info *fs_infop; 50 const struct proc_fs_info *fs_infop;
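
MS_LAZYTIME, new in this cycle, defers most inode timestamp updates to in-memory state; this hunk merely teaches /proc/mounts to report it. A remount example, hedging the constant for older userspace headers (the value here is the one assumed from uapi/linux/fs.h in this cycle):

    #include <stdio.h>
    #include <sys/mount.h>

    #ifndef MS_LAZYTIME
    #define MS_LAZYTIME (1 << 25)   /* assumption: uapi value from this cycle */
    #endif

    int main(void)
    {
            /* On success the entry in /proc/mounts gains ",lazytime". */
            if (mount(NULL, "/home", NULL, MS_REMOUNT | MS_LAZYTIME, NULL) != 0)
                    perror("mount");
            return 0;
    }
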
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 983d9510becc..916b8e23d968 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -21,6 +21,16 @@ config PSTORE_CONSOLE
21 When the option is enabled, pstore will log all kernel 21 When the option is enabled, pstore will log all kernel
22 messages, even if no oops or panic happened. 22 messages, even if no oops or panic happened.
23 23
24config PSTORE_PMSG
25 bool "Log user space messages"
26 depends on PSTORE
27 help
28 When the option is enabled, pstore will export a character
29 interface /dev/pmsg0 to log user space messages. On reboot
30 data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
31
32 If unsure, say N.
33
24config PSTORE_FTRACE 34config PSTORE_FTRACE
25 bool "Persistent function tracer" 35 bool "Persistent function tracer"
26 depends on PSTORE 36 depends on PSTORE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 4c9095c2781e..e647d8e81712 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -7,5 +7,7 @@ obj-y += pstore.o
7pstore-objs += inode.o platform.o 7pstore-objs += inode.o platform.o
8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o 8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
9 9
10obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
11
10ramoops-objs += ram.o ram_core.o 12ramoops-objs += ram.o ram_core.o
11obj-$(CONFIG_PSTORE_RAM) += ramoops.o 13obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 50416602774d..b32ce53d24ee 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -338,32 +338,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
338 338
339 switch (type) { 339 switch (type) {
340 case PSTORE_TYPE_DMESG: 340 case PSTORE_TYPE_DMESG:
341 sprintf(name, "dmesg-%s-%lld%s", psname, id, 341 scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
342 compressed ? ".enc.z" : ""); 342 psname, id, compressed ? ".enc.z" : "");
343 break; 343 break;
344 case PSTORE_TYPE_CONSOLE: 344 case PSTORE_TYPE_CONSOLE:
345 sprintf(name, "console-%s-%lld", psname, id); 345 scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
346 break; 346 break;
347 case PSTORE_TYPE_FTRACE: 347 case PSTORE_TYPE_FTRACE:
348 sprintf(name, "ftrace-%s-%lld", psname, id); 348 scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
349 break; 349 break;
350 case PSTORE_TYPE_MCE: 350 case PSTORE_TYPE_MCE:
351 sprintf(name, "mce-%s-%lld", psname, id); 351 scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
352 break; 352 break;
353 case PSTORE_TYPE_PPC_RTAS: 353 case PSTORE_TYPE_PPC_RTAS:
354 sprintf(name, "rtas-%s-%lld", psname, id); 354 scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
355 break; 355 break;
356 case PSTORE_TYPE_PPC_OF: 356 case PSTORE_TYPE_PPC_OF:
357 sprintf(name, "powerpc-ofw-%s-%lld", psname, id); 357 scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
358 psname, id);
358 break; 359 break;
359 case PSTORE_TYPE_PPC_COMMON: 360 case PSTORE_TYPE_PPC_COMMON:
360 sprintf(name, "powerpc-common-%s-%lld", psname, id); 361 scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
362 psname, id);
363 break;
364 case PSTORE_TYPE_PMSG:
365 scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
361 break; 366 break;
362 case PSTORE_TYPE_UNKNOWN: 367 case PSTORE_TYPE_UNKNOWN:
363 sprintf(name, "unknown-%s-%lld", psname, id); 368 scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
364 break; 369 break;
365 default: 370 default:
366 sprintf(name, "type%d-%s-%lld", type, psname, id); 371 scnprintf(name, sizeof(name), "type%d-%s-%lld",
372 type, psname, id);
367 break; 373 break;
368 } 374 }
369 375
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 3b3d305277c4..c36ba2cd0b5d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void);
45static inline void pstore_register_ftrace(void) {} 45static inline void pstore_register_ftrace(void) {}
46#endif 46#endif
47 47
48#ifdef CONFIG_PSTORE_PMSG
49extern void pstore_register_pmsg(void);
50#else
51static inline void pstore_register_pmsg(void) {}
52#endif
53
48extern struct pstore_info *psinfo; 54extern struct pstore_info *psinfo;
49 55
50extern void pstore_set_kmsg_bytes(int); 56extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0a9b72cdfeca..c4c9a10c5760 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -301,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
301 301
302 if (big_oops_buf) { 302 if (big_oops_buf) {
303 dst = big_oops_buf; 303 dst = big_oops_buf;
304 hsize = sprintf(dst, "%s#%d Part%d\n", why, 304 hsize = sprintf(dst, "%s#%d Part%u\n", why,
305 oopscount, part); 305 oopscount, part);
306 size = big_oops_buf_sz - hsize; 306 size = big_oops_buf_sz - hsize;
307 307
@@ -321,7 +321,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
321 } 321 }
322 } else { 322 } else {
323 dst = psinfo->buf; 323 dst = psinfo->buf;
324 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, 324 hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
325 part); 325 part);
326 size = psinfo->bufsize - hsize; 326 size = psinfo->bufsize - hsize;
327 dst += hsize; 327 dst += hsize;
@@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi)
447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { 447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
448 pstore_register_console(); 448 pstore_register_console();
449 pstore_register_ftrace(); 449 pstore_register_ftrace();
450 pstore_register_pmsg();
450 } 451 }
451 452
452 if (pstore_update_ms >= 0) { 453 if (pstore_update_ms >= 0) {
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
new file mode 100644
index 000000000000..feb5dd2948b4
--- /dev/null
+++ b/fs/pstore/pmsg.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright 2014 Google, Inc.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 */
13
14#include <linux/cdev.h>
15#include <linux/device.h>
16#include <linux/fs.h>
17#include <linux/uaccess.h>
18#include <linux/vmalloc.h>
19#include "internal.h"
20
21static DEFINE_MUTEX(pmsg_lock);
22#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
23
24static ssize_t write_pmsg(struct file *file, const char __user *buf,
25 size_t count, loff_t *ppos)
26{
27 size_t i, buffer_size;
28 char *buffer;
29
30 if (!count)
31 return 0;
32
33 if (!access_ok(VERIFY_READ, buf, count))
34 return -EFAULT;
35
36 buffer_size = count;
37 if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
38 buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
39 buffer = vmalloc(buffer_size);
40
41 mutex_lock(&pmsg_lock);
42 for (i = 0; i < count; ) {
43 size_t c = min(count - i, buffer_size);
44 u64 id;
45 long ret;
46
47 ret = __copy_from_user(buffer, buf + i, c);
48 if (unlikely(ret != 0)) {
49 mutex_unlock(&pmsg_lock);
50 vfree(buffer);
51 return -EFAULT;
52 }
53 psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
54 psinfo);
55
56 i += c;
57 }
58
59 mutex_unlock(&pmsg_lock);
60 vfree(buffer);
61 return count;
62}
63
64static const struct file_operations pmsg_fops = {
65 .owner = THIS_MODULE,
66 .llseek = noop_llseek,
67 .write = write_pmsg,
68};
69
70static struct class *pmsg_class;
71static int pmsg_major;
72#define PMSG_NAME "pmsg"
73#undef pr_fmt
74#define pr_fmt(fmt) PMSG_NAME ": " fmt
75
76static char *pmsg_devnode(struct device *dev, umode_t *mode)
77{
78 if (mode)
79 *mode = 0220;
80 return NULL;
81}
82
83void pstore_register_pmsg(void)
84{
85 struct device *pmsg_device;
86
87 pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
88 if (pmsg_major < 0) {
89 pr_err("register_chrdev failed\n");
90 goto err;
91 }
92
93 pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
94 if (IS_ERR(pmsg_class)) {
95 pr_err("device class file already in use\n");
96 goto err_class;
97 }
98 pmsg_class->devnode = pmsg_devnode;
99
100 pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
101 NULL, "%s%d", PMSG_NAME, 0);
102 if (IS_ERR(pmsg_device)) {
103 pr_err("failed to create device\n");
104 goto err_device;
105 }
106 return;
107
108err_device:
109 class_destroy(pmsg_class);
110err_class:
111 unregister_chrdev(pmsg_major, PMSG_NAME);
112err:
113 return;
114}
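
/dev/pmsg0 is write-only (mode 0220 via the devnode callback) and has no read path: whatever userspace writes is streamed into a PSTORE_TYPE_PMSG record, chunked through a bounce buffer of at most two pages, and reappears after a reboot as /sys/fs/pstore/pmsg-<backend>-<id>. (Worth noting: the vmalloc() result in write_pmsg() is used without a NULL check.) Typical use, e.g. recording a shutdown reason from an init helper:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char msg[] = "shutdown: thermal limit reached";
            int fd = open("/dev/pmsg0", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, msg, sizeof(msg) - 1);
            close(fd);
            return 0;
    }
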
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8613e5b35c22..39d1373128e9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); 51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
52MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); 52MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
53 53
54static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
56MODULE_PARM_DESC(pmsg_size, "size of user space message log");
57
54static ulong mem_address; 58static ulong mem_address;
55module_param(mem_address, ulong, 0400); 59module_param(mem_address, ulong, 0400);
56MODULE_PARM_DESC(mem_address, 60MODULE_PARM_DESC(mem_address,
@@ -82,12 +86,14 @@ struct ramoops_context {
82 struct persistent_ram_zone **przs; 86 struct persistent_ram_zone **przs;
83 struct persistent_ram_zone *cprz; 87 struct persistent_ram_zone *cprz;
84 struct persistent_ram_zone *fprz; 88 struct persistent_ram_zone *fprz;
89 struct persistent_ram_zone *mprz;
85 phys_addr_t phys_addr; 90 phys_addr_t phys_addr;
86 unsigned long size; 91 unsigned long size;
87 unsigned int memtype; 92 unsigned int memtype;
88 size_t record_size; 93 size_t record_size;
89 size_t console_size; 94 size_t console_size;
90 size_t ftrace_size; 95 size_t ftrace_size;
96 size_t pmsg_size;
91 int dump_oops; 97 int dump_oops;
92 struct persistent_ram_ecc_info ecc_info; 98 struct persistent_ram_ecc_info ecc_info;
93 unsigned int max_dump_cnt; 99 unsigned int max_dump_cnt;
@@ -96,6 +102,7 @@ struct ramoops_context {
96 unsigned int dump_read_cnt; 102 unsigned int dump_read_cnt;
97 unsigned int console_read_cnt; 103 unsigned int console_read_cnt;
98 unsigned int ftrace_read_cnt; 104 unsigned int ftrace_read_cnt;
105 unsigned int pmsg_read_cnt;
99 struct pstore_info pstore; 106 struct pstore_info pstore;
100}; 107};
101 108
@@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
109 cxt->dump_read_cnt = 0; 116 cxt->dump_read_cnt = 0;
110 cxt->console_read_cnt = 0; 117 cxt->console_read_cnt = 0;
111 cxt->ftrace_read_cnt = 0; 118 cxt->ftrace_read_cnt = 0;
119 cxt->pmsg_read_cnt = 0;
112 return 0; 120 return 0;
113} 121}
114 122
@@ -164,6 +172,12 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
164 return header_length; 172 return header_length;
165} 173}
166 174
175static bool prz_ok(struct persistent_ram_zone *prz)
176{
177 return !!prz && !!(persistent_ram_old_size(prz) +
178 persistent_ram_ecc_string(prz, NULL, 0));
179}
180
167static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, 181static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
168 int *count, struct timespec *time, 182 int *count, struct timespec *time,
169 char **buf, bool *compressed, 183 char **buf, bool *compressed,
@@ -178,13 +192,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
178 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, 192 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
179 cxt->max_dump_cnt, id, type, 193 cxt->max_dump_cnt, id, type,
180 PSTORE_TYPE_DMESG, 1); 194 PSTORE_TYPE_DMESG, 1);
181 if (!prz) 195 if (!prz_ok(prz))
182 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 196 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
183 1, id, type, PSTORE_TYPE_CONSOLE, 0); 197 1, id, type, PSTORE_TYPE_CONSOLE, 0);
184 if (!prz) 198 if (!prz_ok(prz))
185 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, 199 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
186 1, id, type, PSTORE_TYPE_FTRACE, 0); 200 1, id, type, PSTORE_TYPE_FTRACE, 0);
187 if (!prz) 201 if (!prz_ok(prz))
202 prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
203 1, id, type, PSTORE_TYPE_PMSG, 0);
204 if (!prz_ok(prz))
188 return 0; 205 return 0;
189 206
190 if (!persistent_ram_old(prz)) 207 if (!persistent_ram_old(prz))
@@ -252,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
252 return -ENOMEM; 269 return -ENOMEM;
253 persistent_ram_write(cxt->fprz, buf, size); 270 persistent_ram_write(cxt->fprz, buf, size);
254 return 0; 271 return 0;
272 } else if (type == PSTORE_TYPE_PMSG) {
273 if (!cxt->mprz)
274 return -ENOMEM;
275 persistent_ram_write(cxt->mprz, buf, size);
276 return 0;
255 } 277 }
256 278
257 if (type != PSTORE_TYPE_DMESG) 279 if (type != PSTORE_TYPE_DMESG)
@@ -309,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
309 case PSTORE_TYPE_FTRACE: 331 case PSTORE_TYPE_FTRACE:
310 prz = cxt->fprz; 332 prz = cxt->fprz;
311 break; 333 break;
334 case PSTORE_TYPE_PMSG:
335 prz = cxt->mprz;
336 break;
312 default: 337 default:
313 return -EINVAL; 338 return -EINVAL;
314 } 339 }
@@ -435,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev)
435 goto fail_out; 460 goto fail_out;
436 461
437 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && 462 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
438 !pdata->ftrace_size)) { 463 !pdata->ftrace_size && !pdata->pmsg_size)) {
439 pr_err("The memory size and the record/console size must be " 464 pr_err("The memory size and the record/console size must be "
440 "non-zero\n"); 465 "non-zero\n");
441 goto fail_out; 466 goto fail_out;
@@ -447,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev)
447 pdata->console_size = rounddown_pow_of_two(pdata->console_size); 472 pdata->console_size = rounddown_pow_of_two(pdata->console_size);
448 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 473 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
449 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 474 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
475 if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
476 pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
450 477
451 cxt->size = pdata->mem_size; 478 cxt->size = pdata->mem_size;
452 cxt->phys_addr = pdata->mem_address; 479 cxt->phys_addr = pdata->mem_address;
@@ -454,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev)
454 cxt->record_size = pdata->record_size; 481 cxt->record_size = pdata->record_size;
455 cxt->console_size = pdata->console_size; 482 cxt->console_size = pdata->console_size;
456 cxt->ftrace_size = pdata->ftrace_size; 483 cxt->ftrace_size = pdata->ftrace_size;
484 cxt->pmsg_size = pdata->pmsg_size;
457 cxt->dump_oops = pdata->dump_oops; 485 cxt->dump_oops = pdata->dump_oops;
458 cxt->ecc_info = pdata->ecc_info; 486 cxt->ecc_info = pdata->ecc_info;
459 487
460 paddr = cxt->phys_addr; 488 paddr = cxt->phys_addr;
461 489
462 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; 490 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
491 - cxt->pmsg_size;
463 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); 492 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
464 if (err) 493 if (err)
465 goto fail_out; 494 goto fail_out;
@@ -474,13 +503,9 @@ static int ramoops_probe(struct platform_device *pdev)
474 if (err) 503 if (err)
475 goto fail_init_fprz; 504 goto fail_init_fprz;
476 505
477 if (!cxt->przs && !cxt->cprz && !cxt->fprz) { 506 err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
478 pr_err("memory size too small, minimum is %zu\n", 507 if (err)
479 cxt->console_size + cxt->record_size + 508 goto fail_init_mprz;
480 cxt->ftrace_size);
481 err = -EINVAL;
482 goto fail_cnt;
483 }
484 509
485 cxt->pstore.data = cxt; 510 cxt->pstore.data = cxt;
486 /* 511 /*
@@ -525,7 +550,8 @@ fail_buf:
525 kfree(cxt->pstore.buf); 550 kfree(cxt->pstore.buf);
526fail_clear: 551fail_clear:
527 cxt->pstore.bufsize = 0; 552 cxt->pstore.bufsize = 0;
528fail_cnt: 553 kfree(cxt->mprz);
554fail_init_mprz:
529 kfree(cxt->fprz); 555 kfree(cxt->fprz);
530fail_init_fprz: 556fail_init_fprz:
531 kfree(cxt->cprz); 557 kfree(cxt->cprz);
@@ -583,6 +609,7 @@ static void ramoops_register_dummy(void)
583 dummy_data->record_size = record_size; 609 dummy_data->record_size = record_size;
584 dummy_data->console_size = ramoops_console_size; 610 dummy_data->console_size = ramoops_console_size;
585 dummy_data->ftrace_size = ramoops_ftrace_size; 611 dummy_data->ftrace_size = ramoops_ftrace_size;
612 dummy_data->pmsg_size = ramoops_pmsg_size;
586 dummy_data->dump_oops = dump_oops; 613 dummy_data->dump_oops = dump_oops;
587 /* 614 /*
588 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC 615 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
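
With the ramoops backend, the new region can be sized either with the module parameter (ramoops.pmsg_size=) or through board platform data; like the other regions it must be a power of two (the probe rounds it down otherwise), and it is carved out of mem_size before the dmesg dump zones are laid out. A board-file sketch with placeholder numbers:

    #include <linux/pstore_ram.h>

    static struct ramoops_platform_data ramoops_data = {
            .mem_address  = 0x8f000000,   /* placeholder reserved-RAM address */
            .mem_size     = 0x200000,     /* 2 MiB total */
            .record_size  = 0x20000,
            .console_size = 0x20000,
            .ftrace_size  = 0x20000,
            .pmsg_size    = 0x20000,      /* new member from this patch */
            .dump_oops    = 1,
    };
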
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index c51df1dd237e..4a09975aac90 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -5,6 +5,7 @@
5config QUOTA 5config QUOTA
6 bool "Quota support" 6 bool "Quota support"
7 select QUOTACTL 7 select QUOTACTL
8 select SRCU
8 help 9 help
9 If you say Y here, you will be able to set per user limits for disk 10 If you say Y here, you will be able to set per user limits for disk
10 usage (also called disk quotas). Currently, it works for the 11 usage (also called disk quotas). Currently, it works for the
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 69df5b239844..0ccd4ba3a246 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot)
1248 1248
1249 return capable(CAP_SYS_RESOURCE) && 1249 return capable(CAP_SYS_RESOURCE) &&
1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || 1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
1251 !(info->dqi_flags & V1_DQF_RSQUASH)); 1251 !(info->dqi_flags & DQF_ROOT_SQUASH));
1252} 1252}
1253 1253
1254/* needs dq_data_lock */ 1254/* needs dq_data_lock */
@@ -2385,14 +2385,84 @@ out:
 }
 EXPORT_SYMBOL(dquot_quota_on_mount);
 
-static inline qsize_t qbtos(qsize_t blocks)
+static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
 {
-	return blocks << QIF_DQBLKSIZE_BITS;
+	int ret;
+	int type;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+		return -ENOSYS;
+	/* Accounting cannot be turned on while fs is mounted */
+	flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
+	if (!flags)
+		return -EINVAL;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!(flags & qtype_enforce_flag(type)))
+			continue;
+		/* Can't enforce without accounting */
+		if (!sb_has_quota_usage_enabled(sb, type))
+			return -EINVAL;
+		ret = dquot_enable(dqopt->files[type], type,
+				   dqopt->info[type].dqi_fmt_id,
+				   DQUOT_LIMITS_ENABLED);
+		if (ret < 0)
+			goto out_err;
+	}
+	return 0;
+out_err:
+	/* Backout enforcement enablement we already did */
+	for (type--; type >= 0; type--) {
+		if (flags & qtype_enforce_flag(type))
+			dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+	}
+	/* Error code translation for better compatibility with XFS */
+	if (ret == -EBUSY)
+		ret = -EEXIST;
+	return ret;
 }
 
-static inline qsize_t stoqb(qsize_t space)
+static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
 {
-	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+	int ret;
+	int type;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+		return -ENOSYS;
+	/*
+	 * We don't support turning off accounting via quotactl. In principle
+	 * quota infrastructure can do this but filesystems don't expect
+	 * userspace to be able to do it.
+	 */
+	if (flags &
+	    (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
+		return -EOPNOTSUPP;
+
+	/* Filter out limits not enabled */
+	for (type = 0; type < MAXQUOTAS; type++)
+		if (!sb_has_quota_limits_enabled(sb, type))
+			flags &= ~qtype_enforce_flag(type);
+	/* Nothing left? */
+	if (!flags)
+		return -EEXIST;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (flags & qtype_enforce_flag(type)) {
+			ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+			if (ret < 0)
+				goto out_err;
+		}
+	}
+	return 0;
+out_err:
+	/* Backout enforcement disabling we already did */
+	for (type--; type >= 0; type--) {
+		if (flags & qtype_enforce_flag(type))
+			dquot_enable(dqopt->files[type], type,
+				     dqopt->info[type].dqi_fmt_id,
+				     DQUOT_LIMITS_ENABLED);
+	}
+	return ret;
 }
 
 /* Generic routine for getting common part of quota structure */
@@ -2444,13 +2514,13 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
 		return -EINVAL;
 
 	if (((di->d_fieldmask & QC_SPC_SOFT) &&
-	     stoqb(di->d_spc_softlimit) > dqi->dqi_maxblimit) ||
+	     di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
 	    ((di->d_fieldmask & QC_SPC_HARD) &&
-	     stoqb(di->d_spc_hardlimit) > dqi->dqi_maxblimit) ||
+	     di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
 	    ((di->d_fieldmask & QC_INO_SOFT) &&
-	     (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
+	     (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
 	    ((di->d_fieldmask & QC_INO_HARD) &&
-	     (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
+	     (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
 		return -ERANGE;
 
 	spin_lock(&dq_data_lock);
@@ -2577,6 +2647,14 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 		goto out;
 	}
 	mi = sb_dqopt(sb)->info + type;
+	if (ii->dqi_valid & IIF_FLAGS) {
+		if (ii->dqi_flags & ~DQF_SETINFO_MASK ||
+		    (ii->dqi_flags & DQF_ROOT_SQUASH &&
+		     mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
 	spin_lock(&dq_data_lock);
 	if (ii->dqi_valid & IIF_BGRACE)
 		mi->dqi_bgrace = ii->dqi_bgrace;
@@ -2606,6 +2684,17 @@ const struct quotactl_ops dquot_quotactl_ops = {
 };
 EXPORT_SYMBOL(dquot_quotactl_ops);
 
+const struct quotactl_ops dquot_quotactl_sysfile_ops = {
+	.quota_enable	= dquot_quota_enable,
+	.quota_disable	= dquot_quota_disable,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
+};
+EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
+
 static int do_proc_dqstats(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
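
The new dquot_quotactl_sysfile_ops table targets filesystems that keep quota metadata in hidden system inodes rather than user-visible quota files. A minimal sketch of opting in at mount time follows; it is not part of this patch, "myfs" is a hypothetical filesystem, and the rest of the superblock setup is elided.

	#include <linux/fs.h>
	#include <linux/quotaops.h>

	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* ... usual superblock setup elided ... */

		sb->s_qcop = &dquot_quotactl_sysfile_ops;
		/*
		 * Assumption: both quota_enable() and quota_disable() above
		 * return -ENOSYS unless this flag is set on the sb's dqopt.
		 */
		sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
		return 0;
	}
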
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 6f3856328eea..d14a799c7785 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -66,18 +66,40 @@ static int quota_sync_all(int type)
 	return ret;
 }
 
+unsigned int qtype_enforce_flag(int type)
+{
+	switch (type) {
+	case USRQUOTA:
+		return FS_QUOTA_UDQ_ENFD;
+	case GRPQUOTA:
+		return FS_QUOTA_GDQ_ENFD;
+	case PRJQUOTA:
+		return FS_QUOTA_PDQ_ENFD;
+	}
+	return 0;
+}
+
 static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
 		         struct path *path)
 {
-	if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+	if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
 		return -ENOSYS;
-	if (sb->s_qcop->quota_on_meta)
-		return sb->s_qcop->quota_on_meta(sb, type, id);
+	if (sb->s_qcop->quota_enable)
+		return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	return sb->s_qcop->quota_on(sb, type, id, path);
 }
 
+static int quota_quotaoff(struct super_block *sb, int type)
+{
+	if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable)
+		return -ENOSYS;
+	if (sb->s_qcop->quota_disable)
+		return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));
+	return sb->s_qcop->quota_off(sb, type);
+}
+
 static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
 {
 	__u32 fmt;
@@ -208,15 +230,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
 	return sb->s_qcop->set_dqblk(sb, qid, &fdq);
 }
 
-static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
+static int quota_enable(struct super_block *sb, void __user *addr)
 {
 	__u32 flags;
 
 	if (copy_from_user(&flags, addr, sizeof(flags)))
 		return -EFAULT;
-	if (!sb->s_qcop->set_xstate)
+	if (!sb->s_qcop->quota_enable)
 		return -ENOSYS;
-	return sb->s_qcop->set_xstate(sb, flags, cmd);
+	return sb->s_qcop->quota_enable(sb, flags);
+}
+
+static int quota_disable(struct super_block *sb, void __user *addr)
+{
+	__u32 flags;
+
+	if (copy_from_user(&flags, addr, sizeof(flags)))
+		return -EFAULT;
+	if (!sb->s_qcop->quota_disable)
+		return -ENOSYS;
+	return sb->s_qcop->quota_disable(sb, flags);
 }
 
 static int quota_getxstate(struct super_block *sb, void __user *addr)
@@ -429,9 +462,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 	case Q_QUOTAON:
 		return quota_quotaon(sb, type, cmd, id, path);
 	case Q_QUOTAOFF:
-		if (!sb->s_qcop->quota_off)
-			return -ENOSYS;
-		return sb->s_qcop->quota_off(sb, type);
+		return quota_quotaoff(sb, type);
 	case Q_GETFMT:
 		return quota_getfmt(sb, type, addr);
 	case Q_GETINFO:
@@ -447,8 +478,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 			return -ENOSYS;
 		return sb->s_qcop->quota_sync(sb, type);
 	case Q_XQUOTAON:
+		return quota_enable(sb, addr);
 	case Q_XQUOTAOFF:
-		return quota_setxstate(sb, cmd, addr);
+		return quota_disable(sb, addr);
 	case Q_XQUOTARM:
 		return quota_rmxquota(sb, addr);
 	case Q_XGETQSTAT:
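
With quota_enable()/quota_disable() wired into Q_XQUOTAON/Q_XQUOTAOFF, the XFS-style quotactl interface now reaches VFS-quota filesystems too. A hedged userspace sketch of turning on user-quota enforcement; the device path is a placeholder and the exact header providing FS_QUOTA_UDQ_ENFD may vary by libc/distribution.

	#include <stdio.h>
	#include <string.h>
	#include <errno.h>
	#include <sys/quota.h>
	#include <linux/dqblk_xfs.h>	/* Q_XQUOTAON, FS_QUOTA_UDQ_ENFD */

	int main(void)
	{
		/* Enforcement only: accounting cannot be toggled this way. */
		unsigned int flags = FS_QUOTA_UDQ_ENFD;

		if (quotactl(QCMD(Q_XQUOTAON, USRQUOTA), "/dev/sda1", 0,
			     (void *)&flags) < 0)
			fprintf(stderr, "Q_XQUOTAON: %s\n", strerror(errno));
		return 0;
	}
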
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 469c6848b322..8fe79beced5c 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type)
 	}
 	ret = 0;
 	/* limits are stored as unsigned 32-bit data */
-	dqopt->info[type].dqi_maxblimit = 0xffffffff;
-	dqopt->info[type].dqi_maxilimit = 0xffffffff;
+	dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+	dqopt->info[type].dqi_max_ino_limit = 0xffffffff;
 	dqopt->info[type].dqi_igrace =
 			dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
 	dqopt->info[type].dqi_bgrace =
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 02751ec695c5..9cb10d7197f7 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -117,16 +117,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	qinfo = info->dqi_priv;
 	if (version == 0) {
 		/* limits are stored as unsigned 32-bit data */
-		info->dqi_maxblimit = 0xffffffff;
-		info->dqi_maxilimit = 0xffffffff;
+		info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+		info->dqi_max_ino_limit = 0xffffffff;
 	} else {
-		/* used space is stored as unsigned 64-bit value */
-		info->dqi_maxblimit = 0xffffffffffffffffULL;	/* 2^64-1 */
-		info->dqi_maxilimit = 0xffffffffffffffffULL;
+		/* used space is stored as unsigned 64-bit value in bytes */
+		info->dqi_max_spc_limit = 0xffffffffffffffffULL;	/* 2^64-1 */
+		info->dqi_max_ino_limit = 0xffffffffffffffffULL;
 	}
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
-	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
+	/* No flags currently supported */
+	info->dqi_flags = 0;
 	qinfo->dqi_sb = sb;
 	qinfo->dqi_type = type;
 	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
@@ -157,7 +158,8 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	info->dqi_flags &= ~DQF_INFO_DIRTY;
 	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
-	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+	/* No flags currently supported */
+	dinfo.dqi_flags = cpu_to_le32(0);
 	spin_unlock(&dq_data_lock);
 	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
 	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 						   unsigned long flags);
 static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
 
+static unsigned ramfs_mmap_capabilities(struct file *file)
+{
+	return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
+		NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
 const struct file_operations ramfs_file_operations = {
+	.mmap_capabilities	= ramfs_mmap_capabilities,
 	.mmap			= ramfs_nommu_mmap,
 	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
 	.read			= new_sync_read,
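
Under !CONFIG_MMU the capability bits that used to live on a backing_dev_info now come from the file itself via ->mmap_capabilities(). A sketch of what a driver that can only satisfy private mappings by copying might report; "mydrv" is hypothetical.

	#include <linux/fs.h>

	static unsigned mydrv_mmap_capabilities(struct file *file)
	{
		return NOMMU_MAP_COPY;	/* MAP_PRIVATE via copy only */
	}
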
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
 	.set_page_dirty	= __set_page_dirty_no_writeback,
 };
 
-static struct backing_dev_info ramfs_backing_dev_info = {
-	.name		= "ramfs",
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
-			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
-			  BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
-};
-
 struct inode *ramfs_get_inode(struct super_block *sb,
 				const struct inode *dir, umode_t mode, dev_t dev)
 {
@@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
-		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = {
 int __init init_ramfs_fs(void)
 {
 	static unsigned long once;
-	int err;
 
 	if (test_and_set_bit(0, &once))
 		return 0;
-
-	err = bdi_init(&ramfs_backing_dev_info);
-	if (err)
-		return err;
-
-	err = register_filesystem(&ramfs_fs_type);
-	if (err)
-		bdi_destroy(&ramfs_backing_dev_info);
-
-	return err;
+	return register_filesystem(&ramfs_fs_type);
 }
 fs_initcall(init_ramfs_fs);
diff --git a/fs/read_write.c b/fs/read_write.c
index c0805c93b6fa..8e1b68786d66 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -333,6 +333,52 @@ out_putf:
 }
 #endif
 
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->read_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= READ;
+	ret = file->f_op->read_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_read);
+
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->write_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= WRITE;
+	ret = file->f_op->write_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_write);
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
@@ -358,7 +404,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
 		return retval;
 	}
 
-	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
+	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 		retval = locks_mandatory_area(
 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 			inode, file, pos, count);
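
vfs_iter_read()/vfs_iter_write() give in-kernel callers a single entry point for ->read_iter/->write_iter. A hedged sketch of reading into a kernel buffer; the caller and the iov_iter_kvec() signature of this era are assumptions.

	#include <linux/fs.h>
	#include <linux/uio.h>

	static ssize_t read_kernel_buf(struct file *filp, void *buf, size_t len,
				       loff_t *pos)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;

		iov_iter_kvec(&iter, READ | ITER_KVEC, &kv, 1, len);
		/* on success *pos has been advanced by the helper */
		return vfs_iter_read(filp, &iter, pos);
	}
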
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a7eec9888f10..e72401e1f995 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2766,7 +2766,7 @@ static int reiserfs_write_begin(struct file *file,
 	int old_ref = 0;
 
 	inode = mapping->host;
-	*fsdata = 0;
+	*fsdata = NULL;
 	if (flags & AOP_FLAG_CONT_EXPAND &&
 	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
 		pos ++;
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
 }
 
+static unsigned romfs_mmap_capabilities(struct file *file)
+{
+	struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
+
+	if (!mtd)
+		return NOMMU_MAP_COPY;
+	return mtd_mmap_capabilities(mtd);
+}
+
 const struct file_operations romfs_ro_fops = {
 	.llseek			= generic_file_llseek,
 	.read			= new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
 	.splice_read		= generic_file_splice_read,
 	.mmap			= romfs_mmap,
 	.get_unmapped_area	= romfs_get_unmapped_area,
+	.mmap_capabilities	= romfs_mmap_capabilities,
 };
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
 	case ROMFH_REG:
 		i->i_fop = &romfs_ro_fops;
 		i->i_data.a_ops = &romfs_aops;
-		if (i->i_sb->s_mtd)
-			i->i_data.backing_dev_info =
-				i->i_sb->s_mtd->backing_dev_info;
 		if (nextfh & ROMFH_EXEC)
 			mode |= S_IXUGO;
 		break;
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 	if (ret == -EINTR) {
 		struct restart_block *restart_block;
 
-		restart_block = &current_thread_info()->restart_block;
+		restart_block = &current->restart_block;
 		restart_block->fn = do_restart_poll;
 		restart_block->poll.ufds = ufds;
 		restart_block->poll.nfds = nfds;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index dbf3a59c86bb..555f82155be8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
 	return res;
 }
 
-int seq_bitmap(struct seq_file *m, const unsigned long *bits,
-				   unsigned int nr_bits)
-{
-	if (m->count < m->size) {
-		int len = bitmap_scnprintf(m->buf + m->count,
-				m->size - m->count, bits, nr_bits);
-		if (m->count + len < m->size) {
-			m->count += len;
-			return 0;
-		}
-	}
-	seq_set_overflow(m);
-	return -1;
-}
-EXPORT_SYMBOL(seq_bitmap);
-
-int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
-		unsigned int nr_bits)
-{
-	if (m->count < m->size) {
-		int len = bitmap_scnlistprintf(m->buf + m->count,
-				m->size - m->count, bits, nr_bits);
-		if (m->count + len < m->size) {
-			m->count += len;
-			return 0;
-		}
-	}
-	seq_set_overflow(m);
-	return -1;
-}
-EXPORT_SYMBOL(seq_bitmap_list);
-
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
 	return NULL + (*pos == 0);
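
Callers of the removed helpers can presumably get the same output through the %*pb and %*pbl vsprintf extensions from the same development cycle; a short sketch, assuming a seq_file context:

	#include <linux/seq_file.h>

	static void show_mask(struct seq_file *m, const unsigned long *mask, int nbits)
	{
		seq_printf(m, "mask=%*pb\n", nbits, mask);	/* hex bitmap */
		seq_printf(m, "list=%*pbl\n", nbits, mask);	/* range list */
	}
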
diff --git a/fs/splice.c b/fs/splice.c
index 75c6058eabf2..7968da96bebb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	splice_from_pipe_begin(&sd);
 	while (sd.total_len) {
 		struct iov_iter from;
-		struct kiocb kiocb;
 		size_t left;
 		int n, idx;
 
@@ -1005,29 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 			left -= this_len;
 		}
 
-		/* ... iov_iter */
-		from.type = ITER_BVEC | WRITE;
-		from.bvec = array;
-		from.nr_segs = n;
-		from.count = sd.total_len - left;
-		from.iov_offset = 0;
-
-		/* ... and iocb */
-		init_sync_kiocb(&kiocb, out);
-		kiocb.ki_pos = sd.pos;
-		kiocb.ki_nbytes = sd.total_len - left;
-
-		/* now, send it */
-		ret = out->f_op->write_iter(&kiocb, &from);
-		if (-EIOCBQUEUED == ret)
-			ret = wait_on_sync_kiocb(&kiocb);
-
+		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
+			      sd.total_len - left);
+		ret = vfs_iter_write(out, &from, &sd.pos);
 		if (ret <= 0)
 			break;
 
 		sd.num_spliced += ret;
 		sd.total_len -= ret;
-		*ppos = sd.pos = kiocb.ki_pos;
+		*ppos = sd.pos;
 
 		/* dismiss the fully eaten buffers, adjust the partial one */
 		while (ret) {
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..65a53efc1cf4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
 #include "internal.h"
 
 
-LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_writers",
@@ -75,10 +75,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 		return SHRINK_STOP;
 
 	if (sb->s_op->nr_cached_objects)
-		fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-	inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
-	dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
 	total_objects = dentries + inodes + fs_objects + 1;
 	if (!total_objects)
 		total_objects = 1;
@@ -86,19 +86,23 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	/* proportion the scan between the caches */
 	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
 	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
 
 	/*
 	 * prune the dcache first as the icache is pinned by it, then
 	 * prune the icache, followed by the filesystem specific caches
+	 *
+	 * Ensure that we always scan at least one object - memcg kmem
+	 * accounting uses this to fully empty the caches.
 	 */
-	freed = prune_dcache_sb(sb, dentries, sc->nid);
-	freed += prune_icache_sb(sb, inodes, sc->nid);
+	sc->nr_to_scan = dentries + 1;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes + 1;
+	freed += prune_icache_sb(sb, sc);
 
 	if (fs_objects) {
-		fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
-								total_objects);
-		freed += sb->s_op->free_cached_objects(sb, fs_objects,
-						       sc->nid);
+		sc->nr_to_scan = fs_objects + 1;
+		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
 	drop_super(sb);
@@ -118,17 +122,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	 * scalability bottleneck. The counts could get updated
 	 * between super_cache_count and super_cache_scan anyway.
 	 * Call to super_cache_count with shrinker_rwsem held
-	 * ensures the safety of call to list_lru_count_node() and
+	 * ensures the safety of call to list_lru_shrink_count() and
 	 * s_op->nr_cached_objects().
 	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
-		total_objects = sb->s_op->nr_cached_objects(sb,
-						 sc->nid);
+		total_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-	total_objects += list_lru_count_node(&sb->s_dentry_lru,
-						 sc->nid);
-	total_objects += list_lru_count_node(&sb->s_inode_lru,
-						 sc->nid);
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
 
 	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
@@ -185,15 +186,15 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	}
 	init_waitqueue_head(&s->s_writers.wait);
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	s->s_bdi = &default_backing_dev_info;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
 
-	if (list_lru_init(&s->s_dentry_lru))
+	if (list_lru_init_memcg(&s->s_dentry_lru))
 		goto fail;
-	if (list_lru_init(&s->s_inode_lru))
+	if (list_lru_init_memcg(&s->s_inode_lru))
 		goto fail;
 
 	init_rwsem(&s->s_umount);
@@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	s->s_shrink.scan_objects = super_cache_scan;
 	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
-	s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
 	return s;
 
 fail:
@@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s)
 		unregister_shrinker(&s->s_shrink);
 		fs->kill_sb(s);
 
+		/*
+		 * Since list_lru_destroy() may sleep, we cannot call it from
+		 * put_super(), where we hold the sb_lock. Therefore we destroy
+		 * the lru lists right now.
+		 */
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -706,9 +715,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
 	if (remount_ro) {
-		if (sb->s_pins.first) {
+		if (!hlist_empty(&sb->s_pins)) {
 			up_write(&sb->s_umount);
-			sb_pin_kill(sb);
+			group_pin_kill(&sb->s_pins);
 			down_write(&sb->s_umount);
 			if (!sb->s_root)
 				return 0;
@@ -863,10 +872,7 @@ EXPORT_SYMBOL(free_anon_bdev);
 
 int set_anon_super(struct super_block *s, void *data)
 {
-	int error = get_anon_bdev(&s->s_dev);
-	if (!error)
-		s->s_bdi = &noop_backing_dev_info;
-	return error;
+	return get_anon_bdev(&s->s_dev);
 }
 
 EXPORT_SYMBOL(set_anon_super);
@@ -1111,7 +1117,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	sb = root->d_sb;
 	BUG_ON(!sb);
 	WARN_ON(!sb->s_bdi);
-	WARN_ON(sb->s_bdi == &default_backing_dev_info);
 	sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(sb, flags, secdata);
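
A worked example of the proportional split in super_cache_scan(), as a standalone userspace demo (numbers are made up; mult_frac is copied from include/linux/kernel.h and relies on GCC statement expressions):

	#include <stdio.h>

	#define mult_frac(x, numer, denom) ({			\
		typeof(x) quot = (x) / (denom);			\
		typeof(x) rem  = (x) % (denom);			\
		(quot * (numer)) + ((rem * (numer)) / (denom));	\
	})

	int main(void)
	{
		unsigned long nr_to_scan = 128, dentries = 600, inodes = 300,
			      fs_objects = 100;
		unsigned long total = dentries + inodes + fs_objects + 1;

		/* each cache gets its share, plus one so memcg can drain it */
		printf("dentries: %lu\n", mult_frac(nr_to_scan, dentries, total) + 1);
		printf("inodes:   %lu\n", mult_frac(nr_to_scan, inodes, total) + 1);
		printf("fs objs:  %lu\n", mult_frac(nr_to_scan, fs_objects, total) + 1);
		return 0;
	}
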
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct inode *inode = file->f_mapping->host;
+
 	if (!file->f_op->fsync)
 		return -EINVAL;
+	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
 	return file->f_op->fsync(file, start, end, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync_range);
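
The I_DIRTY_TIME handling above is what makes an explicit fsync() (datasync == 0) reliable on lazytime mounts: deferred timestamp updates are promoted to real inode dirtiness before ->fsync runs. A hedged userspace sketch, assuming a lazytime mount at a placeholder path:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/lazytime/file", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "x", 1) == 1)	/* mtime updated in memory only */
			fsync(fd);		/* clears I_DIRTY_TIME, writes inode */
		close(fd);
		return 0;
	}
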
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dfe928a9540f..7c2867b44141 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 	key = attr->key ?: (struct lock_class_key *)&attr->skey;
 #endif
 	kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
-				  (void *)attr, ns, true, key);
+				  (void *)attr, ns, key);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
 			sysfs_warn_dup(parent, attr->name);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 7d2a860ba788..2554d8835b48 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -99,7 +99,7 @@ static int internal_create_group(struct kobject *kobj, int update,
 		return -EINVAL;
 	if (!grp->attrs && !grp->bin_attrs) {
 		WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
-			kobj->name, grp->name ? "" : grp->name);
+			kobj->name, grp->name ?: "");
 		return -EINVAL;
 	}
 	if (grp->name) {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7ed13e1e216a..4cfb3e82c56f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2032,6 +2032,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		long long blk_offs;
 		struct ubifs_data_node *dn = node;
 
+		ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ);
+
 		/*
 		 * Search the inode node this data node belongs to and insert
 		 * it to the RB-tree of inodes.
@@ -2060,6 +2062,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		struct ubifs_dent_node *dent = node;
 		struct fsck_inode *fscki1;
 
+		ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ);
+
 		err = ubifs_validate_entry(c, dent);
 		if (err)
 			goto out_dump;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..0fa6c803992e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 ubifs_current_time(inode);
 	inode->i_mapping->nrpages = 0;
-	/* Disable readahead */
-	inode->i_mapping->backing_dev_info = &c->bdi;
 
 	switch (mode & S_IFMT) {
 	case S_IFREG:
@@ -272,6 +270,10 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		goto out_budg;
 	}
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
@@ -728,6 +730,10 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto out_budg;
 	}
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	insert_inode_hash(inode);
 	inc_nlink(inode);
@@ -808,6 +814,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 	ui->data = dev;
 	ui->data_len = devlen;
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
@@ -884,6 +894,10 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 	ui->data_len = len;
 	inode->i_size = ubifs_inode(inode)->ui_size = len;
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 538519ee37d9..e627c0acf626 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
 	.fault        = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = ubifs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1574,6 +1573,10 @@ const struct inode_operations ubifs_symlink_inode_operations = {
 	.follow_link = ubifs_follow_link,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
+	.setxattr    = ubifs_setxattr,
+	.getxattr    = ubifs_getxattr,
+	.listxattr   = ubifs_listxattr,
+	.removexattr = ubifs_removexattr,
 };
 
 const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 3187925e9879..9b40a1c5e160 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1028,9 +1028,22 @@ int ubifs_replay_journal(struct ubifs_info *c)
 
 	do {
 		err = replay_log_leb(c, lnum, 0, c->sbuf);
-		if (err == 1)
-			/* We hit the end of the log */
-			break;
+		if (err == 1) {
+			if (lnum != c->lhead_lnum)
+				/* We hit the end of the log */
+				break;
+
+			/*
+			 * The head of the log must always start with the
+			 * "commit start" node on a properly formatted UBIFS.
+			 * But we found no nodes at all, which means that
+			 * something went wrong and we cannot proceed mounting
+			 * the file-system.
+			 */
+			ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
+				  lnum, 0);
+			err = -EINVAL;
+		}
 		if (err)
 			goto out;
 		lnum = ubifs_next_log_lnum(c, lnum);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..93e946561c5c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 	if (err)
 		goto out_invalid;
 
-	/* Disable read-ahead */
-	inode->i_mapping->backing_dev_info = &c->bdi;
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
 	 */
 	c->bdi.name = "ubifs",
-	c->bdi.capabilities = BDI_CAP_MAP_COPY;
+	c->bdi.capabilities = 0;
 	err = bdi_init(&c->bdi);
 	if (err)
 		goto out_close;
@@ -2039,6 +2036,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	if (c->max_inode_sz > MAX_LFS_FILESIZE)
 		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
 	sb->s_op = &ubifs_super_operations;
+	sb->s_xattr = ubifs_xattr_handlers;
 
 	mutex_lock(&c->umount_mutex);
 	err = mount_ubifs(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c4fe900c67ab..bc04b9c69891 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -36,6 +36,7 @@
 #include <linux/mtd/ubi.h>
 #include <linux/pagemap.h>
 #include <linux/backing-dev.h>
+#include <linux/security.h>
 #include "ubifs-media.h"
 
 /* Version of this UBIFS implementation */
@@ -1465,6 +1466,7 @@ extern spinlock_t ubifs_infos_lock;
 extern atomic_long_t ubifs_clean_zn_cnt;
 extern struct kmem_cache *ubifs_inode_slab;
 extern const struct super_operations ubifs_super_operations;
+extern const struct xattr_handler *ubifs_xattr_handlers[];
 extern const struct address_space_operations ubifs_file_address_operations;
 extern const struct file_operations ubifs_file_operations;
 extern const struct inode_operations ubifs_file_inode_operations;
@@ -1754,6 +1756,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 		       size_t size);
 ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 int ubifs_removexattr(struct dentry *dentry, const char *name);
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+			const struct qstr *qstr);
 
 /* super.c */
 struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 5e0a63b1b0d5..a92be244a6fb 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -100,24 +100,30 @@ static const struct file_operations empty_fops;
 static int create_xattr(struct ubifs_info *c, struct inode *host,
 			const struct qstr *nm, const void *value, int size)
 {
-	int err;
+	int err, names_len;
 	struct inode *inode;
 	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
 	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
 				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
 				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
 
-	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
+		ubifs_err("inode %lu already has too many xattrs (%d), cannot create more",
+			  host->i_ino, host_ui->xattr_cnt);
 		return -ENOSPC;
+	}
 	/*
 	 * Linux limits the maximum size of the extended attribute names list
 	 * to %XATTR_LIST_MAX. This means we should not allow creating more
 	 * extended attributes if the name list becomes larger. This limitation
 	 * is artificial for UBIFS, though.
 	 */
-	if (host_ui->xattr_names + host_ui->xattr_cnt +
-					nm->len + 1 > XATTR_LIST_MAX)
+	names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
+	if (names_len > XATTR_LIST_MAX) {
+		ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
+			  host->i_ino, names_len, XATTR_LIST_MAX);
 		return -ENOSPC;
+	}
 
 	err = ubifs_budget_space(c, &req);
 	if (err)
@@ -293,18 +299,16 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
 	return ERR_PTR(-EINVAL);
 }
 
-int ubifs_setxattr(struct dentry *dentry, const char *name,
-		   const void *value, size_t size, int flags)
+static int setxattr(struct inode *host, const char *name, const void *value,
+		    size_t size, int flags)
 {
-	struct inode *inode, *host = dentry->d_inode;
+	struct inode *inode;
 	struct ubifs_info *c = host->i_sb->s_fs_info;
 	struct qstr nm = QSTR_INIT(name, strlen(name));
 	struct ubifs_dent_node *xent;
 	union ubifs_key key;
 	int err, type;
 
-	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name,
-		host->i_ino, dentry, size);
 	ubifs_assert(mutex_is_locked(&host->i_mutex));
 
 	if (size > UBIFS_MAX_INO_DATA)
@@ -356,6 +360,15 @@ out_free:
 	return err;
 }
 
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+		   const void *value, size_t size, int flags)
+{
+	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
+		name, dentry->d_inode->i_ino, dentry, size);
+
+	return setxattr(dentry->d_inode, name, value, size, flags);
+}
+
 ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 		       size_t size)
 {
@@ -568,3 +581,84 @@ out_free:
 	kfree(xent);
 	return err;
 }
+
+static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
+				 const char *name, size_t name_len, int flags)
+{
+	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+
+	return total_len;
+}
+
+static int security_getxattr(struct dentry *d, const char *name, void *buffer,
+			     size_t size, int flags)
+{
+	return ubifs_getxattr(d, name, buffer, size);
+}
+
+static int security_setxattr(struct dentry *d, const char *name,
+			     const void *value, size_t size, int flags,
+			     int handler_flags)
+{
+	return ubifs_setxattr(d, name, value, size, flags);
+}
+
+static const struct xattr_handler ubifs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list   = security_listxattr,
+	.get    = security_getxattr,
+	.set    = security_setxattr,
+};
+
+const struct xattr_handler *ubifs_xattr_handlers[] = {
+	&ubifs_xattr_security_handler,
+	NULL,
+};
+
+static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
+		       void *fs_info)
+{
+	const struct xattr *xattr;
+	char *name;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+			       strlen(xattr->name) + 1, GFP_NOFS);
+		if (!name) {
+			err = -ENOMEM;
+			break;
+		}
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+		err = setxattr(inode, name, xattr->value, xattr->value_len, 0);
+		kfree(name);
+		if (err < 0)
+			break;
+	}
+
+	return err;
+}
+
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+			const struct qstr *qstr)
+{
+	int err;
+
+	mutex_lock(&inode->i_mutex);
+	err = security_inode_init_security(inode, dentry, qstr,
+					   &init_xattrs, 0);
+	mutex_unlock(&inode->i_mutex);
+
+	if (err)
+		ubifs_err("cannot initialize security for inode %lu, error %d",
+			  inode->i_ino, err);
+	return err;
+}
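
Once the security.* handler is registered and ubifs_init_security() runs at create time, LSM labels become visible to userspace through the regular xattr calls. A small sketch; the mount point and attribute name are placeholders:

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		char label[256];
		ssize_t n = getxattr("/mnt/ubifs/newfile", "security.selinux",
				     label, sizeof(label) - 1);

		if (n > 0) {
			label[n] = '\0';
			printf("label: %s\n", label);
		}
		return 0;
	}
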
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..c6e17a744c3b 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -2,10 +2,12 @@ config UDF_FS
 	tristate "UDF file system support"
 	select CRC_ITU_T
 	help
-	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
-	  you intend to mount DVD discs or CDRW's written in packet mode, or
-	  if written to by other UDF utilities, such as DirectCD.
-	  Please read <file:Documentation/filesystems/udf.txt>.
+	  This is a file system used on some CD-ROMs and DVDs. Since the
+	  file system is supported by multiple operating systems and is more
+	  compatible with standard unix file systems, it is also suitable for
+	  removable USB disks. Say Y if you intend to mount DVD discs or CDRW's
+	  written in packet mode, or if you want to use UDF for removable USB
+	  disks. Please read <file:Documentation/filesystems/udf.txt>.
 
 	  To compile this file system support as a module, choose M here: the
 	  module will be called udf.
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5bc71d9a674a..a445d599098d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -750,7 +750,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 	/* Are we beyond EOF? */
 	if (etype == -1) {
 		int ret;
-		isBeyondEOF = 1;
+		isBeyondEOF = true;
 		if (count) {
 			if (c)
 				laarr[0] = laarr[1];
@@ -792,7 +792,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 		endnum = c + 1;
 		lastblock = 1;
 	} else {
-		isBeyondEOF = 0;
+		isBeyondEOF = false;
 		endnum = startnum = ((count > 2) ? 2 : count);
 
 		/* if the current extent is in position 0,
@@ -1288,6 +1288,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
 	struct kernel_lb_addr *iloc = &iinfo->i_location;
 	unsigned int link_count;
 	unsigned int indirections = 0;
+	int bs = inode->i_sb->s_blocksize;
 	int ret = -EIO;
 
 reread:
@@ -1374,38 +1375,35 @@ reread:
 	if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
 		iinfo->i_efe = 1;
 		iinfo->i_use = 0;
-		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+		ret = udf_alloc_i_data(inode, bs -
 					sizeof(struct extendedFileEntry));
 		if (ret)
 			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct extendedFileEntry),
-		       inode->i_sb->s_blocksize -
-					sizeof(struct extendedFileEntry));
+		       bs - sizeof(struct extendedFileEntry));
 	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
 		iinfo->i_efe = 0;
 		iinfo->i_use = 0;
-		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-						sizeof(struct fileEntry));
+		ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
 		if (ret)
 			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct fileEntry),
-		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
+		       bs - sizeof(struct fileEntry));
 	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
 		iinfo->i_efe = 0;
 		iinfo->i_use = 1;
 		iinfo->i_lenAlloc = le32_to_cpu(
 				((struct unallocSpaceEntry *)bh->b_data)->
 				 lengthAllocDescs);
-		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+		ret = udf_alloc_i_data(inode, bs -
 					sizeof(struct unallocSpaceEntry));
 		if (ret)
 			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct unallocSpaceEntry),
-		       inode->i_sb->s_blocksize -
-					sizeof(struct unallocSpaceEntry));
+		       bs - sizeof(struct unallocSpaceEntry));
 		return 0;
 	}
 
@@ -1489,6 +1487,15 @@ reread:
 	}
 	inode->i_generation = iinfo->i_unique;
 
+	/*
+	 * Sanity check length of allocation descriptors and extended attrs to
+	 * avoid integer overflows
+	 */
+	if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
+		goto out;
+	/* Now do exact checks */
+	if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
+		goto out;
 	/* Sanity checks for files in ICB so that we don't get confused later */
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
 		/*
@@ -1498,8 +1505,7 @@ reread:
 		if (iinfo->i_lenAlloc != inode->i_size)
 			goto out;
 		/* File in ICB has to fit in there... */
-		if (inode->i_size > inode->i_sb->s_blocksize -
-					udf_file_entry_alloc_offset(inode))
+		if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
 			goto out;
 	}
 
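
The two-step check exists because the exact comparison alone can wrap: with 32-bit lengths read from disk, offset + i_lenAlloc may overflow before being compared against the block size. A standalone demo with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t bs = 2048, alloc_off = 176;
		uint32_t len_alloc = UINT32_MAX - 100;	/* hostile on-disk value */

		/* exact check alone: the sum wraps to 75 and slips through */
		printf("naive:  %s\n",
		       alloc_off + len_alloc > bs ? "reject" : "ACCEPT (bug)");
		/* staged check from the patch rejects it up front */
		printf("staged: %s\n", len_alloc > bs ? "reject" : "accept");
		return 0;
	}
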
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3ccb2f11fc76..f169411c4ea0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1599,7 +1599,7 @@ static noinline int udf_process_sequence(
 	struct udf_vds_record *curr;
 	struct generic_desc *gd;
 	struct volDescPtr *vdp;
-	int done = 0;
+	bool done = false;
 	uint32_t vdsn;
 	uint16_t ident;
 	long next_s = 0, next_e = 0;
@@ -1680,7 +1680,7 @@ static noinline int udf_process_sequence(
 			lastblock = next_e;
 			next_s = next_e = 0;
 		} else
-			done = 1;
+			done = true;
 		break;
 	}
 	brelse(bh);
@@ -2300,6 +2300,7 @@ static void udf_put_super(struct super_block *sb)
 	udf_close_lvid(sb);
 	brelse(sbi->s_lvid_bh);
 	udf_sb_free_partitions(sb);
+	mutex_destroy(&sbi->s_alloc_mutex);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index da73801301d5..8092d3759a5e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -95,22 +95,18 @@
 
 void lock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	mutex_lock(&sbi->mutex);
 	sbi->mutex_owner = current;
-#endif
 }
 
 void unlock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	sbi->mutex_owner = NULL;
 	mutex_unlock(&sbi->mutex);
-#endif
 }
 
 static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
@@ -1415,9 +1411,11 @@ static struct kmem_cache * ufs_inode_cachep;
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+
+	ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
 }
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 53e95b2a1369..a7a3a63bb360 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -91,16 +91,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
91 return ptr; 91 return ptr;
92} 92}
93 93
94void
95kmem_free(const void *ptr)
96{
97 if (!is_vmalloc_addr(ptr)) {
98 kfree(ptr);
99 } else {
100 vfree(ptr);
101 }
102}
103
104void * 94void *
105kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, 95kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
106 xfs_km_flags_t flags) 96 xfs_km_flags_t flags)
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 64db0e53edea..cc6b768fc068 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -63,7 +63,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
63extern void *kmem_alloc(size_t, xfs_km_flags_t); 63extern void *kmem_alloc(size_t, xfs_km_flags_t);
64extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); 64extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
65extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); 65extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
66extern void kmem_free(const void *); 66static inline void kmem_free(const void *ptr)
67{
68 kvfree(ptr);
69}
67 70
68 71
69extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); 72extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
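
In the two kmem hunks, the open-coded vmalloc/kmalloc dispatch is deleted and kmem_free() becomes a static inline around kvfree(), which centralises exactly the same is_vmalloc_addr() test; inlining also saves callers an extra function call. For comparison with the removed body, this is the dispatch kvfree() performs, restated as a sketch:

static inline void kmem_free_equivalent(const void *ptr)
{
	if (is_vmalloc_addr(ptr))	/* came from vmalloc() */
		vfree(ptr);
	else				/* came from kmalloc(), or NULL */
		kfree(ptr);
}
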
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 5d38e8b8a913..15105dbc9e28 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -403,7 +403,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
403 if (!xfs_sb_version_hasattr2(&mp->m_sb)) { 403 if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
404 xfs_sb_version_addattr2(&mp->m_sb); 404 xfs_sb_version_addattr2(&mp->m_sb);
405 spin_unlock(&mp->m_sb_lock); 405 spin_unlock(&mp->m_sb_lock);
406 xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); 406 xfs_log_sb(tp);
407 } else 407 } else
408 spin_unlock(&mp->m_sb_lock); 408 spin_unlock(&mp->m_sb_lock);
409 } 409 }
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b5eb4743f75a..61ec015dca16 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -973,7 +973,11 @@ xfs_bmap_local_to_extents(
973 *firstblock = args.fsbno; 973 *firstblock = args.fsbno;
974 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 974 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
975 975
976 /* initialise the block and copy the data */ 976 /*
977 * Initialise the block and copy the data
978 *
979 * Note: init_fn must set the buffer log item type correctly!
980 */
977 init_fn(tp, bp, ip, ifp); 981 init_fn(tp, bp, ip, ifp);
978 982
979 /* account for the change in fork size and log everything */ 983 /* account for the change in fork size and log everything */
@@ -1221,22 +1225,20 @@ xfs_bmap_add_attrfork(
1221 goto bmap_cancel; 1225 goto bmap_cancel;
1222 if (!xfs_sb_version_hasattr(&mp->m_sb) || 1226 if (!xfs_sb_version_hasattr(&mp->m_sb) ||
1223 (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { 1227 (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
1224 __int64_t sbfields = 0; 1228 bool log_sb = false;
1225 1229
1226 spin_lock(&mp->m_sb_lock); 1230 spin_lock(&mp->m_sb_lock);
1227 if (!xfs_sb_version_hasattr(&mp->m_sb)) { 1231 if (!xfs_sb_version_hasattr(&mp->m_sb)) {
1228 xfs_sb_version_addattr(&mp->m_sb); 1232 xfs_sb_version_addattr(&mp->m_sb);
1229 sbfields |= XFS_SB_VERSIONNUM; 1233 log_sb = true;
1230 } 1234 }
1231 if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { 1235 if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
1232 xfs_sb_version_addattr2(&mp->m_sb); 1236 xfs_sb_version_addattr2(&mp->m_sb);
1233 sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); 1237 log_sb = true;
1234 } 1238 }
1235 if (sbfields) { 1239 spin_unlock(&mp->m_sb_lock);
1236 spin_unlock(&mp->m_sb_lock); 1240 if (log_sb)
1237 xfs_mod_sb(tp, sbfields); 1241 xfs_log_sb(tp);
1238 } else
1239 spin_unlock(&mp->m_sb_lock);
1240 } 1242 }
1241 1243
1242 error = xfs_bmap_finish(&tp, &flist, &committed); 1244 error = xfs_bmap_finish(&tp, &flist, &committed);
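
Because xfs_log_sb() (introduced later in this diff) always logs the entire superblock, callers no longer accumulate a per-field dirty mask; a single flag recording whether any feature bit changed is enough. The pattern this hunk converges on, condensed:

	bool		log_sb = false;

	spin_lock(&mp->m_sb_lock);
	/* ...update feature bits in mp->m_sb, setting log_sb = true... */
	spin_unlock(&mp->m_sb_lock);
	if (log_sb)
		xfs_log_sb(tp);		/* logs the whole incore superblock */
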
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 44db6db86402..b9d8a499d2c4 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -28,6 +28,37 @@ struct xfs_trans;
28extern kmem_zone_t *xfs_bmap_free_item_zone; 28extern kmem_zone_t *xfs_bmap_free_item_zone;
29 29
30/* 30/*
31 * Argument structure for xfs_bmap_alloc.
32 */
33struct xfs_bmalloca {
34 xfs_fsblock_t *firstblock; /* i/o first block allocated */
35 struct xfs_bmap_free *flist; /* bmap freelist */
36 struct xfs_trans *tp; /* transaction pointer */
37 struct xfs_inode *ip; /* incore inode pointer */
38 struct xfs_bmbt_irec prev; /* extent before the new one */
39 struct xfs_bmbt_irec got; /* extent after, or delayed */
40
41 xfs_fileoff_t offset; /* offset in file filling in */
42 xfs_extlen_t length; /* i/o length asked/allocated */
43 xfs_fsblock_t blkno; /* starting block of new extent */
44
45 struct xfs_btree_cur *cur; /* btree cursor */
46 xfs_extnum_t idx; /* current extent index */
47 int nallocs;/* number of extents alloc'd */
48 int logflags;/* flags for transaction logging */
49
50 xfs_extlen_t total; /* total blocks needed for xaction */
51 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
52 xfs_extlen_t minleft; /* amount must be left after alloc */
53 bool eof; /* set if allocating past last extent */
54 bool wasdel; /* replacing a delayed allocation */
55 bool userdata;/* set if is user data */
56 bool aeof; /* allocated space at eof */
57 bool conv; /* overwriting unwritten extents */
58 int flags;
59};
60
61/*
31 * List of extents to be free "later". 62 * List of extents to be free "later".
32 * The list is kept sorted on xbf_startblock. 63 * The list is kept sorted on xbf_startblock.
33 */ 64 */
@@ -149,6 +180,8 @@ void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
149void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, 180void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
150 struct xfs_bmap_free *flist, struct xfs_mount *mp); 181 struct xfs_bmap_free *flist, struct xfs_mount *mp);
151void xfs_bmap_cancel(struct xfs_bmap_free *flist); 182void xfs_bmap_cancel(struct xfs_bmap_free *flist);
183int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
184 int *committed);
152void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); 185void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
153int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, 186int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
154 xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); 187 xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index fbd6da263571..8eb718979383 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -151,10 +151,13 @@ typedef struct xfs_sb {
151 __uint32_t sb_features2; /* additional feature bits */ 151 __uint32_t sb_features2; /* additional feature bits */
152 152
153 /* 153 /*
154 * bad features2 field as a result of failing to pad the sb 154 * bad features2 field as a result of failing to pad the sb structure to
155 * structure to 64 bits. Some machines will be using this field 155 * 64 bits. Some machines will be using this field for features2 bits.
156 * for features2 bits. Easiest just to mark it bad and not use 156 * Easiest just to mark it bad and not use it for anything else.
157 * it for anything else. 157 *
158 * This is not kept up to date in memory; it is always overwritten by
159 * the value in sb_features2 when formatting the incore superblock to
160 * the disk buffer.
158 */ 161 */
159 __uint32_t sb_bad_features2; 162 __uint32_t sb_bad_features2;
160 163
@@ -304,8 +307,8 @@ typedef enum {
304#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) 307#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
305#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) 308#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
306#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) 309#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
307#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) 310#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \
308#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) 311 XFS_SB_MVAL(BAD_FEATURES2))
309#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) 312#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
310#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) 313#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
311#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) 314#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
@@ -319,9 +322,9 @@ typedef enum {
319 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ 322 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
320 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ 323 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
321 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ 324 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
322 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ 325 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
323 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ 326 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
324 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) 327 XFS_SB_PQUOTINO)
325 328
326 329
327/* 330/*
@@ -453,13 +456,11 @@ static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
453{ 456{
454 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; 457 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
455 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; 458 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
456 sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
457} 459}
458 460
459static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) 461static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
460{ 462{
461 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; 463 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
462 sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
463 if (!sbp->sb_features2) 464 if (!sbp->sb_features2)
464 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 465 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
465} 466}
@@ -475,7 +476,6 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
475{ 476{
476 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; 477 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
477 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; 478 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
478 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
479} 479}
480 480
481/* 481/*
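
With xfs_sb_to_disk() now forcing sb_bad_features2 to mirror sb_features2 at write-out time (see the xfs_sb.c hunk later in this diff), the per-call-site "|=" updates of sb_bad_features2 become redundant, and folding XFS_SB_MVAL(BAD_FEATURES2) into XFS_SB_FEATURES2 makes the two fields log as a unit. The invariant this establishes for every superblock that reaches disk, written as a hypothetical check (not kernel code):

	/* After xfs_sb_to_disk(): the padded and unpadded copies agree. */
	ASSERT(dsb->sb_features2 == dsb->sb_bad_features2);
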
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 752915fa775a..b0a5fe95a3e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -40,69 +40,6 @@
40 * Physical superblock buffer manipulations. Shared with libxfs in userspace. 40 * Physical superblock buffer manipulations. Shared with libxfs in userspace.
41 */ 41 */
42 42
43static const struct {
44 short offset;
45 short type; /* 0 = integer
46 * 1 = binary / string (no translation)
47 */
48} xfs_sb_info[] = {
49 { offsetof(xfs_sb_t, sb_magicnum), 0 },
50 { offsetof(xfs_sb_t, sb_blocksize), 0 },
51 { offsetof(xfs_sb_t, sb_dblocks), 0 },
52 { offsetof(xfs_sb_t, sb_rblocks), 0 },
53 { offsetof(xfs_sb_t, sb_rextents), 0 },
54 { offsetof(xfs_sb_t, sb_uuid), 1 },
55 { offsetof(xfs_sb_t, sb_logstart), 0 },
56 { offsetof(xfs_sb_t, sb_rootino), 0 },
57 { offsetof(xfs_sb_t, sb_rbmino), 0 },
58 { offsetof(xfs_sb_t, sb_rsumino), 0 },
59 { offsetof(xfs_sb_t, sb_rextsize), 0 },
60 { offsetof(xfs_sb_t, sb_agblocks), 0 },
61 { offsetof(xfs_sb_t, sb_agcount), 0 },
62 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
63 { offsetof(xfs_sb_t, sb_logblocks), 0 },
64 { offsetof(xfs_sb_t, sb_versionnum), 0 },
65 { offsetof(xfs_sb_t, sb_sectsize), 0 },
66 { offsetof(xfs_sb_t, sb_inodesize), 0 },
67 { offsetof(xfs_sb_t, sb_inopblock), 0 },
68 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
69 { offsetof(xfs_sb_t, sb_blocklog), 0 },
70 { offsetof(xfs_sb_t, sb_sectlog), 0 },
71 { offsetof(xfs_sb_t, sb_inodelog), 0 },
72 { offsetof(xfs_sb_t, sb_inopblog), 0 },
73 { offsetof(xfs_sb_t, sb_agblklog), 0 },
74 { offsetof(xfs_sb_t, sb_rextslog), 0 },
75 { offsetof(xfs_sb_t, sb_inprogress), 0 },
76 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
77 { offsetof(xfs_sb_t, sb_icount), 0 },
78 { offsetof(xfs_sb_t, sb_ifree), 0 },
79 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
80 { offsetof(xfs_sb_t, sb_frextents), 0 },
81 { offsetof(xfs_sb_t, sb_uquotino), 0 },
82 { offsetof(xfs_sb_t, sb_gquotino), 0 },
83 { offsetof(xfs_sb_t, sb_qflags), 0 },
84 { offsetof(xfs_sb_t, sb_flags), 0 },
85 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
86 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
87 { offsetof(xfs_sb_t, sb_unit), 0 },
88 { offsetof(xfs_sb_t, sb_width), 0 },
89 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
90 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
91 { offsetof(xfs_sb_t, sb_logsectsize), 0 },
92 { offsetof(xfs_sb_t, sb_logsunit), 0 },
93 { offsetof(xfs_sb_t, sb_features2), 0 },
94 { offsetof(xfs_sb_t, sb_bad_features2), 0 },
95 { offsetof(xfs_sb_t, sb_features_compat), 0 },
96 { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
97 { offsetof(xfs_sb_t, sb_features_incompat), 0 },
98 { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
99 { offsetof(xfs_sb_t, sb_crc), 0 },
100 { offsetof(xfs_sb_t, sb_pad), 0 },
101 { offsetof(xfs_sb_t, sb_pquotino), 0 },
102 { offsetof(xfs_sb_t, sb_lsn), 0 },
103 { sizeof(xfs_sb_t), 0 }
104};
105
106/* 43/*
107 * Reference counting access wrappers to the perag structures. 44 * Reference counting access wrappers to the perag structures.
108 * Because we never free per-ag structures, the only thing we 45 * Because we never free per-ag structures, the only thing we
@@ -461,58 +398,49 @@ xfs_sb_from_disk(
461 __xfs_sb_from_disk(to, from, true); 398 __xfs_sb_from_disk(to, from, true);
462} 399}
463 400
464static inline void 401static void
465xfs_sb_quota_to_disk( 402xfs_sb_quota_to_disk(
466 xfs_dsb_t *to, 403 struct xfs_dsb *to,
467 xfs_sb_t *from, 404 struct xfs_sb *from)
468 __int64_t *fields)
469{ 405{
470 __uint16_t qflags = from->sb_qflags; 406 __uint16_t qflags = from->sb_qflags;
471 407
408 to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
409 if (xfs_sb_version_has_pquotino(from)) {
410 to->sb_qflags = cpu_to_be16(from->sb_qflags);
411 to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
412 to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
413 return;
414 }
415
472 /* 416 /*
473 * We need to do these manipulations only if we are working 417 * The in-core version of sb_qflags does not have XFS_OQUOTA_*
474 * with an older version of on-disk superblock. 418 * flags, whereas the on-disk version does. So, convert incore
419 * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
475 */ 420 */
476 if (xfs_sb_version_has_pquotino(from)) 421 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
477 return; 422 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
478 423
479 if (*fields & XFS_SB_QFLAGS) { 424 if (from->sb_qflags &
480 /* 425 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
481 * The in-core version of sb_qflags do not have 426 qflags |= XFS_OQUOTA_ENFD;
482 * XFS_OQUOTA_* flags, whereas the on-disk version 427 if (from->sb_qflags &
483 * does. So, convert incore XFS_{PG}QUOTA_* flags 428 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
484 * to on-disk XFS_OQUOTA_* flags. 429 qflags |= XFS_OQUOTA_CHKD;
485 */ 430 to->sb_qflags = cpu_to_be16(qflags);
486 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
487 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
488
489 if (from->sb_qflags &
490 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
491 qflags |= XFS_OQUOTA_ENFD;
492 if (from->sb_qflags &
493 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
494 qflags |= XFS_OQUOTA_CHKD;
495 to->sb_qflags = cpu_to_be16(qflags);
496 *fields &= ~XFS_SB_QFLAGS;
497 }
498 431
499 /* 432 /*
500 * GQUOTINO and PQUOTINO cannot be used together in versions of 433 * GQUOTINO and PQUOTINO cannot be used together in versions
501 * superblock that do not have pquotino. from->sb_flags tells us which 434 * of superblock that do not have pquotino. from->sb_flags
502 * quota is active and should be copied to disk. If neither are active, 435 * tells us which quota is active and should be copied to
503 * make sure we write NULLFSINO to the sb_gquotino field as a quota 436 * disk. If neither are active, we should NULL the inode.
504 * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
505 * bit is set.
506 * 437 *
507 * Note that we don't need to handle the sb_uquotino or sb_pquotino here 438 * In all cases, the separate pquotino must remain 0 because it
508 * as they do not require any translation. Hence the main sb field loop 439 * is beyond the "end" of the valid non-pquotino superblock.
509 * will write them appropriately from the in-core superblock.
510 */ 440 */
511 if ((*fields & XFS_SB_GQUOTINO) && 441 if (from->sb_qflags & XFS_GQUOTA_ACCT)
512 (from->sb_qflags & XFS_GQUOTA_ACCT))
513 to->sb_gquotino = cpu_to_be64(from->sb_gquotino); 442 to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
514 else if ((*fields & XFS_SB_PQUOTINO) && 443 else if (from->sb_qflags & XFS_PQUOTA_ACCT)
515 (from->sb_qflags & XFS_PQUOTA_ACCT))
516 to->sb_gquotino = cpu_to_be64(from->sb_pquotino); 444 to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
517 else { 445 else {
518 /* 446 /*
@@ -526,63 +454,78 @@ xfs_sb_quota_to_disk(
526 to->sb_gquotino = cpu_to_be64(NULLFSINO); 454 to->sb_gquotino = cpu_to_be64(NULLFSINO);
527 } 455 }
528 456
529 *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); 457 to->sb_pquotino = 0;
530} 458}
531 459
532/*
533 * Copy in core superblock to ondisk one.
534 *
535 * The fields argument is mask of superblock fields to copy.
536 */
537void 460void
538xfs_sb_to_disk( 461xfs_sb_to_disk(
539 xfs_dsb_t *to, 462 struct xfs_dsb *to,
540 xfs_sb_t *from, 463 struct xfs_sb *from)
541 __int64_t fields)
542{ 464{
543 xfs_caddr_t to_ptr = (xfs_caddr_t)to; 465 xfs_sb_quota_to_disk(to, from);
544 xfs_caddr_t from_ptr = (xfs_caddr_t)from;
545 xfs_sb_field_t f;
546 int first;
547 int size;
548
549 ASSERT(fields);
550 if (!fields)
551 return;
552 466
553 /* We should never write the crc here, it's updated in the IO path */ 467 to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
554 fields &= ~XFS_SB_CRC; 468 to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
555 469 to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
556 xfs_sb_quota_to_disk(to, from, &fields); 470 to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
557 while (fields) { 471 to->sb_rextents = cpu_to_be64(from->sb_rextents);
558 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 472 memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
559 first = xfs_sb_info[f].offset; 473 to->sb_logstart = cpu_to_be64(from->sb_logstart);
560 size = xfs_sb_info[f + 1].offset - first; 474 to->sb_rootino = cpu_to_be64(from->sb_rootino);
561 475 to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
562 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); 476 to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
563 477 to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
564 if (size == 1 || xfs_sb_info[f].type == 1) { 478 to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
565 memcpy(to_ptr + first, from_ptr + first, size); 479 to->sb_agcount = cpu_to_be32(from->sb_agcount);
566 } else { 480 to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
567 switch (size) { 481 to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
568 case 2: 482 to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
569 *(__be16 *)(to_ptr + first) = 483 to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
570 cpu_to_be16(*(__u16 *)(from_ptr + first)); 484 to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
571 break; 485 to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
572 case 4: 486 memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
573 *(__be32 *)(to_ptr + first) = 487 to->sb_blocklog = from->sb_blocklog;
574 cpu_to_be32(*(__u32 *)(from_ptr + first)); 488 to->sb_sectlog = from->sb_sectlog;
575 break; 489 to->sb_inodelog = from->sb_inodelog;
576 case 8: 490 to->sb_inopblog = from->sb_inopblog;
577 *(__be64 *)(to_ptr + first) = 491 to->sb_agblklog = from->sb_agblklog;
578 cpu_to_be64(*(__u64 *)(from_ptr + first)); 492 to->sb_rextslog = from->sb_rextslog;
579 break; 493 to->sb_inprogress = from->sb_inprogress;
580 default: 494 to->sb_imax_pct = from->sb_imax_pct;
581 ASSERT(0); 495 to->sb_icount = cpu_to_be64(from->sb_icount);
582 } 496 to->sb_ifree = cpu_to_be64(from->sb_ifree);
583 } 497 to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
498 to->sb_frextents = cpu_to_be64(from->sb_frextents);
584 499
585 fields &= ~(1LL << f); 500 to->sb_flags = from->sb_flags;
501 to->sb_shared_vn = from->sb_shared_vn;
502 to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
503 to->sb_unit = cpu_to_be32(from->sb_unit);
504 to->sb_width = cpu_to_be32(from->sb_width);
505 to->sb_dirblklog = from->sb_dirblklog;
506 to->sb_logsectlog = from->sb_logsectlog;
507 to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
508 to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
509
510 /*
511 * We need to ensure that bad_features2 always matches features2.
512 * Hence we enforce that here rather than having to remember to do it
513 * everywhere else that updates features2.
514 */
515 from->sb_bad_features2 = from->sb_features2;
516 to->sb_features2 = cpu_to_be32(from->sb_features2);
517 to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
518
519 if (xfs_sb_version_hascrc(from)) {
520 to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
521 to->sb_features_ro_compat =
522 cpu_to_be32(from->sb_features_ro_compat);
523 to->sb_features_incompat =
524 cpu_to_be32(from->sb_features_incompat);
525 to->sb_features_log_incompat =
526 cpu_to_be32(from->sb_features_log_incompat);
527 to->sb_pad = 0;
528 to->sb_lsn = cpu_to_be64(from->sb_lsn);
586 } 529 }
587} 530}
588 531
@@ -816,42 +759,51 @@ xfs_initialize_perag_data(
816} 759}
817 760
818/* 761/*
819 * xfs_mod_sb() can be used to copy arbitrary changes to the 762 * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
820 * in-core superblock into the superblock buffer to be logged. 763 * into the superblock buffer to be logged. It does not provide the higher
821 * It does not provide the higher level of locking that is 764 * level of locking that is needed to protect the in-core superblock from
822 * needed to protect the in-core superblock from concurrent 765 * concurrent access.
823 * access.
824 */ 766 */
825void 767void
826xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) 768xfs_log_sb(
769 struct xfs_trans *tp)
827{ 770{
828 xfs_buf_t *bp; 771 struct xfs_mount *mp = tp->t_mountp;
829 int first; 772 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
830 int last;
831 xfs_mount_t *mp;
832 xfs_sb_field_t f;
833
834 ASSERT(fields);
835 if (!fields)
836 return;
837 mp = tp->t_mountp;
838 bp = xfs_trans_getsb(tp, mp, 0);
839 first = sizeof(xfs_sb_t);
840 last = 0;
841
842 /* translate/copy */
843 773
844 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); 774 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
775 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
776 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
777}
845 778
846 /* find modified range */ 779/*
847 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); 780 * xfs_sync_sb
848 ASSERT((1LL << f) & XFS_SB_MOD_BITS); 781 *
849 last = xfs_sb_info[f + 1].offset - 1; 782 * Sync the superblock to disk.
783 *
784 * Note that the caller is responsible for checking the frozen state of the
785 * filesystem. This procedure uses the non-blocking transaction allocator and
786 * thus will allow modifications to a frozen fs. This is required because this
787 * code can be called during the process of freezing where use of the high-level
788 * allocator would deadlock.
789 */
790int
791xfs_sync_sb(
792 struct xfs_mount *mp,
793 bool wait)
794{
795 struct xfs_trans *tp;
796 int error;
850 797
851 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 798 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
852 ASSERT((1LL << f) & XFS_SB_MOD_BITS); 799 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
853 first = xfs_sb_info[f].offset; 800 if (error) {
801 xfs_trans_cancel(tp, 0);
802 return error;
803 }
854 804
855 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); 805 xfs_log_sb(tp);
856 xfs_trans_log_buf(tp, bp, first, last); 806 if (wait)
807 xfs_trans_set_sync(tp);
808 return xfs_trans_commit(tp, 0);
857} 809}
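
The xfs_sb.c rewrite replaces xfs_mod_sb()'s field-mask machinery (and the xfs_sb_info[] offset table it depended on) with two simpler entry points: xfs_log_sb() formats the whole incore superblock into the superblock buffer of an existing transaction and logs it, while xfs_sync_sb() wraps that in its own XFS_TRANS_SB_CHANGE transaction for callers that have none. A hedged usage sketch of the new pair, assuming kernel context:

	/* Inside an existing transaction: log all superblock changes. */
	xfs_log_sb(tp);

	/* Standalone: write the superblock out, optionally synchronously. */
	error = xfs_sync_sb(mp, true);	/* true => xfs_trans_set_sync() */
	if (error)
		return error;
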
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 8eb1c54bafbf..b25bb9a343f3 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,11 +27,12 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
27extern void xfs_perag_put(struct xfs_perag *pag); 27extern void xfs_perag_put(struct xfs_perag *pag);
28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); 28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
29 29
30extern void xfs_sb_calc_crc(struct xfs_buf *); 30extern void xfs_sb_calc_crc(struct xfs_buf *bp);
31extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 31extern void xfs_log_sb(struct xfs_trans *tp);
32extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *); 32extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
33extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); 33extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
34extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 34extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
35extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
35extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); 36extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
36 37
37#endif /* __XFS_SB_H__ */ 38#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..8dda4b321343 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -82,7 +82,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
82#define XFS_TRANS_ATTR_RM 23 82#define XFS_TRANS_ATTR_RM 23
83#define XFS_TRANS_ATTR_FLAG 24 83#define XFS_TRANS_ATTR_FLAG 24
84#define XFS_TRANS_CLEAR_AGI_BUCKET 25 84#define XFS_TRANS_CLEAR_AGI_BUCKET 25
85#define XFS_TRANS_QM_SBCHANGE 26 85#define XFS_TRANS_SB_CHANGE 26
86/* 86/*
87 * Dummy entries since we use the transaction type to index into the 87 * Dummy entries since we use the transaction type to index into the
88 * trans_type[] in xlog_recover_print_trans_head() 88 * trans_type[] in xlog_recover_print_trans_head()
@@ -95,17 +95,15 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
95#define XFS_TRANS_QM_DQCLUSTER 32 95#define XFS_TRANS_QM_DQCLUSTER 32
96#define XFS_TRANS_QM_QINOCREATE 33 96#define XFS_TRANS_QM_QINOCREATE 33
97#define XFS_TRANS_QM_QUOTAOFF_END 34 97#define XFS_TRANS_QM_QUOTAOFF_END 34
98#define XFS_TRANS_SB_UNIT 35 98#define XFS_TRANS_FSYNC_TS 35
99#define XFS_TRANS_FSYNC_TS 36 99#define XFS_TRANS_GROWFSRT_ALLOC 36
100#define XFS_TRANS_GROWFSRT_ALLOC 37 100#define XFS_TRANS_GROWFSRT_ZERO 37
101#define XFS_TRANS_GROWFSRT_ZERO 38 101#define XFS_TRANS_GROWFSRT_FREE 38
102#define XFS_TRANS_GROWFSRT_FREE 39 102#define XFS_TRANS_SWAPEXT 39
103#define XFS_TRANS_SWAPEXT 40 103#define XFS_TRANS_CHECKPOINT 40
104#define XFS_TRANS_SB_COUNT 41 104#define XFS_TRANS_ICREATE 41
105#define XFS_TRANS_CHECKPOINT 42 105#define XFS_TRANS_CREATE_TMPFILE 42
106#define XFS_TRANS_ICREATE 43 106#define XFS_TRANS_TYPE_MAX 43
107#define XFS_TRANS_CREATE_TMPFILE 44
108#define XFS_TRANS_TYPE_MAX 44
109/* new transaction types need to be reflected in xfs_logprint(8) */ 107/* new transaction types need to be reflected in xfs_logprint(8) */
110 108
111#define XFS_TRANS_TYPES \ 109#define XFS_TRANS_TYPES \
@@ -113,7 +111,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
113 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ 111 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
114 { XFS_TRANS_INACTIVE, "INACTIVE" }, \ 112 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
115 { XFS_TRANS_CREATE, "CREATE" }, \ 113 { XFS_TRANS_CREATE, "CREATE" }, \
116 { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
117 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ 114 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
118 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ 115 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
119 { XFS_TRANS_REMOVE, "REMOVE" }, \ 116 { XFS_TRANS_REMOVE, "REMOVE" }, \
@@ -134,23 +131,23 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
134 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ 131 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
135 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ 132 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
136 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ 133 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
137 { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ 134 { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \
135 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
136 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
138 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ 137 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
139 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ 138 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
140 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ 139 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
141 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ 140 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
142 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ 141 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
143 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ 142 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
144 { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
145 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ 143 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
146 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ 144 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
147 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ 145 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
148 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ 146 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
149 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ 147 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
150 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
151 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ 148 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
152 { XFS_TRANS_DUMMY1, "DUMMY1" }, \ 149 { XFS_TRANS_ICREATE, "ICREATE" }, \
153 { XFS_TRANS_DUMMY2, "DUMMY2" }, \ 150 { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
154 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } 151 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
155 152
156/* 153/*
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index c80c5236c3da..e7e26bd6468f 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -178,6 +178,8 @@ xfs_symlink_local_to_remote(
178 struct xfs_mount *mp = ip->i_mount; 178 struct xfs_mount *mp = ip->i_mount;
179 char *buf; 179 char *buf;
180 180
181 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
182
181 if (!xfs_sb_version_hascrc(&mp->m_sb)) { 183 if (!xfs_sb_version_hascrc(&mp->m_sb)) {
182 bp->b_ops = NULL; 184 bp->b_ops = NULL;
183 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); 185 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
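
This symlink hunk is what the earlier xfs_bmap_local_to_extents() note ("init_fn must set the buffer log item type correctly!") refers to: each local-to-remote init callback now stamps the buffer log format type before touching the data, which is what lets the new assert in xfs_buf_item_format() (later in this diff) insist that every logged buffer carries a valid type. A skeleton of such a callback, modeled on the symlink case above:

	static void example_local_to_remote(
		struct xfs_trans	*tp,
		struct xfs_buf		*bp,
		struct xfs_inode	*ip,
		struct xfs_ifork	*ifp)
	{
		/* First: tell the log what kind of buffer this is. */
		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
		/* Then populate it from the inode fork. */
		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
	}
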
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6c1330f29050..68cb1e7bf2bb 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -716,17 +716,6 @@ xfs_calc_clear_agi_bucket_reservation(
716} 716}
717 717
718/* 718/*
719 * Clearing the quotaflags in the superblock.
720 * the super block for changing quota flags: sector size
721 */
722STATIC uint
723xfs_calc_qm_sbchange_reservation(
724 struct xfs_mount *mp)
725{
726 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
727}
728
729/*
730 * Adjusting quota limits. 719 * Adjusting quota limits.
731 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) 720 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
732 */ 721 */
@@ -864,9 +853,6 @@ xfs_trans_resv_calc(
864 * The following transactions are logged in logical format with 853 * The following transactions are logged in logical format with
865 * a default log count. 854 * a default log count.
866 */ 855 */
867 resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
868 resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
869
870 resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); 856 resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
871 resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; 857 resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
872 858
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..2d5bdfce6d8f 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -56,7 +56,6 @@ struct xfs_trans_resv {
56 struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ 56 struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
57 struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */ 57 struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
58 struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ 58 struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
59 struct xfs_trans_res tr_qm_sbchange; /* change quota flags */
60 struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ 59 struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
61 struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ 60 struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
62 struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ 61 struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b79dc66b2ecd..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 18e2f3bbae5e..3a9b7a1b8704 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -135,30 +135,22 @@ xfs_setfilesize_trans_alloc(
135 */ 135 */
136STATIC int 136STATIC int
137xfs_setfilesize( 137xfs_setfilesize(
138 struct xfs_ioend *ioend) 138 struct xfs_inode *ip,
139 struct xfs_trans *tp,
140 xfs_off_t offset,
141 size_t size)
139{ 142{
140 struct xfs_inode *ip = XFS_I(ioend->io_inode);
141 struct xfs_trans *tp = ioend->io_append_trans;
142 xfs_fsize_t isize; 143 xfs_fsize_t isize;
143 144
144 /*
145 * The transaction may have been allocated in the I/O submission thread,
146 * thus we need to mark ourselves as beeing in a transaction manually.
147 * Similarly for freeze protection.
148 */
149 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
150 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
151 0, 1, _THIS_IP_);
152
153 xfs_ilock(ip, XFS_ILOCK_EXCL); 145 xfs_ilock(ip, XFS_ILOCK_EXCL);
154 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); 146 isize = xfs_new_eof(ip, offset + size);
155 if (!isize) { 147 if (!isize) {
156 xfs_iunlock(ip, XFS_ILOCK_EXCL); 148 xfs_iunlock(ip, XFS_ILOCK_EXCL);
157 xfs_trans_cancel(tp, 0); 149 xfs_trans_cancel(tp, 0);
158 return 0; 150 return 0;
159 } 151 }
160 152
161 trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); 153 trace_xfs_setfilesize(ip, offset, size);
162 154
163 ip->i_d.di_size = isize; 155 ip->i_d.di_size = isize;
164 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 156 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -167,6 +159,25 @@ xfs_setfilesize(
167 return xfs_trans_commit(tp, 0); 159 return xfs_trans_commit(tp, 0);
168} 160}
169 161
162STATIC int
163xfs_setfilesize_ioend(
164 struct xfs_ioend *ioend)
165{
166 struct xfs_inode *ip = XFS_I(ioend->io_inode);
167 struct xfs_trans *tp = ioend->io_append_trans;
168
169 /*
170 * The transaction may have been allocated in the I/O submission thread,
171 * thus we need to mark ourselves as being in a transaction manually.
172 * Similarly for freeze protection.
173 */
174 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
175 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
176 0, 1, _THIS_IP_);
177
178 return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
179}
180
170/* 181/*
171 * Schedule IO completion handling on the final put of an ioend. 182 * Schedule IO completion handling on the final put of an ioend.
172 * 183 *
@@ -182,8 +193,7 @@ xfs_finish_ioend(
182 193
183 if (ioend->io_type == XFS_IO_UNWRITTEN) 194 if (ioend->io_type == XFS_IO_UNWRITTEN)
184 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 195 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
185 else if (ioend->io_append_trans || 196 else if (ioend->io_append_trans)
186 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
187 queue_work(mp->m_data_workqueue, &ioend->io_work); 197 queue_work(mp->m_data_workqueue, &ioend->io_work);
188 else 198 else
189 xfs_destroy_ioend(ioend); 199 xfs_destroy_ioend(ioend);
@@ -215,22 +225,8 @@ xfs_end_io(
215 if (ioend->io_type == XFS_IO_UNWRITTEN) { 225 if (ioend->io_type == XFS_IO_UNWRITTEN) {
216 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 226 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
217 ioend->io_size); 227 ioend->io_size);
218 } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
219 /*
220 * For direct I/O we do not know if we need to allocate blocks
221 * or not so we can't preallocate an append transaction as that
222 * results in nested reservations and log space deadlocks. Hence
223 * allocate the transaction here. While this is sub-optimal and
224 * can block IO completion for some time, we're stuck with doing
225 * it this way until we can pass the ioend to the direct IO
226 * allocation callbacks and avoid nesting that way.
227 */
228 error = xfs_setfilesize_trans_alloc(ioend);
229 if (error)
230 goto done;
231 error = xfs_setfilesize(ioend);
232 } else if (ioend->io_append_trans) { 228 } else if (ioend->io_append_trans) {
233 error = xfs_setfilesize(ioend); 229 error = xfs_setfilesize_ioend(ioend);
234 } else { 230 } else {
235 ASSERT(!xfs_ioend_is_append(ioend)); 231 ASSERT(!xfs_ioend_is_append(ioend));
236 } 232 }
@@ -242,17 +238,6 @@ done:
242} 238}
243 239
244/* 240/*
245 * Call IO completion handling in caller context on the final put of an ioend.
246 */
247STATIC void
248xfs_finish_ioend_sync(
249 struct xfs_ioend *ioend)
250{
251 if (atomic_dec_and_test(&ioend->io_remaining))
252 xfs_end_io(&ioend->io_work);
253}
254
255/*
256 * Allocate and initialise an IO completion structure. 241 * Allocate and initialise an IO completion structure.
257 * We need to track unwritten extent write completion here initially. 242 * We need to track unwritten extent write completion here initially.
258 * We'll need to extend this for updating the ondisk inode size later 243 * We'll need to extend this for updating the ondisk inode size later
@@ -273,7 +258,6 @@ xfs_alloc_ioend(
273 * all the I/O from calling the completion routine too early. 258 * all the I/O from calling the completion routine too early.
274 */ 259 */
275 atomic_set(&ioend->io_remaining, 1); 260 atomic_set(&ioend->io_remaining, 1);
276 ioend->io_isdirect = 0;
277 ioend->io_error = 0; 261 ioend->io_error = 0;
278 ioend->io_list = NULL; 262 ioend->io_list = NULL;
279 ioend->io_type = type; 263 ioend->io_type = type;
@@ -1459,11 +1443,7 @@ xfs_get_blocks_direct(
1459 * 1443 *
1460 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1444 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1461 * need to issue a transaction to convert the range from unwritten to written 1445 * need to issue a transaction to convert the range from unwritten to written
1462 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1446 * extents.
1463 * to do this and we are done. But in case this was a successful AIO
1464 * request this handler is called from interrupt context, from which we
1465 * can't start transactions. In that case offload the I/O completion to
1466 * the workqueues we also use for buffered I/O completion.
1467 */ 1447 */
1468STATIC void 1448STATIC void
1469xfs_end_io_direct_write( 1449xfs_end_io_direct_write(
@@ -1472,7 +1452,12 @@ xfs_end_io_direct_write(
1472 ssize_t size, 1452 ssize_t size,
1473 void *private) 1453 void *private)
1474{ 1454{
1475 struct xfs_ioend *ioend = iocb->private; 1455 struct inode *inode = file_inode(iocb->ki_filp);
1456 struct xfs_inode *ip = XFS_I(inode);
1457 struct xfs_mount *mp = ip->i_mount;
1458
1459 if (XFS_FORCED_SHUTDOWN(mp))
1460 return;
1476 1461
1477 /* 1462 /*
1478 * While the generic direct I/O code updates the inode size, it does 1463 * While the generic direct I/O code updates the inode size, it does
@@ -1480,22 +1465,33 @@ xfs_end_io_direct_write(
1480 * end_io handler thinks the on-disk size is outside the in-core 1465 * end_io handler thinks the on-disk size is outside the in-core
1481 * size. To prevent this just update it a little bit earlier here. 1466 * size. To prevent this just update it a little bit earlier here.
1482 */ 1467 */
1483 if (offset + size > i_size_read(ioend->io_inode)) 1468 if (offset + size > i_size_read(inode))
1484 i_size_write(ioend->io_inode, offset + size); 1469 i_size_write(inode, offset + size);
1485 1470
1486 /* 1471 /*
1487 * blockdev_direct_IO can return an error even after the I/O 1472 * For direct I/O we do not know if we need to allocate blocks or not,
1488 * completion handler was called. Thus we need to protect 1473 * so we can't preallocate an append transaction, as that results in
1489 * against double-freeing. 1474 * nested reservations and log space deadlocks. Hence allocate the
1475 * transaction here. While this is sub-optimal and can block IO
1476 * completion for some time, we're stuck with doing it this way until
1477 * we can pass the ioend to the direct IO allocation callbacks and
1478 * avoid nesting that way.
1490 */ 1479 */
1491 iocb->private = NULL; 1480 if (private && size > 0) {
1492 1481 xfs_iomap_write_unwritten(ip, offset, size);
1493 ioend->io_offset = offset; 1482 } else if (offset + size > ip->i_d.di_size) {
1494 ioend->io_size = size; 1483 struct xfs_trans *tp;
1495 if (private && size > 0) 1484 int error;
1496 ioend->io_type = XFS_IO_UNWRITTEN; 1485
1486 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1487 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1488 if (error) {
1489 xfs_trans_cancel(tp, 0);
1490 return;
1491 }
1497 1492
1498 xfs_finish_ioend_sync(ioend); 1493 xfs_setfilesize(ip, tp, offset, size);
1494 }
1499} 1495}
1500 1496
1501STATIC ssize_t 1497STATIC ssize_t
@@ -1507,39 +1503,16 @@ xfs_vm_direct_IO(
1507{ 1503{
1508 struct inode *inode = iocb->ki_filp->f_mapping->host; 1504 struct inode *inode = iocb->ki_filp->f_mapping->host;
1509 struct block_device *bdev = xfs_find_bdev_for_inode(inode); 1505 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1510 struct xfs_ioend *ioend = NULL;
1511 ssize_t ret;
1512 1506
1513 if (rw & WRITE) { 1507 if (rw & WRITE) {
1514 size_t size = iov_iter_count(iter); 1508 return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1515
1516 /*
1517 * We cannot preallocate a size update transaction here as we
1518 * don't know whether allocation is necessary or not. Hence we
1519 * can only tell IO completion that one is necessary if we are
1520 * not doing unwritten extent conversion.
1521 */
1522 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1523 if (offset + size > XFS_I(inode)->i_d.di_size)
1524 ioend->io_isdirect = 1;
1525
1526 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1527 offset, xfs_get_blocks_direct, 1509 offset, xfs_get_blocks_direct,
1528 xfs_end_io_direct_write, NULL, 1510 xfs_end_io_direct_write, NULL,
1529 DIO_ASYNC_EXTEND); 1511 DIO_ASYNC_EXTEND);
1530 if (ret != -EIOCBQUEUED && iocb->private)
1531 goto out_destroy_ioend;
1532 } else {
1533 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1534 offset, xfs_get_blocks_direct,
1535 NULL, NULL, 0);
1536 } 1512 }
1537 1513 return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1538 return ret; 1514 offset, xfs_get_blocks_direct,
1539 1515 NULL, NULL, 0);
1540out_destroy_ioend:
1541 xfs_destroy_ioend(ioend);
1542 return ret;
1543} 1516}
1544 1517
1545/* 1518/*
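
Net effect of the xfs_aops.c changes: direct I/O no longer allocates an ioend at submission time. xfs_setfilesize() becomes a generic (ip, tp, offset, size) helper, the ioend-based entry survives as xfs_setfilesize_ioend() for buffered writeback completion, and direct I/O completion now runs inline from xfs_end_io_direct_write(). Its decision logic, condensed into a sketch:

	if (XFS_FORCED_SHUTDOWN(mp))
		return;
	if (private && size > 0) {
		/* Unwritten extents were mapped: convert them in place. */
		xfs_iomap_write_unwritten(ip, offset, size);
	} else if (offset + size > ip->i_d.di_size) {
		/* Size extension: allocate an FSYNC_TS transaction, */
		/* then update the on-disk inode size. */
		xfs_setfilesize(ip, tp, offset, size);
	}
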
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f94dd459dff9..ac644e0137a4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool;
24 * Types of I/O for bmap clustering and I/O completion tracking. 24 * Types of I/O for bmap clustering and I/O completion tracking.
25 */ 25 */
26enum { 26enum {
27 XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */
28 XFS_IO_DELALLOC, /* covers delalloc region */ 27 XFS_IO_DELALLOC, /* covers delalloc region */
29 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ 28 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
30 XFS_IO_OVERWRITE, /* covers already allocated extent */ 29 XFS_IO_OVERWRITE, /* covers already allocated extent */
31}; 30};
32 31
33#define XFS_IO_TYPES \ 32#define XFS_IO_TYPES \
34 { 0, "" }, \
35 { XFS_IO_DELALLOC, "delalloc" }, \ 33 { XFS_IO_DELALLOC, "delalloc" }, \
36 { XFS_IO_UNWRITTEN, "unwritten" }, \ 34 { XFS_IO_UNWRITTEN, "unwritten" }, \
37 { XFS_IO_OVERWRITE, "overwrite" } 35 { XFS_IO_OVERWRITE, "overwrite" }
@@ -45,7 +43,6 @@ typedef struct xfs_ioend {
45 unsigned int io_type; /* delalloc / unwritten */ 43 unsigned int io_type; /* delalloc / unwritten */
46 int io_error; /* I/O error code */ 44 int io_error; /* I/O error code */
47 atomic_t io_remaining; /* hold count */ 45 atomic_t io_remaining; /* hold count */
48 unsigned int io_isdirect : 1;/* direct I/O */
49 struct inode *io_inode; /* file being written to */ 46 struct inode *io_inode; /* file being written to */
50 struct buffer_head *io_buffer_head;/* buffer linked list head */ 47 struct buffer_head *io_buffer_head;/* buffer linked list head */
51 struct buffer_head *io_buffer_tail;/* buffer linked list tail */ 48 struct buffer_head *io_buffer_tail;/* buffer linked list tail */
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 2fdb72d2c908..736429a72a12 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -26,43 +26,8 @@ struct xfs_ifork;
26struct xfs_inode; 26struct xfs_inode;
27struct xfs_mount; 27struct xfs_mount;
28struct xfs_trans; 28struct xfs_trans;
29struct xfs_bmalloca;
29 30
30/*
31 * Argument structure for xfs_bmap_alloc.
32 */
33struct xfs_bmalloca {
34 xfs_fsblock_t *firstblock; /* i/o first block allocated */
35 struct xfs_bmap_free *flist; /* bmap freelist */
36 struct xfs_trans *tp; /* transaction pointer */
37 struct xfs_inode *ip; /* incore inode pointer */
38 struct xfs_bmbt_irec prev; /* extent before the new one */
39 struct xfs_bmbt_irec got; /* extent after, or delayed */
40
41 xfs_fileoff_t offset; /* offset in file filling in */
42 xfs_extlen_t length; /* i/o length asked/allocated */
43 xfs_fsblock_t blkno; /* starting block of new extent */
44
45 struct xfs_btree_cur *cur; /* btree cursor */
46 xfs_extnum_t idx; /* current extent index */
47 int nallocs;/* number of extents alloc'd */
48 int logflags;/* flags for transaction logging */
49
50 xfs_extlen_t total; /* total blocks needed for xaction */
51 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
52 xfs_extlen_t minleft; /* amount must be left after alloc */
53 bool eof; /* set if allocating past last extent */
54 bool wasdel; /* replacing a delayed allocation */
55 bool userdata;/* set if is user data */
56 bool aeof; /* allocated space at eof */
57 bool conv; /* overwriting unwritten extents */
58 int flags;
59 struct completion *done;
60 struct work_struct work;
61 int result;
62};
63
64int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
65 int *committed);
66int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); 31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
67int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, 32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
68 int whichfork, int *eof); 33 int whichfork, int *eof);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..1790b00bea7a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1488,6 +1488,7 @@ xfs_buf_iomove(
1488static enum lru_status 1488static enum lru_status
1489xfs_buftarg_wait_rele( 1489xfs_buftarg_wait_rele(
1490 struct list_head *item, 1490 struct list_head *item,
1491 struct list_lru_one *lru,
1491 spinlock_t *lru_lock, 1492 spinlock_t *lru_lock,
1492 void *arg) 1493 void *arg)
1493 1494
@@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele(
1509 */ 1510 */
1510 atomic_set(&bp->b_lru_ref, 0); 1511 atomic_set(&bp->b_lru_ref, 0);
1511 bp->b_state |= XFS_BSTATE_DISPOSE; 1512 bp->b_state |= XFS_BSTATE_DISPOSE;
1512 list_move(item, dispose); 1513 list_lru_isolate_move(lru, item, dispose);
1513 spin_unlock(&bp->b_lock); 1514 spin_unlock(&bp->b_lock);
1514 return LRU_REMOVED; 1515 return LRU_REMOVED;
1515} 1516}
@@ -1546,6 +1547,7 @@ xfs_wait_buftarg(
1546static enum lru_status 1547static enum lru_status
1547xfs_buftarg_isolate( 1548xfs_buftarg_isolate(
1548 struct list_head *item, 1549 struct list_head *item,
1550 struct list_lru_one *lru,
1549 spinlock_t *lru_lock, 1551 spinlock_t *lru_lock,
1550 void *arg) 1552 void *arg)
1551{ 1553{
@@ -1569,7 +1571,7 @@ xfs_buftarg_isolate(
1569 } 1571 }
1570 1572
1571 bp->b_state |= XFS_BSTATE_DISPOSE; 1573 bp->b_state |= XFS_BSTATE_DISPOSE;
1572 list_move(item, dispose); 1574 list_lru_isolate_move(lru, item, dispose);
1573 spin_unlock(&bp->b_lock); 1575 spin_unlock(&bp->b_lock);
1574 return LRU_REMOVED; 1576 return LRU_REMOVED;
1575} 1577}
@@ -1583,10 +1585,9 @@ xfs_buftarg_shrink_scan(
1583 struct xfs_buftarg, bt_shrinker); 1585 struct xfs_buftarg, bt_shrinker);
1584 LIST_HEAD(dispose); 1586 LIST_HEAD(dispose);
1585 unsigned long freed; 1587 unsigned long freed;
1586 unsigned long nr_to_scan = sc->nr_to_scan;
1587 1588
1588 freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, 1589 freed = list_lru_shrink_walk(&btp->bt_lru, sc,
1589 &dispose, &nr_to_scan); 1590 xfs_buftarg_isolate, &dispose);
1590 1591
1591 while (!list_empty(&dispose)) { 1592 while (!list_empty(&dispose)) {
1592 struct xfs_buf *bp; 1593 struct xfs_buf *bp;
@@ -1605,7 +1606,7 @@ xfs_buftarg_shrink_count(
1605{ 1606{
1606 struct xfs_buftarg *btp = container_of(shrink, 1607 struct xfs_buftarg *btp = container_of(shrink,
1607 struct xfs_buftarg, bt_shrinker); 1608 struct xfs_buftarg, bt_shrinker);
1608 return list_lru_count_node(&btp->bt_lru, sc->nid); 1609 return list_lru_shrink_count(&btp->bt_lru, sc);
1609} 1610}
1610 1611
1611void 1612void
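
The buffer-cache hunks track a VFS-side list_lru interface change: walk callbacks now receive the struct list_lru_one being iterated and must use list_lru_isolate_move() rather than a bare list_move() so the list's internal item accounting stays correct, and shrinkers switch to the list_lru_shrink_count()/list_lru_shrink_walk() wrappers that take the shrink_control directly. A skeleton of an isolate callback in the new shape (a sketch, not a complete shrinker):

	static enum lru_status
	example_isolate(
		struct list_head	*item,
		struct list_lru_one	*lru,
		spinlock_t		*lru_lock,
		void			*arg)
	{
		struct list_head	*dispose = arg;

		/* Moves the item and keeps the lru's item count in sync. */
		list_lru_isolate_move(lru, item, dispose);
		return LRU_REMOVED;
	}
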
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3f9bd58edec7..507d96a57ac7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -319,6 +319,10 @@ xfs_buf_item_format(
319 ASSERT(atomic_read(&bip->bli_refcount) > 0); 319 ASSERT(atomic_read(&bip->bli_refcount) > 0);
320 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 320 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
321 (bip->bli_flags & XFS_BLI_STALE)); 321 (bip->bli_flags & XFS_BLI_STALE));
322 ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
323 (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
324 && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
325
322 326
323 /* 327 /*
324 * If it is an inode buffer, transfer the in-memory state to the 328 * If it is an inode buffer, transfer the in-memory state to the
@@ -535,7 +539,7 @@ xfs_buf_item_push(
535 if ((bp->b_flags & XBF_WRITE_FAIL) && 539 if ((bp->b_flags & XBF_WRITE_FAIL) &&
536 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { 540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
537 xfs_warn(bp->b_target->bt_mount, 541 xfs_warn(bp->b_target->bt_mount,
538"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", 542"Detected failing async write on buffer block 0x%llx. Retrying async write.",
539 (long long)bp->b_bn); 543 (long long)bp->b_bn);
540 } 544 }
541 545
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c24c67e22a2a..2f536f33cd26 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -86,7 +86,7 @@ static inline void xfs_dqflock(xfs_dquot_t *dqp)
86 wait_for_completion(&dqp->q_flush); 86 wait_for_completion(&dqp->q_flush);
87} 87}
88 88
89static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) 89static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
90{ 90{
91 return try_wait_for_completion(&dqp->q_flush); 91 return try_wait_for_completion(&dqp->q_flush);
92} 92}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 13e974e6a889..1cdba95c78cb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -127,6 +127,42 @@ xfs_iozero(
127 return (-status); 127 return (-status);
128} 128}
129 129
130int
131xfs_update_prealloc_flags(
132 struct xfs_inode *ip,
133 enum xfs_prealloc_flags flags)
134{
135 struct xfs_trans *tp;
136 int error;
137
138 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
139 error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
140 if (error) {
141 xfs_trans_cancel(tp, 0);
142 return error;
143 }
144
145 xfs_ilock(ip, XFS_ILOCK_EXCL);
146 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
147
148 if (!(flags & XFS_PREALLOC_INVISIBLE)) {
149 ip->i_d.di_mode &= ~S_ISUID;
150 if (ip->i_d.di_mode & S_IXGRP)
151 ip->i_d.di_mode &= ~S_ISGID;
152 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
153 }
154
155 if (flags & XFS_PREALLOC_SET)
156 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
157 if (flags & XFS_PREALLOC_CLEAR)
158 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
159
160 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
161 if (flags & XFS_PREALLOC_SYNC)
162 xfs_trans_set_sync(tp);
163 return xfs_trans_commit(tp, 0);
164}
165
130/* 166/*
131 * Fsync operations on directories are much simpler than on regular files, 167 * Fsync operations on directories are much simpler than on regular files,
132 * as there is no file data to flush, and thus also no need for explicit 168 * as there is no file data to flush, and thus also no need for explicit
@@ -699,7 +735,7 @@ xfs_file_buffered_aio_write(
699 735
700 iov_iter_truncate(from, count); 736 iov_iter_truncate(from, count);
701 /* We can write back this queue in page reclaim */ 737 /* We can write back this queue in page reclaim */
702 current->backing_dev_info = mapping->backing_dev_info; 738 current->backing_dev_info = inode_to_bdi(inode);
703 739
704write_retry: 740write_retry:
705 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); 741 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
@@ -784,8 +820,8 @@ xfs_file_fallocate(
784{ 820{
785 struct inode *inode = file_inode(file); 821 struct inode *inode = file_inode(file);
786 struct xfs_inode *ip = XFS_I(inode); 822 struct xfs_inode *ip = XFS_I(inode);
787 struct xfs_trans *tp;
788 long error; 823 long error;
824 enum xfs_prealloc_flags flags = 0;
789 loff_t new_size = 0; 825 loff_t new_size = 0;
790 826
791 if (!S_ISREG(inode->i_mode)) 827 if (!S_ISREG(inode->i_mode))
@@ -822,6 +858,8 @@ xfs_file_fallocate(
822 if (error) 858 if (error)
823 goto out_unlock; 859 goto out_unlock;
824 } else { 860 } else {
861 flags |= XFS_PREALLOC_SET;
862
825 if (!(mode & FALLOC_FL_KEEP_SIZE) && 863 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
826 offset + len > i_size_read(inode)) { 864 offset + len > i_size_read(inode)) {
827 new_size = offset + len; 865 new_size = offset + len;
@@ -839,28 +877,10 @@ xfs_file_fallocate(
839 goto out_unlock; 877 goto out_unlock;
840 } 878 }
841 879
842 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
843 error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
844 if (error) {
845 xfs_trans_cancel(tp, 0);
846 goto out_unlock;
847 }
848
849 xfs_ilock(ip, XFS_ILOCK_EXCL);
850 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
851 ip->i_d.di_mode &= ~S_ISUID;
852 if (ip->i_d.di_mode & S_IXGRP)
853 ip->i_d.di_mode &= ~S_ISGID;
854
855 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
856 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
857
858 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
859 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
860
861 if (file->f_flags & O_DSYNC) 880 if (file->f_flags & O_DSYNC)
862 xfs_trans_set_sync(tp); 881 flags |= XFS_PREALLOC_SYNC;
863 error = xfs_trans_commit(tp, 0); 882
883 error = xfs_update_prealloc_flags(ip, flags);
864 if (error) 884 if (error)
865 goto out_unlock; 885 goto out_unlock;
866 886
@@ -1384,5 +1404,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
1384 .fault = filemap_fault, 1404 .fault = filemap_fault,
1385 .map_pages = filemap_map_pages, 1405 .map_pages = filemap_map_pages,
1386 .page_mkwrite = xfs_vm_page_mkwrite, 1406 .page_mkwrite = xfs_vm_page_mkwrite,
1387 .remap_pages = generic_file_remap_pages,
1388}; 1407};
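
The hunks above move XFS's open-coded WRITEID transaction (clear setuid/setgid, maintain XFS_DIFLAG_PREALLOC, optionally commit synchronously) into the new xfs_update_prealloc_flags() helper, which the fallocate path now calls with an accumulated flag mask. A minimal userspace sketch of the path this serves, assuming Linux fallocate(2) on an XFS mount; the file path is hypothetical:

/* Sketch: exercise the XFS preallocation path from userspace.
 * Assumes Linux with fallocate(2); the path below is hypothetical. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/xfs/prealloc-demo", O_CREAT | O_RDWR, 0644);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/* Reserve 16 MiB without changing i_size; on XFS this lands in
	 * xfs_file_fallocate() -> xfs_update_prealloc_flags() with
	 * XFS_PREALLOC_SET (plus XFS_PREALLOC_SYNC if opened O_DSYNC). */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0)
		perror("fallocate");

	close(fd);
	return 0;
}
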
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fdc64220fcb0..fba6532efba4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -488,6 +488,7 @@ xfs_growfs_data_private(
488 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); 488 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
489 if (dpct) 489 if (dpct)
490 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 490 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
491 xfs_trans_set_sync(tp);
491 error = xfs_trans_commit(tp, 0); 492 error = xfs_trans_commit(tp, 0);
492 if (error) 493 if (error)
493 return error; 494 return error;
@@ -541,7 +542,7 @@ xfs_growfs_data_private(
541 saved_error = error; 542 saved_error = error;
542 continue; 543 continue;
543 } 544 }
544 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); 545 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
545 546
546 error = xfs_bwrite(bp); 547 error = xfs_bwrite(bp);
547 xfs_buf_relse(bp); 548 xfs_buf_relse(bp);
@@ -756,37 +757,6 @@ out:
756 return 0; 757 return 0;
757} 758}
758 759
759/*
760 * Dump a transaction into the log that contains no real change. This is needed
761 * to be able to make the log dirty or stamp the current tail LSN into the log
762 * during the covering operation.
763 *
764 * We cannot use an inode here for this - that will push dirty state back up
765 * into the VFS and then periodic inode flushing will prevent log covering from
766 * making progress. Hence we log a field in the superblock instead and use a
767 * synchronous transaction to ensure the superblock is immediately unpinned
768 * and can be written back.
769 */
770int
771xfs_fs_log_dummy(
772 xfs_mount_t *mp)
773{
774 xfs_trans_t *tp;
775 int error;
776
777 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
778 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
779 if (error) {
780 xfs_trans_cancel(tp, 0);
781 return error;
782 }
783
784 /* log the UUID because it is an unchanging field */
785 xfs_mod_sb(tp, XFS_SB_UUID);
786 xfs_trans_set_sync(tp);
787 return xfs_trans_commit(tp, 0);
788}
789
790int 760int
791xfs_fs_goingdown( 761xfs_fs_goingdown(
792 xfs_mount_t *mp, 762 xfs_mount_t *mp,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 41f804e740d7..daafa1f6d260 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1995,6 +1995,7 @@ xfs_iunlink(
1995 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); 1995 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1996 offset = offsetof(xfs_agi_t, agi_unlinked) + 1996 offset = offsetof(xfs_agi_t, agi_unlinked) +
1997 (sizeof(xfs_agino_t) * bucket_index); 1997 (sizeof(xfs_agino_t) * bucket_index);
1998 xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
1998 xfs_trans_log_buf(tp, agibp, offset, 1999 xfs_trans_log_buf(tp, agibp, offset,
1999 (offset + sizeof(xfs_agino_t) - 1)); 2000 (offset + sizeof(xfs_agino_t) - 1));
2000 return 0; 2001 return 0;
@@ -2086,6 +2087,7 @@ xfs_iunlink_remove(
2086 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); 2087 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2087 offset = offsetof(xfs_agi_t, agi_unlinked) + 2088 offset = offsetof(xfs_agi_t, agi_unlinked) +
2088 (sizeof(xfs_agino_t) * bucket_index); 2089 (sizeof(xfs_agino_t) * bucket_index);
2090 xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
2089 xfs_trans_log_buf(tp, agibp, offset, 2091 xfs_trans_log_buf(tp, agibp, offset,
2090 (offset + sizeof(xfs_agino_t) - 1)); 2092 (offset + sizeof(xfs_agino_t) - 1));
2091 } else { 2093 } else {
@@ -2656,6 +2658,124 @@ xfs_sort_for_rename(
2656} 2658}
2657 2659
2658/* 2660/*
2661 * xfs_cross_rename()
2662 *
 2663 * Responsible for handling the RENAME_EXCHANGE flag in the renameat2() syscall
2664 */
2665STATIC int
2666xfs_cross_rename(
2667 struct xfs_trans *tp,
2668 struct xfs_inode *dp1,
2669 struct xfs_name *name1,
2670 struct xfs_inode *ip1,
2671 struct xfs_inode *dp2,
2672 struct xfs_name *name2,
2673 struct xfs_inode *ip2,
2674 struct xfs_bmap_free *free_list,
2675 xfs_fsblock_t *first_block,
2676 int spaceres)
2677{
2678 int error = 0;
2679 int ip1_flags = 0;
2680 int ip2_flags = 0;
2681 int dp2_flags = 0;
2682
2683 /* Swap inode number for dirent in first parent */
2684 error = xfs_dir_replace(tp, dp1, name1,
2685 ip2->i_ino,
2686 first_block, free_list, spaceres);
2687 if (error)
2688 goto out;
2689
2690 /* Swap inode number for dirent in second parent */
2691 error = xfs_dir_replace(tp, dp2, name2,
2692 ip1->i_ino,
2693 first_block, free_list, spaceres);
2694 if (error)
2695 goto out;
2696
2697 /*
2698 * If we're renaming one or more directories across different parents,
2699 * update the respective ".." entries (and link counts) to match the new
2700 * parents.
2701 */
2702 if (dp1 != dp2) {
2703 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2704
2705 if (S_ISDIR(ip2->i_d.di_mode)) {
2706 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2707 dp1->i_ino, first_block,
2708 free_list, spaceres);
2709 if (error)
2710 goto out;
2711
2712 /* transfer ip2 ".." reference to dp1 */
2713 if (!S_ISDIR(ip1->i_d.di_mode)) {
2714 error = xfs_droplink(tp, dp2);
2715 if (error)
2716 goto out;
2717 error = xfs_bumplink(tp, dp1);
2718 if (error)
2719 goto out;
2720 }
2721
2722 /*
 2723 * Although ip1 isn't changed here, userspace needs
 2724 * to be notified of the change, so that applications
 2725 * relying on it (such as backup tools) can properly
 2726 * react to the change
2727 */
2728 ip1_flags |= XFS_ICHGTIME_CHG;
2729 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2730 }
2731
2732 if (S_ISDIR(ip1->i_d.di_mode)) {
2733 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2734 dp2->i_ino, first_block,
2735 free_list, spaceres);
2736 if (error)
2737 goto out;
2738
2739 /* transfer ip1 ".." reference to dp2 */
2740 if (!S_ISDIR(ip2->i_d.di_mode)) {
2741 error = xfs_droplink(tp, dp1);
2742 if (error)
2743 goto out;
2744 error = xfs_bumplink(tp, dp2);
2745 if (error)
2746 goto out;
2747 }
2748
2749 /*
 2750 * Although ip2 isn't changed here, userspace needs
 2751 * to be notified of the change, so that applications
 2752 * relying on it (such as backup tools) can properly
 2753 * react to the change
2754 */
2755 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2756 ip2_flags |= XFS_ICHGTIME_CHG;
2757 }
2758 }
2759
2760 if (ip1_flags) {
2761 xfs_trans_ichgtime(tp, ip1, ip1_flags);
2762 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2763 }
2764 if (ip2_flags) {
2765 xfs_trans_ichgtime(tp, ip2, ip2_flags);
2766 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2767 }
2768 if (dp2_flags) {
2769 xfs_trans_ichgtime(tp, dp2, dp2_flags);
2770 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2771 }
2772 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2773 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2774out:
2775 return error;
2776}
2777
2778/*
2659 * xfs_rename 2779 * xfs_rename
2660 */ 2780 */
2661int 2781int
@@ -2665,7 +2785,8 @@ xfs_rename(
2665 xfs_inode_t *src_ip, 2785 xfs_inode_t *src_ip,
2666 xfs_inode_t *target_dp, 2786 xfs_inode_t *target_dp,
2667 struct xfs_name *target_name, 2787 struct xfs_name *target_name,
2668 xfs_inode_t *target_ip) 2788 xfs_inode_t *target_ip,
2789 unsigned int flags)
2669{ 2790{
2670 xfs_trans_t *tp = NULL; 2791 xfs_trans_t *tp = NULL;
2671 xfs_mount_t *mp = src_dp->i_mount; 2792 xfs_mount_t *mp = src_dp->i_mount;
@@ -2743,6 +2864,18 @@ xfs_rename(
2743 } 2864 }
2744 2865
2745 /* 2866 /*
 2867 * Handle the RENAME_EXCHANGE flag
2868 */
2869 if (flags & RENAME_EXCHANGE) {
2870 error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
2871 target_dp, target_name, target_ip,
2872 &free_list, &first_block, spaceres);
2873 if (error)
2874 goto abort_return;
2875 goto finish_rename;
2876 }
2877
2878 /*
2746 * Set up the target. 2879 * Set up the target.
2747 */ 2880 */
2748 if (target_ip == NULL) { 2881 if (target_ip == NULL) {
@@ -2881,6 +3014,7 @@ xfs_rename(
2881 if (new_parent) 3014 if (new_parent)
2882 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3015 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
2883 3016
3017finish_rename:
2884 /* 3018 /*
2885 * If this is a synchronous mount, make sure that the 3019 * If this is a synchronous mount, make sure that the
2886 * rename transaction goes to disk before returning to 3020 * rename transaction goes to disk before returning to
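
xfs_cross_rename() above implements the kernel side of RENAME_EXCHANGE: both dirents are replaced in one transaction, and ".." entries plus link counts are fixed up when directories change parents. A hedged userspace sketch of triggering it, calling the real renameat2(2) syscall directly since older glibc has no wrapper; the two paths are hypothetical:

/* Sketch: atomically swap two paths with RENAME_EXCHANGE, the
 * operation xfs_cross_rename() serves. Requires Linux 3.15+ and a
 * filesystem implementing .rename2 (XFS gains it in this series). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)
#endif

int main(void)
{
	/* glibc gained a renameat2() wrapper only in 2.28, so invoke
	 * the syscall directly for portability. */
	long ret = syscall(SYS_renameat2, AT_FDCWD, "a.conf",
			   AT_FDCWD, "b.conf", RENAME_EXCHANGE);
	if (ret < 0)
		perror("renameat2");
	else
		printf("a.conf and b.conf exchanged atomically\n");
	return ret < 0;
}
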
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4ed2ba9342dc..86cd6b39bed7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -338,7 +338,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
338int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 338int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
339 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 339 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
340 struct xfs_name *target_name, 340 struct xfs_name *target_name,
341 struct xfs_inode *target_ip); 341 struct xfs_inode *target_ip, unsigned int flags);
342 342
343void xfs_ilock(xfs_inode_t *, uint); 343void xfs_ilock(xfs_inode_t *, uint);
344int xfs_ilock_nowait(xfs_inode_t *, uint); 344int xfs_ilock_nowait(xfs_inode_t *, uint);
@@ -377,6 +377,15 @@ int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
377int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); 377int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
378 378
379/* from xfs_file.c */ 379/* from xfs_file.c */
380enum xfs_prealloc_flags {
381 XFS_PREALLOC_SET = (1 << 1),
382 XFS_PREALLOC_CLEAR = (1 << 2),
383 XFS_PREALLOC_SYNC = (1 << 3),
384 XFS_PREALLOC_INVISIBLE = (1 << 4),
385};
386
387int xfs_update_prealloc_flags(struct xfs_inode *,
388 enum xfs_prealloc_flags);
380int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 389int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
381int xfs_iozero(struct xfs_inode *, loff_t, size_t); 390int xfs_iozero(struct xfs_inode *, loff_t, size_t);
382 391
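
The new enum xfs_prealloc_flags lets each caller accumulate a flag mask and hand it to a single helper instead of duplicating the transaction logic. A standalone model of that idiom, with hypothetical names mirroring the XFS ones:

/* Sketch: the flag-accumulation idiom the callers of
 * xfs_update_prealloc_flags() use. Standalone userspace model;
 * all names are illustrative stand-ins. */
#include <stdio.h>

enum prealloc_flags {
	PREALLOC_SET       = (1 << 1),
	PREALLOC_CLEAR     = (1 << 2),
	PREALLOC_SYNC      = (1 << 3),
	PREALLOC_INVISIBLE = (1 << 4),
};

static void update_prealloc(unsigned int flags)
{
	if (!(flags & PREALLOC_INVISIBLE))
		printf("would clear suid/sgid and bump timestamps\n");
	if (flags & PREALLOC_SET)
		printf("would set the PREALLOC inode flag\n");
	if (flags & PREALLOC_CLEAR)
		printf("would clear the PREALLOC inode flag\n");
	if (flags & PREALLOC_SYNC)
		printf("would commit the transaction synchronously\n");
}

int main(void)
{
	unsigned int flags = 0;

	flags |= PREALLOC_SET;	/* e.g. an XFS_IOC_RESVSP request */
	flags |= PREALLOC_SYNC;	/* e.g. the file was opened O_DSYNC */
	update_prealloc(flags);
	return 0;
}
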
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a1831980a68e..f7afb86c9148 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -606,11 +606,8 @@ xfs_ioc_space(
606 unsigned int cmd, 606 unsigned int cmd,
607 xfs_flock64_t *bf) 607 xfs_flock64_t *bf)
608{ 608{
609 struct xfs_mount *mp = ip->i_mount;
610 struct xfs_trans *tp;
611 struct iattr iattr; 609 struct iattr iattr;
612 bool setprealloc = false; 610 enum xfs_prealloc_flags flags = 0;
613 bool clrprealloc = false;
614 int error; 611 int error;
615 612
616 /* 613 /*
@@ -630,6 +627,11 @@ xfs_ioc_space(
630 if (!S_ISREG(inode->i_mode)) 627 if (!S_ISREG(inode->i_mode))
631 return -EINVAL; 628 return -EINVAL;
632 629
630 if (filp->f_flags & O_DSYNC)
631 flags |= XFS_PREALLOC_SYNC;
632 if (ioflags & XFS_IO_INVIS)
633 flags |= XFS_PREALLOC_INVISIBLE;
634
633 error = mnt_want_write_file(filp); 635 error = mnt_want_write_file(filp);
634 if (error) 636 if (error)
635 return error; 637 return error;
@@ -673,25 +675,23 @@ xfs_ioc_space(
673 } 675 }
674 676
675 if (bf->l_start < 0 || 677 if (bf->l_start < 0 ||
676 bf->l_start > mp->m_super->s_maxbytes || 678 bf->l_start > inode->i_sb->s_maxbytes ||
677 bf->l_start + bf->l_len < 0 || 679 bf->l_start + bf->l_len < 0 ||
678 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { 680 bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
679 error = -EINVAL; 681 error = -EINVAL;
680 goto out_unlock; 682 goto out_unlock;
681 } 683 }
682 684
683 switch (cmd) { 685 switch (cmd) {
684 case XFS_IOC_ZERO_RANGE: 686 case XFS_IOC_ZERO_RANGE:
687 flags |= XFS_PREALLOC_SET;
685 error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); 688 error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
686 if (!error)
687 setprealloc = true;
688 break; 689 break;
689 case XFS_IOC_RESVSP: 690 case XFS_IOC_RESVSP:
690 case XFS_IOC_RESVSP64: 691 case XFS_IOC_RESVSP64:
692 flags |= XFS_PREALLOC_SET;
691 error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, 693 error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
692 XFS_BMAPI_PREALLOC); 694 XFS_BMAPI_PREALLOC);
693 if (!error)
694 setprealloc = true;
695 break; 695 break;
696 case XFS_IOC_UNRESVSP: 696 case XFS_IOC_UNRESVSP:
697 case XFS_IOC_UNRESVSP64: 697 case XFS_IOC_UNRESVSP64:
@@ -701,6 +701,7 @@ xfs_ioc_space(
701 case XFS_IOC_ALLOCSP64: 701 case XFS_IOC_ALLOCSP64:
702 case XFS_IOC_FREESP: 702 case XFS_IOC_FREESP:
703 case XFS_IOC_FREESP64: 703 case XFS_IOC_FREESP64:
704 flags |= XFS_PREALLOC_CLEAR;
704 if (bf->l_start > XFS_ISIZE(ip)) { 705 if (bf->l_start > XFS_ISIZE(ip)) {
705 error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), 706 error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
706 bf->l_start - XFS_ISIZE(ip), 0); 707 bf->l_start - XFS_ISIZE(ip), 0);
@@ -712,8 +713,6 @@ xfs_ioc_space(
712 iattr.ia_size = bf->l_start; 713 iattr.ia_size = bf->l_start;
713 714
714 error = xfs_setattr_size(ip, &iattr); 715 error = xfs_setattr_size(ip, &iattr);
715 if (!error)
716 clrprealloc = true;
717 break; 716 break;
718 default: 717 default:
719 ASSERT(0); 718 ASSERT(0);
@@ -723,32 +722,7 @@ xfs_ioc_space(
723 if (error) 722 if (error)
724 goto out_unlock; 723 goto out_unlock;
725 724
726 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); 725 error = xfs_update_prealloc_flags(ip, flags);
727 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
728 if (error) {
729 xfs_trans_cancel(tp, 0);
730 goto out_unlock;
731 }
732
733 xfs_ilock(ip, XFS_ILOCK_EXCL);
734 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
735
736 if (!(ioflags & XFS_IO_INVIS)) {
737 ip->i_d.di_mode &= ~S_ISUID;
738 if (ip->i_d.di_mode & S_IXGRP)
739 ip->i_d.di_mode &= ~S_ISGID;
740 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
741 }
742
743 if (setprealloc)
744 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
745 else if (clrprealloc)
746 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
747
748 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
749 if (filp->f_flags & O_DSYNC)
750 xfs_trans_set_sync(tp);
751 error = xfs_trans_commit(tp, 0);
752 726
753out_unlock: 727out_unlock:
754 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 728 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -1013,20 +987,182 @@ xfs_diflags_to_linux(
1013 inode->i_flags &= ~S_NOATIME; 987 inode->i_flags &= ~S_NOATIME;
1014} 988}
1015 989
1016#define FSX_PROJID 1 990static int
1017#define FSX_EXTSIZE 2 991xfs_ioctl_setattr_xflags(
1018#define FSX_XFLAGS 4 992 struct xfs_trans *tp,
1019#define FSX_NONBLOCK 8 993 struct xfs_inode *ip,
994 struct fsxattr *fa)
995{
996 struct xfs_mount *mp = ip->i_mount;
997
998 /* Can't change realtime flag if any extents are allocated. */
999 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
1000 XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
1001 return -EINVAL;
1002
1003 /* If realtime flag is set then must have realtime device */
1004 if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
1005 if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
1006 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
1007 return -EINVAL;
1008 }
1009
1010 /*
1011 * Can't modify an immutable/append-only file unless
1012 * we have appropriate permission.
1013 */
1014 if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
1015 (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
1016 !capable(CAP_LINUX_IMMUTABLE))
1017 return -EPERM;
1018
1019 xfs_set_diflags(ip, fa->fsx_xflags);
1020 xfs_diflags_to_linux(ip);
1021 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1022 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1023 XFS_STATS_INC(xs_ig_attrchg);
1024 return 0;
1025}
1026
1027/*
1028 * Set up the transaction structure for the setattr operation, checking that we
1029 * have permission to do so. On success, return a clean transaction and the
1030 * inode locked exclusively ready for further operation specific checks. On
1031 * failure, return an error without modifying or locking the inode.
1032 */
1033static struct xfs_trans *
1034xfs_ioctl_setattr_get_trans(
1035 struct xfs_inode *ip)
1036{
1037 struct xfs_mount *mp = ip->i_mount;
1038 struct xfs_trans *tp;
1039 int error;
1040
1041 if (mp->m_flags & XFS_MOUNT_RDONLY)
1042 return ERR_PTR(-EROFS);
1043 if (XFS_FORCED_SHUTDOWN(mp))
1044 return ERR_PTR(-EIO);
1045
1046 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
1047 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1048 if (error)
1049 goto out_cancel;
1050
1051 xfs_ilock(ip, XFS_ILOCK_EXCL);
1052 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1053
1054 /*
1055 * CAP_FOWNER overrides the following restrictions:
1056 *
1057 * The user ID of the calling process must be equal to the file owner
1058 * ID, except in cases where the CAP_FSETID capability is applicable.
1059 */
1060 if (!inode_owner_or_capable(VFS_I(ip))) {
1061 error = -EPERM;
1062 goto out_cancel;
1063 }
1064
1065 if (mp->m_flags & XFS_MOUNT_WSYNC)
1066 xfs_trans_set_sync(tp);
1067
1068 return tp;
1069
1070out_cancel:
1071 xfs_trans_cancel(tp, 0);
1072 return ERR_PTR(error);
1073}
1074
1075/*
 1076 * Extent size hint validation is somewhat cumbersome. The rules are:
1077 *
1078 * 1. extent size hint is only valid for directories and regular files
1079 * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
1080 * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
1081 * 4. can only be changed on regular files if no extents are allocated
1082 * 5. can be changed on directories at any time
1083 * 6. extsize hint of 0 turns off hints, clears inode flags.
1084 * 7. Extent size must be a multiple of the appropriate block size.
1085 * 8. for non-realtime files, the extent size hint must be limited
1086 * to half the AG size to avoid alignment extending the extent beyond the
1087 * limits of the AG.
1088 */
1089static int
1090xfs_ioctl_setattr_check_extsize(
1091 struct xfs_inode *ip,
1092 struct fsxattr *fa)
1093{
1094 struct xfs_mount *mp = ip->i_mount;
1095
1096 if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
1097 return -EINVAL;
1098
1099 if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
1100 !S_ISDIR(ip->i_d.di_mode))
1101 return -EINVAL;
1102
1103 if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
1104 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
1105 return -EINVAL;
1106
1107 if (fa->fsx_extsize != 0) {
1108 xfs_extlen_t size;
1109 xfs_fsblock_t extsize_fsb;
1110
1111 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1112 if (extsize_fsb > MAXEXTLEN)
1113 return -EINVAL;
1114
1115 if (XFS_IS_REALTIME_INODE(ip) ||
1116 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1117 size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
1118 } else {
1119 size = mp->m_sb.sb_blocksize;
1120 if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
1121 return -EINVAL;
1122 }
1123
1124 if (fa->fsx_extsize % size)
1125 return -EINVAL;
1126 } else
1127 fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
1128
1129 return 0;
1130}
1131
1132static int
1133xfs_ioctl_setattr_check_projid(
1134 struct xfs_inode *ip,
1135 struct fsxattr *fa)
1136{
1137 /* Disallow 32bit project ids if projid32bit feature is not enabled. */
1138 if (fa->fsx_projid > (__uint16_t)-1 &&
1139 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
1140 return -EINVAL;
1141
1142 /*
1143 * Project Quota ID state is only allowed to change from within the init
1144 * namespace. Enforce that restriction only if we are trying to change
1145 * the quota ID state. Everything else is allowed in user namespaces.
1146 */
1147 if (current_user_ns() == &init_user_ns)
1148 return 0;
1149
1150 if (xfs_get_projid(ip) != fa->fsx_projid)
1151 return -EINVAL;
1152 if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
1153 (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
1154 return -EINVAL;
1155
1156 return 0;
1157}
1020 1158
1021STATIC int 1159STATIC int
1022xfs_ioctl_setattr( 1160xfs_ioctl_setattr(
1023 xfs_inode_t *ip, 1161 xfs_inode_t *ip,
1024 struct fsxattr *fa, 1162 struct fsxattr *fa)
1025 int mask)
1026{ 1163{
1027 struct xfs_mount *mp = ip->i_mount; 1164 struct xfs_mount *mp = ip->i_mount;
1028 struct xfs_trans *tp; 1165 struct xfs_trans *tp;
1029 unsigned int lock_flags = 0;
1030 struct xfs_dquot *udqp = NULL; 1166 struct xfs_dquot *udqp = NULL;
1031 struct xfs_dquot *pdqp = NULL; 1167 struct xfs_dquot *pdqp = NULL;
1032 struct xfs_dquot *olddquot = NULL; 1168 struct xfs_dquot *olddquot = NULL;
@@ -1034,17 +1170,9 @@ xfs_ioctl_setattr(
1034 1170
1035 trace_xfs_ioctl_setattr(ip); 1171 trace_xfs_ioctl_setattr(ip);
1036 1172
1037 if (mp->m_flags & XFS_MOUNT_RDONLY) 1173 code = xfs_ioctl_setattr_check_projid(ip, fa);
1038 return -EROFS; 1174 if (code)
1039 if (XFS_FORCED_SHUTDOWN(mp)) 1175 return code;
1040 return -EIO;
1041
1042 /*
1043 * Disallow 32bit project ids when projid32bit feature is not enabled.
1044 */
1045 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
1046 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
1047 return -EINVAL;
1048 1176
1049 /* 1177 /*
1050 * If disk quotas is on, we make sure that the dquots do exist on disk, 1178 * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1054,7 +1182,7 @@ xfs_ioctl_setattr(
1054 * If the IDs do change before we take the ilock, we're covered 1182 * If the IDs do change before we take the ilock, we're covered
1055 * because the i_*dquot fields will get updated anyway. 1183 * because the i_*dquot fields will get updated anyway.
1056 */ 1184 */
1057 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { 1185 if (XFS_IS_QUOTA_ON(mp)) {
1058 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, 1186 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
1059 ip->i_d.di_gid, fa->fsx_projid, 1187 ip->i_d.di_gid, fa->fsx_projid,
1060 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); 1188 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
@@ -1062,175 +1190,49 @@ xfs_ioctl_setattr(
1062 return code; 1190 return code;
1063 } 1191 }
1064 1192
1065 /* 1193 tp = xfs_ioctl_setattr_get_trans(ip);
1066 * For the other attributes, we acquire the inode lock and 1194 if (IS_ERR(tp)) {
1067 * first do an error checking pass. 1195 code = PTR_ERR(tp);
1068 */ 1196 goto error_free_dquots;
1069 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
1070 code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1071 if (code)
1072 goto error_return;
1073
1074 lock_flags = XFS_ILOCK_EXCL;
1075 xfs_ilock(ip, lock_flags);
1076
1077 /*
1078 * CAP_FOWNER overrides the following restrictions:
1079 *
1080 * The user ID of the calling process must be equal
1081 * to the file owner ID, except in cases where the
1082 * CAP_FSETID capability is applicable.
1083 */
1084 if (!inode_owner_or_capable(VFS_I(ip))) {
1085 code = -EPERM;
1086 goto error_return;
1087 }
1088
1089 /*
1090 * Do a quota reservation only if projid is actually going to change.
1091 * Only allow changing of projid from init_user_ns since it is a
1092 * non user namespace aware identifier.
1093 */
1094 if (mask & FSX_PROJID) {
1095 if (current_user_ns() != &init_user_ns) {
1096 code = -EINVAL;
1097 goto error_return;
1098 }
1099
1100 if (XFS_IS_QUOTA_RUNNING(mp) &&
1101 XFS_IS_PQUOTA_ON(mp) &&
1102 xfs_get_projid(ip) != fa->fsx_projid) {
1103 ASSERT(tp);
1104 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
1105 pdqp, capable(CAP_FOWNER) ?
1106 XFS_QMOPT_FORCE_RES : 0);
1107 if (code) /* out of quota */
1108 goto error_return;
1109 }
1110 } 1197 }
1111 1198
1112 if (mask & FSX_EXTSIZE) {
1113 /*
1114 * Can't change extent size if any extents are allocated.
1115 */
1116 if (ip->i_d.di_nextents &&
1117 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
1118 fa->fsx_extsize)) {
1119 code = -EINVAL; /* EFBIG? */
1120 goto error_return;
1121 }
1122 1199
1123 /* 1200 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
1124 * Extent size must be a multiple of the appropriate block 1201 xfs_get_projid(ip) != fa->fsx_projid) {
1125 * size, if set at all. It must also be smaller than the 1202 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
1126 * maximum extent size supported by the filesystem. 1203 capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
1127 * 1204 if (code) /* out of quota */
1128 * Also, for non-realtime files, limit the extent size hint to 1205 goto error_trans_cancel;
1129 * half the size of the AGs in the filesystem so alignment
1130 * doesn't result in extents larger than an AG.
1131 */
1132 if (fa->fsx_extsize != 0) {
1133 xfs_extlen_t size;
1134 xfs_fsblock_t extsize_fsb;
1135
1136 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1137 if (extsize_fsb > MAXEXTLEN) {
1138 code = -EINVAL;
1139 goto error_return;
1140 }
1141
1142 if (XFS_IS_REALTIME_INODE(ip) ||
1143 ((mask & FSX_XFLAGS) &&
1144 (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
1145 size = mp->m_sb.sb_rextsize <<
1146 mp->m_sb.sb_blocklog;
1147 } else {
1148 size = mp->m_sb.sb_blocksize;
1149 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1150 code = -EINVAL;
1151 goto error_return;
1152 }
1153 }
1154
1155 if (fa->fsx_extsize % size) {
1156 code = -EINVAL;
1157 goto error_return;
1158 }
1159 }
1160 } 1206 }
1161 1207
1208 code = xfs_ioctl_setattr_check_extsize(ip, fa);
1209 if (code)
1210 goto error_trans_cancel;
1162 1211
1163 if (mask & FSX_XFLAGS) { 1212 code = xfs_ioctl_setattr_xflags(tp, ip, fa);
1164 /* 1213 if (code)
1165 * Can't change realtime flag if any extents are allocated. 1214 goto error_trans_cancel;
1166 */
1167 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
1168 (XFS_IS_REALTIME_INODE(ip)) !=
1169 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1170 code = -EINVAL; /* EFBIG? */
1171 goto error_return;
1172 }
1173
1174 /*
1175 * If realtime flag is set then must have realtime data.
1176 */
1177 if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1178 if ((mp->m_sb.sb_rblocks == 0) ||
1179 (mp->m_sb.sb_rextsize == 0) ||
1180 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
1181 code = -EINVAL;
1182 goto error_return;
1183 }
1184 }
1185
1186 /*
1187 * Can't modify an immutable/append-only file unless
1188 * we have appropriate permission.
1189 */
1190 if ((ip->i_d.di_flags &
1191 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
1192 (fa->fsx_xflags &
1193 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
1194 !capable(CAP_LINUX_IMMUTABLE)) {
1195 code = -EPERM;
1196 goto error_return;
1197 }
1198 }
1199
1200 xfs_trans_ijoin(tp, ip, 0);
1201 1215
1202 /* 1216 /*
1203 * Change file ownership. Must be the owner or privileged. 1217 * Change file ownership. Must be the owner or privileged. CAP_FSETID
1218 * overrides the following restrictions:
1219 *
1220 * The set-user-ID and set-group-ID bits of a file will be cleared upon
1221 * successful return from chown()
1204 */ 1222 */
1205 if (mask & FSX_PROJID) {
1206 /*
1207 * CAP_FSETID overrides the following restrictions:
1208 *
1209 * The set-user-ID and set-group-ID bits of a file will be
1210 * cleared upon successful return from chown()
1211 */
1212 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
1213 !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
1214 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
1215
1216 /*
1217 * Change the ownerships and register quota modifications
1218 * in the transaction.
1219 */
1220 if (xfs_get_projid(ip) != fa->fsx_projid) {
1221 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1222 olddquot = xfs_qm_vop_chown(tp, ip,
1223 &ip->i_pdquot, pdqp);
1224 }
1225 ASSERT(ip->i_d.di_version > 1);
1226 xfs_set_projid(ip, fa->fsx_projid);
1227 }
1228 1223
1229 } 1224 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
1225 !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
1226 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
1230 1227
1231 if (mask & FSX_XFLAGS) { 1228 /* Change the ownerships and register project quota modifications */
1232 xfs_set_diflags(ip, fa->fsx_xflags); 1229 if (xfs_get_projid(ip) != fa->fsx_projid) {
1233 xfs_diflags_to_linux(ip); 1230 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1231 olddquot = xfs_qm_vop_chown(tp, ip,
1232 &ip->i_pdquot, pdqp);
1233 }
1234 ASSERT(ip->i_d.di_version > 1);
1235 xfs_set_projid(ip, fa->fsx_projid);
1234 } 1236 }
1235 1237
1236 /* 1238 /*
@@ -1238,34 +1240,12 @@ xfs_ioctl_setattr(
1238 * extent size hint should be set on the inode. If no extent size flags 1240 * extent size hint should be set on the inode. If no extent size flags
1239 * are set on the inode then unconditionally clear the extent size hint. 1241 * are set on the inode then unconditionally clear the extent size hint.
1240 */ 1242 */
1241 if (mask & FSX_EXTSIZE) { 1243 if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
1242 int extsize = 0; 1244 ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1243 1245 else
1244 if (ip->i_d.di_flags & 1246 ip->i_d.di_extsize = 0;
1245 (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
1246 extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1247 ip->i_d.di_extsize = extsize;
1248 }
1249
1250 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1251 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1252
1253 XFS_STATS_INC(xs_ig_attrchg);
1254 1247
1255 /*
1256 * If this is a synchronous mount, make sure that the
1257 * transaction goes to disk before returning to the user.
1258 * This is slightly sub-optimal in that truncates require
1259 * two sync transactions instead of one for wsync filesystems.
1260 * One for the truncate and one for the timestamps since we
1261 * don't want to change the timestamps unless we're sure the
1262 * truncate worked. Truncates are less than 1% of the laddis
1263 * mix so this probably isn't worth the trouble to optimize.
1264 */
1265 if (mp->m_flags & XFS_MOUNT_WSYNC)
1266 xfs_trans_set_sync(tp);
1267 code = xfs_trans_commit(tp, 0); 1248 code = xfs_trans_commit(tp, 0);
1268 xfs_iunlock(ip, lock_flags);
1269 1249
1270 /* 1250 /*
1271 * Release any dquot(s) the inode had kept before chown. 1251 * Release any dquot(s) the inode had kept before chown.
@@ -1276,12 +1256,11 @@ xfs_ioctl_setattr(
1276 1256
1277 return code; 1257 return code;
1278 1258
1279 error_return: 1259error_trans_cancel:
1260 xfs_trans_cancel(tp, 0);
1261error_free_dquots:
1280 xfs_qm_dqrele(udqp); 1262 xfs_qm_dqrele(udqp);
1281 xfs_qm_dqrele(pdqp); 1263 xfs_qm_dqrele(pdqp);
1282 xfs_trans_cancel(tp, 0);
1283 if (lock_flags)
1284 xfs_iunlock(ip, lock_flags);
1285 return code; 1264 return code;
1286} 1265}
1287 1266
@@ -1292,20 +1271,15 @@ xfs_ioc_fssetxattr(
1292 void __user *arg) 1271 void __user *arg)
1293{ 1272{
1294 struct fsxattr fa; 1273 struct fsxattr fa;
1295 unsigned int mask;
1296 int error; 1274 int error;
1297 1275
1298 if (copy_from_user(&fa, arg, sizeof(fa))) 1276 if (copy_from_user(&fa, arg, sizeof(fa)))
1299 return -EFAULT; 1277 return -EFAULT;
1300 1278
1301 mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
1302 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1303 mask |= FSX_NONBLOCK;
1304
1305 error = mnt_want_write_file(filp); 1279 error = mnt_want_write_file(filp);
1306 if (error) 1280 if (error)
1307 return error; 1281 return error;
1308 error = xfs_ioctl_setattr(ip, &fa, mask); 1282 error = xfs_ioctl_setattr(ip, &fa);
1309 mnt_drop_write_file(filp); 1283 mnt_drop_write_file(filp);
1310 return error; 1284 return error;
1311} 1285}
@@ -1325,14 +1299,14 @@ xfs_ioc_getxflags(
1325 1299
1326STATIC int 1300STATIC int
1327xfs_ioc_setxflags( 1301xfs_ioc_setxflags(
1328 xfs_inode_t *ip, 1302 struct xfs_inode *ip,
1329 struct file *filp, 1303 struct file *filp,
1330 void __user *arg) 1304 void __user *arg)
1331{ 1305{
1306 struct xfs_trans *tp;
1332 struct fsxattr fa; 1307 struct fsxattr fa;
1333 unsigned int flags; 1308 unsigned int flags;
1334 unsigned int mask; 1309 int error;
1335 int error;
1336 1310
1337 if (copy_from_user(&flags, arg, sizeof(flags))) 1311 if (copy_from_user(&flags, arg, sizeof(flags)))
1338 return -EFAULT; 1312 return -EFAULT;
@@ -1342,15 +1316,26 @@ xfs_ioc_setxflags(
1342 FS_SYNC_FL)) 1316 FS_SYNC_FL))
1343 return -EOPNOTSUPP; 1317 return -EOPNOTSUPP;
1344 1318
1345 mask = FSX_XFLAGS;
1346 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1347 mask |= FSX_NONBLOCK;
1348 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); 1319 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
1349 1320
1350 error = mnt_want_write_file(filp); 1321 error = mnt_want_write_file(filp);
1351 if (error) 1322 if (error)
1352 return error; 1323 return error;
1353 error = xfs_ioctl_setattr(ip, &fa, mask); 1324
1325 tp = xfs_ioctl_setattr_get_trans(ip);
1326 if (IS_ERR(tp)) {
1327 error = PTR_ERR(tp);
1328 goto out_drop_write;
1329 }
1330
1331 error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
1332 if (error) {
1333 xfs_trans_cancel(tp, 0);
1334 goto out_drop_write;
1335 }
1336
1337 error = xfs_trans_commit(tp, 0);
1338out_drop_write:
1354 mnt_drop_write_file(filp); 1339 mnt_drop_write_file(filp);
1355 return error; 1340 return error;
1356} 1341}
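
The refactored ioctl code above has xfs_ioctl_setattr_get_trans() return either a locked, reserved transaction or an errno encoded in the pointer via ERR_PTR(). A self-contained userspace re-implementation of that convention, for illustration only (the real helpers live in the kernel's err.h):

/* Sketch: the ERR_PTR/IS_ERR convention used by
 * xfs_ioctl_setattr_get_trans() to return either an object or an
 * errno through one pointer. Userspace model of the kernel helpers. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* the top MAX_ERRNO addresses are reserved for error codes */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct trans { int id; };

static struct trans *get_trans(int fail)
{
	if (fail)
		return ERR_PTR(-EROFS);	/* errno travels in the pointer */

	struct trans *tp = malloc(sizeof(*tp));
	if (!tp)
		return ERR_PTR(-ENOMEM);
	tp->id = 1;
	return tp;
}

int main(void)
{
	struct trans *tp = get_trans(1);

	if (IS_ERR(tp)) {
		printf("get_trans failed: %ld\n", PTR_ERR(tp));
		return 1;
	}
	free(tp);
	return 0;
}
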
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ec6772866f3d..bfc7c7c8a0c8 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -423,7 +423,7 @@ xfs_compat_attrmulti_by_handle(
423 423
424 ops = memdup_user(compat_ptr(am_hreq.ops), size); 424 ops = memdup_user(compat_ptr(am_hreq.ops), size);
425 if (IS_ERR(ops)) { 425 if (IS_ERR(ops)) {
426 error = -PTR_ERR(ops); 426 error = PTR_ERR(ops);
427 goto out_dput; 427 goto out_dput;
428 } 428 }
429 429
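
The one-character fix above drops a double negation: memdup_user() already returns a negative errno through PTR_ERR(), so negating it produced a positive value that downstream "error < 0" checks missed. A standalone model of the bug:

/* Sketch of the sign bug fixed above. PTR_ERR() yields a negative
 * errno, so negating it flips the sign and breaks error checks. */
#include <errno.h>
#include <stdio.h>

static inline long PTR_ERR(const void *ptr) { return (long)ptr; }

int main(void)
{
	void *ops = (void *)(long)-EFAULT;	/* a memdup_user() failure */

	long buggy = -PTR_ERR(ops);	/* +14: wrong sign for an errno */
	long fixed = PTR_ERR(ops);	/* -14: a proper negative errno */

	printf("buggy=%ld fixed=%ld\n", buggy, fixed);
	return 0;
}
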
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c980e2a5086b..ccb1dd0d509e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,7 +802,7 @@ int
802xfs_iomap_write_unwritten( 802xfs_iomap_write_unwritten(
803 xfs_inode_t *ip, 803 xfs_inode_t *ip,
804 xfs_off_t offset, 804 xfs_off_t offset,
805 size_t count) 805 xfs_off_t count)
806{ 806{
807 xfs_mount_t *mp = ip->i_mount; 807 xfs_mount_t *mp = ip->i_mount;
808 xfs_fileoff_t offset_fsb; 808 xfs_fileoff_t offset_fsb;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 411fbb8919ef..8688e663d744 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,6 +27,6 @@ int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *); 27 struct xfs_bmbt_irec *);
28int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, 28int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
29 struct xfs_bmbt_irec *); 29 struct xfs_bmbt_irec *);
30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
31 31
32#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
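
The count argument of xfs_iomap_write_unwritten() widens from size_t to xfs_off_t because size_t is 32 bits on 32-bit builds, so an unwritten-extent conversion spanning more than 4 GiB would silently truncate. A small demonstration, modelling a 32-bit size_t with uint32_t:

/* Sketch of the truncation the prototype change avoids. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t count = 5LL << 30;		/* a 5 GiB range */
	uint32_t as_size_t32 = (uint32_t)count;	/* 32-bit size_t */
	int64_t as_off_t = count;		/* 64-bit xfs_off_t */

	printf("requested:        %" PRId64 " bytes\n", count);
	printf("as 32-bit size_t: %" PRIu32 " bytes (truncated)\n",
	       as_size_t32);
	printf("as xfs_off_t:     %" PRId64 " bytes (intact)\n", as_off_t);
	return 0;
}
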
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c50311cae1b1..ce80eeb8faa4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -380,18 +380,27 @@ xfs_vn_rename(
380 struct inode *odir, 380 struct inode *odir,
381 struct dentry *odentry, 381 struct dentry *odentry,
382 struct inode *ndir, 382 struct inode *ndir,
383 struct dentry *ndentry) 383 struct dentry *ndentry,
384 unsigned int flags)
384{ 385{
385 struct inode *new_inode = ndentry->d_inode; 386 struct inode *new_inode = ndentry->d_inode;
387 int omode = 0;
386 struct xfs_name oname; 388 struct xfs_name oname;
387 struct xfs_name nname; 389 struct xfs_name nname;
388 390
389 xfs_dentry_to_name(&oname, odentry, 0); 391 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
392 return -EINVAL;
393
394 /* if we are exchanging files, we need to set i_mode of both files */
395 if (flags & RENAME_EXCHANGE)
396 omode = ndentry->d_inode->i_mode;
397
398 xfs_dentry_to_name(&oname, odentry, omode);
390 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); 399 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
391 400
392 return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), 401 return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
393 XFS_I(ndir), &nname, new_inode ? 402 XFS_I(ndir), &nname,
394 XFS_I(new_inode) : NULL); 403 new_inode ? XFS_I(new_inode) : NULL, flags);
395} 404}
396 405
397/* 406/*
@@ -1144,7 +1153,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
1144 */ 1153 */
1145 .rmdir = xfs_vn_unlink, 1154 .rmdir = xfs_vn_unlink,
1146 .mknod = xfs_vn_mknod, 1155 .mknod = xfs_vn_mknod,
1147 .rename = xfs_vn_rename, 1156 .rename2 = xfs_vn_rename,
1148 .get_acl = xfs_get_acl, 1157 .get_acl = xfs_get_acl,
1149 .set_acl = xfs_set_acl, 1158 .set_acl = xfs_set_acl,
1150 .getattr = xfs_vn_getattr, 1159 .getattr = xfs_vn_getattr,
@@ -1172,7 +1181,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1172 */ 1181 */
1173 .rmdir = xfs_vn_unlink, 1182 .rmdir = xfs_vn_unlink,
1174 .mknod = xfs_vn_mknod, 1183 .mknod = xfs_vn_mknod,
1175 .rename = xfs_vn_rename, 1184 .rename2 = xfs_vn_rename,
1176 .get_acl = xfs_get_acl, 1185 .get_acl = xfs_get_acl,
1177 .set_acl = xfs_set_acl, 1186 .set_acl = xfs_set_acl,
1178 .getattr = xfs_vn_getattr, 1187 .getattr = xfs_vn_getattr,
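
xfs_vn_rename() now validates its flags argument up front, rejecting any bit outside RENAME_NOREPLACE | RENAME_EXCHANGE so that flags added by future kernels fail with -EINVAL instead of being silently ignored. A standalone model of that guard; the flag values match the uapi definitions:

/* Sketch: the reject-unknown-flags idiom adopted above. */
#include <errno.h>
#include <stdio.h>

#define RENAME_NOREPLACE (1 << 0)
#define RENAME_EXCHANGE  (1 << 1)
#define RENAME_WHITEOUT  (1 << 2)	/* not supported in this sketch */

static int do_rename(unsigned int flags)
{
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("EXCHANGE: %d\n", do_rename(RENAME_EXCHANGE));	/* 0 */
	printf("WHITEOUT: %d\n", do_rename(RENAME_WHITEOUT));	/* -22 */
	return 0;
}
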
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e408bf5a3ff7..bcc7cfabb787 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -33,6 +33,7 @@
33#include "xfs_fsops.h" 33#include "xfs_fsops.h"
34#include "xfs_cksum.h" 34#include "xfs_cksum.h"
35#include "xfs_sysfs.h" 35#include "xfs_sysfs.h"
36#include "xfs_sb.h"
36 37
37kmem_zone_t *xfs_log_ticket_zone; 38kmem_zone_t *xfs_log_ticket_zone;
38 39
@@ -1290,9 +1291,20 @@ xfs_log_worker(
1290 struct xfs_mount *mp = log->l_mp; 1291 struct xfs_mount *mp = log->l_mp;
1291 1292
1292 /* dgc: errors ignored - not fatal and nowhere to report them */ 1293 /* dgc: errors ignored - not fatal and nowhere to report them */
1293 if (xfs_log_need_covered(mp)) 1294 if (xfs_log_need_covered(mp)) {
1294 xfs_fs_log_dummy(mp); 1295 /*
1295 else 1296 * Dump a transaction into the log that contains no real change.
1297 * This is needed to stamp the current tail LSN into the log
1298 * during the covering operation.
1299 *
1300 * We cannot use an inode here for this - that will push dirty
1301 * state back up into the VFS and then periodic inode flushing
1302 * will prevent log covering from making progress. Hence we
1303 * synchronously log the superblock instead to ensure the
1304 * superblock is immediately unpinned and can be written back.
1305 */
1306 xfs_sync_sb(mp, true);
1307 } else
1296 xfs_log_force(mp, 0); 1308 xfs_log_force(mp, 0);
1297 1309
1298 /* start pushing all the metadata that is currently dirty */ 1310 /* start pushing all the metadata that is currently dirty */
@@ -1395,6 +1407,8 @@ xlog_alloc_log(
1395 ASSERT(xfs_buf_islocked(bp)); 1407 ASSERT(xfs_buf_islocked(bp));
1396 xfs_buf_unlock(bp); 1408 xfs_buf_unlock(bp);
1397 1409
1410 /* use high priority wq for log I/O completion */
1411 bp->b_ioend_wq = mp->m_log_workqueue;
1398 bp->b_iodone = xlog_iodone; 1412 bp->b_iodone = xlog_iodone;
1399 log->l_xbuf = bp; 1413 log->l_xbuf = bp;
1400 1414
@@ -1427,6 +1441,8 @@ xlog_alloc_log(
1427 ASSERT(xfs_buf_islocked(bp)); 1441 ASSERT(xfs_buf_islocked(bp));
1428 xfs_buf_unlock(bp); 1442 xfs_buf_unlock(bp);
1429 1443
1444 /* use high priority wq for log I/O completion */
1445 bp->b_ioend_wq = mp->m_log_workqueue;
1430 bp->b_iodone = xlog_iodone; 1446 bp->b_iodone = xlog_iodone;
1431 iclog->ic_bp = bp; 1447 iclog->ic_bp = bp;
1432 iclog->ic_data = bp->b_addr; 1448 iclog->ic_data = bp->b_addr;
@@ -1806,8 +1822,6 @@ xlog_sync(
1806 XFS_BUF_ZEROFLAGS(bp); 1822 XFS_BUF_ZEROFLAGS(bp);
1807 XFS_BUF_ASYNC(bp); 1823 XFS_BUF_ASYNC(bp);
1808 bp->b_flags |= XBF_SYNCIO; 1824 bp->b_flags |= XBF_SYNCIO;
1809 /* use high priority completion wq */
1810 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1811 1825
1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { 1826 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1813 bp->b_flags |= XBF_FUA; 1827 bp->b_flags |= XBF_FUA;
@@ -1856,8 +1870,6 @@ xlog_sync(
1856 bp->b_flags |= XBF_SYNCIO; 1870 bp->b_flags |= XBF_SYNCIO;
1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1871 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1858 bp->b_flags |= XBF_FUA; 1872 bp->b_flags |= XBF_FUA;
1859 /* use high priority completion wq */
1860 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1861 1873
1862 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1874 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1863 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1875 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -2027,7 +2039,7 @@ xlog_print_tic_res(
2027 " total reg = %u bytes (o/flow = %u bytes)\n" 2039 " total reg = %u bytes (o/flow = %u bytes)\n"
2028 " ophdrs = %u (ophdr space = %u bytes)\n" 2040 " ophdrs = %u (ophdr space = %u bytes)\n"
2029 " ophdr + reg = %u bytes\n" 2041 " ophdr + reg = %u bytes\n"
2030 " num regions = %u\n", 2042 " num regions = %u",
2031 ((ticket->t_trans_type <= 0 || 2043 ((ticket->t_trans_type <= 0 ||
2032 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 2044 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
2033 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 2045 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d3d38836f87f..4fa80e63eea2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -408,11 +408,11 @@ xfs_update_alignment(xfs_mount_t *mp)
408 if (xfs_sb_version_hasdalign(sbp)) { 408 if (xfs_sb_version_hasdalign(sbp)) {
409 if (sbp->sb_unit != mp->m_dalign) { 409 if (sbp->sb_unit != mp->m_dalign) {
410 sbp->sb_unit = mp->m_dalign; 410 sbp->sb_unit = mp->m_dalign;
411 mp->m_update_flags |= XFS_SB_UNIT; 411 mp->m_update_sb = true;
412 } 412 }
413 if (sbp->sb_width != mp->m_swidth) { 413 if (sbp->sb_width != mp->m_swidth) {
414 sbp->sb_width = mp->m_swidth; 414 sbp->sb_width = mp->m_swidth;
415 mp->m_update_flags |= XFS_SB_WIDTH; 415 mp->m_update_sb = true;
416 } 416 }
417 } else { 417 } else {
418 xfs_warn(mp, 418 xfs_warn(mp,
@@ -583,38 +583,19 @@ int
583xfs_mount_reset_sbqflags( 583xfs_mount_reset_sbqflags(
584 struct xfs_mount *mp) 584 struct xfs_mount *mp)
585{ 585{
586 int error;
587 struct xfs_trans *tp;
588
589 mp->m_qflags = 0; 586 mp->m_qflags = 0;
590 587
591 /* 588 /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
592 * It is OK to look at sb_qflags here in mount path,
593 * without m_sb_lock.
594 */
595 if (mp->m_sb.sb_qflags == 0) 589 if (mp->m_sb.sb_qflags == 0)
596 return 0; 590 return 0;
597 spin_lock(&mp->m_sb_lock); 591 spin_lock(&mp->m_sb_lock);
598 mp->m_sb.sb_qflags = 0; 592 mp->m_sb.sb_qflags = 0;
599 spin_unlock(&mp->m_sb_lock); 593 spin_unlock(&mp->m_sb_lock);
600 594
601 /* 595 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
602 * If the fs is readonly, let the incore superblock run
603 * with quotas off but don't flush the update out to disk
604 */
605 if (mp->m_flags & XFS_MOUNT_RDONLY)
606 return 0; 596 return 0;
607 597
608 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 598 return xfs_sync_sb(mp, false);
609 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
610 if (error) {
611 xfs_trans_cancel(tp, 0);
612 xfs_alert(mp, "%s: Superblock update failed!", __func__);
613 return error;
614 }
615
616 xfs_mod_sb(tp, XFS_SB_QFLAGS);
617 return xfs_trans_commit(tp, 0);
618} 599}
619 600
620__uint64_t 601__uint64_t
@@ -659,26 +640,25 @@ xfs_mountfs(
659 xfs_sb_mount_common(mp, sbp); 640 xfs_sb_mount_common(mp, sbp);
660 641
661 /* 642 /*
662 * Check for a mismatched features2 values. Older kernels 643 * Check for a mismatched features2 values. Older kernels read & wrote
663 * read & wrote into the wrong sb offset for sb_features2 644 * into the wrong sb offset for sb_features2 on some platforms due to
664 * on some platforms due to xfs_sb_t not being 64bit size aligned 645 * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
665 * when sb_features2 was added, which made older superblock 646 * which made older superblock reading/writing routines swap it as a
666 * reading/writing routines swap it as a 64-bit value. 647 * 64-bit value.
667 * 648 *
668 * For backwards compatibility, we make both slots equal. 649 * For backwards compatibility, we make both slots equal.
669 * 650 *
670 * If we detect a mismatched field, we OR the set bits into the 651 * If we detect a mismatched field, we OR the set bits into the existing
671 * existing features2 field in case it has already been modified; we 652 * features2 field in case it has already been modified; we don't want
672 * don't want to lose any features. We then update the bad location 653 * to lose any features. We then update the bad location with the ORed
673 * with the ORed value so that older kernels will see any features2 654 * value so that older kernels will see any features2 flags. The
674 * flags, and mark the two fields as needing updates once the 655 * superblock writeback code ensures the new sb_features2 is copied to
675 * transaction subsystem is online. 656 * sb_bad_features2 before it is logged or written to disk.
676 */ 657 */
677 if (xfs_sb_has_mismatched_features2(sbp)) { 658 if (xfs_sb_has_mismatched_features2(sbp)) {
678 xfs_warn(mp, "correcting sb_features alignment problem"); 659 xfs_warn(mp, "correcting sb_features alignment problem");
679 sbp->sb_features2 |= sbp->sb_bad_features2; 660 sbp->sb_features2 |= sbp->sb_bad_features2;
680 sbp->sb_bad_features2 = sbp->sb_features2; 661 mp->m_update_sb = true;
681 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
682 662
683 /* 663 /*
684 * Re-check for ATTR2 in case it was found in bad_features2 664 * Re-check for ATTR2 in case it was found in bad_features2
@@ -692,17 +672,17 @@ xfs_mountfs(
692 if (xfs_sb_version_hasattr2(&mp->m_sb) && 672 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
693 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 673 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
694 xfs_sb_version_removeattr2(&mp->m_sb); 674 xfs_sb_version_removeattr2(&mp->m_sb);
695 mp->m_update_flags |= XFS_SB_FEATURES2; 675 mp->m_update_sb = true;
696 676
697 /* update sb_versionnum for the clearing of the morebits */ 677 /* update sb_versionnum for the clearing of the morebits */
698 if (!sbp->sb_features2) 678 if (!sbp->sb_features2)
699 mp->m_update_flags |= XFS_SB_VERSIONNUM; 679 mp->m_update_sb = true;
700 } 680 }
701 681
702 /* always use v2 inodes by default now */ 682 /* always use v2 inodes by default now */
703 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { 683 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
704 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; 684 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
705 mp->m_update_flags |= XFS_SB_VERSIONNUM; 685 mp->m_update_sb = true;
706 } 686 }
707 687
708 /* 688 /*
@@ -895,8 +875,8 @@ xfs_mountfs(
895 * the next remount into writeable mode. Otherwise we would never 875 * the next remount into writeable mode. Otherwise we would never
896 * perform the update e.g. for the root filesystem. 876 * perform the update e.g. for the root filesystem.
897 */ 877 */
898 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 878 if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
899 error = xfs_mount_log_sb(mp, mp->m_update_flags); 879 error = xfs_sync_sb(mp, false);
900 if (error) { 880 if (error) {
901 xfs_warn(mp, "failed to write sb changes"); 881 xfs_warn(mp, "failed to write sb changes");
902 goto out_rtunmount; 882 goto out_rtunmount;
@@ -1103,9 +1083,6 @@ xfs_fs_writable(
1103int 1083int
1104xfs_log_sbcount(xfs_mount_t *mp) 1084xfs_log_sbcount(xfs_mount_t *mp)
1105{ 1085{
1106 xfs_trans_t *tp;
1107 int error;
1108
1109 /* allow this to proceed during the freeze sequence... */ 1086 /* allow this to proceed during the freeze sequence... */
1110 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) 1087 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1111 return 0; 1088 return 0;
@@ -1119,17 +1096,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1119 if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) 1096 if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1120 return 0; 1097 return 0;
1121 1098
1122 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); 1099 return xfs_sync_sb(mp, true);
1123 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1124 if (error) {
1125 xfs_trans_cancel(tp, 0);
1126 return error;
1127 }
1128
1129 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1130 xfs_trans_set_sync(tp);
1131 error = xfs_trans_commit(tp, 0);
1132 return error;
1133} 1100}
1134 1101
1135/* 1102/*
@@ -1423,34 +1390,6 @@ xfs_freesb(
1423} 1390}
1424 1391
1425/* 1392/*
1426 * Used to log changes to the superblock unit and width fields which could
1427 * be altered by the mount options, as well as any potential sb_features2
1428 * fixup. Only the first superblock is updated.
1429 */
1430int
1431xfs_mount_log_sb(
1432 xfs_mount_t *mp,
1433 __int64_t fields)
1434{
1435 xfs_trans_t *tp;
1436 int error;
1437
1438 ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
1439 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
1440 XFS_SB_VERSIONNUM));
1441
1442 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1443 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1444 if (error) {
1445 xfs_trans_cancel(tp, 0);
1446 return error;
1447 }
1448 xfs_mod_sb(tp, fields);
1449 error = xfs_trans_commit(tp, 0);
1450 return error;
1451}
1452
1453/*
1454 * If the underlying (data/log/rt) device is readonly, there are some 1393 * If the underlying (data/log/rt) device is readonly, there are some
1455 * operations that cannot proceed. 1394 * operations that cannot proceed.
1456 */ 1395 */
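
With the superblock now always logged whole via xfs_sync_sb(), the per-field dirty mask m_update_flags collapses into the single boolean m_update_sb throughout xfs_mount.c. A standalone model of the before/after bookkeeping, using hypothetical names:

/* Sketch: a single dirty bit replacing a per-field dirty mask,
 * as in the m_update_flags -> m_update_sb conversion above. */
#include <stdbool.h>
#include <stdio.h>

struct mount {
	bool update_sb;	/* was: a __int64_t bitmask of changed fields */
	int sb_unit, sb_width;
};

static void set_alignment(struct mount *mp, int unit, int width)
{
	if (mp->sb_unit != unit) {
		mp->sb_unit = unit;
		mp->update_sb = true;	/* was: |= XFS_SB_UNIT */
	}
	if (mp->sb_width != width) {
		mp->sb_width = width;
		mp->update_sb = true;	/* was: |= XFS_SB_WIDTH */
	}
}

static void sync_sb(struct mount *mp)
{
	/* always writes the whole superblock, so no mask is needed */
	printf("syncing sb: unit=%d width=%d\n", mp->sb_unit, mp->sb_width);
	mp->update_sb = false;
}

int main(void)
{
	struct mount mp = { .sb_unit = 0, .sb_width = 0 };

	set_alignment(&mp, 64, 256);
	if (mp.update_sb)
		sync_sb(&mp);
	return 0;
}
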
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22ccf69d4d3c..a5b2ff822653 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -162,8 +162,7 @@ typedef struct xfs_mount {
162 struct delayed_work m_reclaim_work; /* background inode reclaim */ 162 struct delayed_work m_reclaim_work; /* background inode reclaim */
163 struct delayed_work m_eofblocks_work; /* background eof blocks 163 struct delayed_work m_eofblocks_work; /* background eof blocks
164 trimming */ 164 trimming */
165 __int64_t m_update_flags; /* sb flags we need to update 165 bool m_update_sb; /* sb needs update in mount */
166 on the next remount,rw */
167 int64_t m_low_space[XFS_LOWSP_MAX]; 166 int64_t m_low_space[XFS_LOWSP_MAX];
168 /* low free space thresholds */ 167 /* low free space thresholds */
169 struct xfs_kobj m_kobj; 168 struct xfs_kobj m_kobj;
@@ -378,7 +377,7 @@ extern void xfs_unmountfs(xfs_mount_t *);
378extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 377extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
379extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 378extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
380 uint, int); 379 uint, int);
381extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); 380extern int xfs_mount_log_sb(xfs_mount_t *);
382extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 381extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
383extern int xfs_readsb(xfs_mount_t *, int); 382extern int xfs_readsb(xfs_mount_t *, int);
384extern void xfs_freesb(xfs_mount_t *); 383extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 79fb19dd9c83..53cc2aaf8d2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -430,6 +430,7 @@ struct xfs_qm_isolate {
430static enum lru_status 430static enum lru_status
431xfs_qm_dquot_isolate( 431xfs_qm_dquot_isolate(
432 struct list_head *item, 432 struct list_head *item,
433 struct list_lru_one *lru,
433 spinlock_t *lru_lock, 434 spinlock_t *lru_lock,
434 void *arg) 435 void *arg)
435 __releases(lru_lock) __acquires(lru_lock) 436 __releases(lru_lock) __acquires(lru_lock)
@@ -450,7 +451,7 @@ xfs_qm_dquot_isolate(
450 XFS_STATS_INC(xs_qm_dqwants); 451 XFS_STATS_INC(xs_qm_dqwants);
451 452
452 trace_xfs_dqreclaim_want(dqp); 453 trace_xfs_dqreclaim_want(dqp);
453 list_del_init(&dqp->q_lru); 454 list_lru_isolate(lru, &dqp->q_lru);
454 XFS_STATS_DEC(xs_qm_dquot_unused); 455 XFS_STATS_DEC(xs_qm_dquot_unused);
455 return LRU_REMOVED; 456 return LRU_REMOVED;
456 } 457 }
@@ -494,7 +495,7 @@ xfs_qm_dquot_isolate(
494 xfs_dqunlock(dqp); 495 xfs_dqunlock(dqp);
495 496
496 ASSERT(dqp->q_nrefs == 0); 497 ASSERT(dqp->q_nrefs == 0);
497 list_move_tail(&dqp->q_lru, &isol->dispose); 498 list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
498 XFS_STATS_DEC(xs_qm_dquot_unused); 499 XFS_STATS_DEC(xs_qm_dquot_unused);
499 trace_xfs_dqreclaim_done(dqp); 500 trace_xfs_dqreclaim_done(dqp);
500 XFS_STATS_INC(xs_qm_dqreclaims); 501 XFS_STATS_INC(xs_qm_dqreclaims);
@@ -523,7 +524,6 @@ xfs_qm_shrink_scan(
523 struct xfs_qm_isolate isol; 524 struct xfs_qm_isolate isol;
524 unsigned long freed; 525 unsigned long freed;
525 int error; 526 int error;
526 unsigned long nr_to_scan = sc->nr_to_scan;
527 527
528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
529 return 0; 529 return 0;
@@ -531,8 +531,8 @@ xfs_qm_shrink_scan(
531 INIT_LIST_HEAD(&isol.buffers); 531 INIT_LIST_HEAD(&isol.buffers);
532 INIT_LIST_HEAD(&isol.dispose); 532 INIT_LIST_HEAD(&isol.dispose);
533 533
534 freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, 534 freed = list_lru_shrink_walk(&qi->qi_lru, sc,
535 &nr_to_scan); 535 xfs_qm_dquot_isolate, &isol);
536 536
537 error = xfs_buf_delwri_submit(&isol.buffers); 537 error = xfs_buf_delwri_submit(&isol.buffers);
538 if (error) 538 if (error)
@@ -557,7 +557,7 @@ xfs_qm_shrink_count(
557 struct xfs_quotainfo *qi = container_of(shrink, 557 struct xfs_quotainfo *qi = container_of(shrink,
558 struct xfs_quotainfo, qi_shrinker); 558 struct xfs_quotainfo, qi_shrinker);
559 559
560 return list_lru_count_node(&qi->qi_lru, sc->nid); 560 return list_lru_shrink_count(&qi->qi_lru, sc);
561} 561}
562 562
563/* 563/*
@@ -714,7 +714,6 @@ STATIC int
714xfs_qm_qino_alloc( 714xfs_qm_qino_alloc(
715 xfs_mount_t *mp, 715 xfs_mount_t *mp,
716 xfs_inode_t **ip, 716 xfs_inode_t **ip,
717 __int64_t sbfields,
718 uint flags) 717 uint flags)
719{ 718{
720 xfs_trans_t *tp; 719 xfs_trans_t *tp;
@@ -777,11 +776,6 @@ xfs_qm_qino_alloc(
777 spin_lock(&mp->m_sb_lock); 776 spin_lock(&mp->m_sb_lock);
778 if (flags & XFS_QMOPT_SBVERSION) { 777 if (flags & XFS_QMOPT_SBVERSION) {
779 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 778 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
780 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
781 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
782 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
783 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
784 XFS_SB_QFLAGS));
785 779
786 xfs_sb_version_addquota(&mp->m_sb); 780 xfs_sb_version_addquota(&mp->m_sb);
787 mp->m_sb.sb_uquotino = NULLFSINO; 781 mp->m_sb.sb_uquotino = NULLFSINO;
@@ -798,7 +792,7 @@ xfs_qm_qino_alloc(
798 else 792 else
799 mp->m_sb.sb_pquotino = (*ip)->i_ino; 793 mp->m_sb.sb_pquotino = (*ip)->i_ino;
800 spin_unlock(&mp->m_sb_lock); 794 spin_unlock(&mp->m_sb_lock);
801 xfs_mod_sb(tp, sbfields); 795 xfs_log_sb(tp);
802 796
803 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
804 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 798 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
@@ -1451,7 +1445,7 @@ xfs_qm_mount_quotas(
1451 spin_unlock(&mp->m_sb_lock); 1445 spin_unlock(&mp->m_sb_lock);
1452 1446
1453 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { 1447 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
1454 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { 1448 if (xfs_sync_sb(mp, false)) {
1455 /* 1449 /*
1456 * We could only have been turning quotas off. 1450 * We could only have been turning quotas off.
1457 * We aren't in very good shape actually because 1451 * We aren't in very good shape actually because
@@ -1482,7 +1476,6 @@ xfs_qm_init_quotainos(
1482 struct xfs_inode *gip = NULL; 1476 struct xfs_inode *gip = NULL;
1483 struct xfs_inode *pip = NULL; 1477 struct xfs_inode *pip = NULL;
1484 int error; 1478 int error;
1485 __int64_t sbflags = 0;
1486 uint flags = 0; 1479 uint flags = 0;
1487 1480
1488 ASSERT(mp->m_quotainfo); 1481 ASSERT(mp->m_quotainfo);
@@ -1517,9 +1510,6 @@ xfs_qm_init_quotainos(
 		}
 	} else {
 		flags |= XFS_QMOPT_SBVERSION;
-		sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-			    XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
-			    XFS_SB_QFLAGS);
 	}
 
 	/*
@@ -1530,7 +1520,6 @@ xfs_qm_init_quotainos(
 	 */
 	if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
 		error = xfs_qm_qino_alloc(mp, &uip,
-					  sbflags | XFS_SB_UQUOTINO,
 					  flags | XFS_QMOPT_UQUOTA);
 		if (error)
 			goto error_rele;
@@ -1539,7 +1528,6 @@ xfs_qm_init_quotainos(
 	}
 	if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
 		error = xfs_qm_qino_alloc(mp, &gip,
-					  sbflags | XFS_SB_GQUOTINO,
 					  flags | XFS_QMOPT_GQUOTA);
 		if (error)
 			goto error_rele;
@@ -1548,7 +1536,6 @@ xfs_qm_init_quotainos(
 	}
 	if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
 		error = xfs_qm_qino_alloc(mp, &pip,
-					  sbflags | XFS_SB_PQUOTINO,
 					  flags | XFS_QMOPT_PQUOTA);
 		if (error)
 			goto error_rele;
@@ -1587,32 +1574,6 @@ xfs_qm_dqfree_one(
 	xfs_qm_dqdestroy(dqp);
 }
 
-/*
- * Start a transaction and write the incore superblock changes to
- * disk. flags parameter indicates which fields have changed.
- */
-int
-xfs_qm_write_sb_changes(
-	xfs_mount_t	*mp,
-	__int64_t	flags)
-{
-	xfs_trans_t	*tp;
-	int		error;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	xfs_mod_sb(tp, flags);
-	error = xfs_trans_commit(tp, 0);
-
-	return error;
-}
-
-
 /* --------------- utility functions for vnodeops ---------------- */
 
 
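The deleted xfs_qm_write_sb_changes() is subsumed by the generic superblock logging path. A hedged sketch of the replacement, modeled on the removed body above; the transaction type and reservation names (XFS_TRANS_SB_CHANGE, tr_sb) are assumptions, only the shape is asserted: the whole superblock is logged via xfs_log_sb() instead of a bitmask of changed fields, and the commit can optionally be made synchronous.

int
xfs_sync_sb(
	struct xfs_mount	*mp,
	bool			wait)
{
	xfs_trans_t	*tp;
	int		error;

	/* assumed names: XFS_TRANS_SB_CHANGE transaction, tr_sb reservation */
	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_log_sb(tp);			/* log the whole sb, no field bitmask */
	if (wait)
		xfs_trans_set_sync(tp);	/* force the commit to disk */
	return xfs_trans_commit(tp, 0);
}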
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 41f6c0b9d51c..0d4d3590cf85 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT	5
 
 extern void		xfs_qm_destroy_quotainfo(struct xfs_mount *);
-extern int		xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 
 /* dquot stuff */
 extern void		xfs_qm_dqpurge_all(struct xfs_mount *, uint);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index cb6168ec92c9..9b965db45800 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -91,8 +91,7 @@ xfs_qm_scall_quotaoff(
 		mutex_unlock(&q->qi_quotaofflock);
 
 		/* XXX what to do if error ? Revert back to old vals incore ? */
-		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
-		return error;
+		return xfs_sync_sb(mp, false);
 	}
 
 	dqtype = 0;
@@ -313,7 +312,6 @@ xfs_qm_scall_quotaon(
 {
 	int		error;
 	uint		qf;
-	__int64_t	sbflags;
 
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
 	/*
@@ -321,30 +319,22 @@ xfs_qm_scall_quotaon(
 	 */
 	flags &= ~(XFS_ALL_QUOTA_ACCT);
 
-	sbflags = 0;
-
 	if (flags == 0) {
 		xfs_debug(mp, "%s: zero flags, m_qflags=%x",
 			__func__, mp->m_qflags);
 		return -EINVAL;
 	}
 
-	/* No fs can turn on quotas with a delayed effect */
-	ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
-
 	/*
 	 * Can't enforce without accounting. We check the superblock
 	 * qflags here instead of m_qflags because rootfs can have
 	 * quota acct on ondisk without m_qflags' knowing.
 	 */
-	if (((flags & XFS_UQUOTA_ACCT) == 0 &&
-	     (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
-	     (flags & XFS_UQUOTA_ENFD)) ||
-	    ((flags & XFS_GQUOTA_ACCT) == 0 &&
-	     (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
-	     (flags & XFS_GQUOTA_ENFD)) ||
-	    ((flags & XFS_PQUOTA_ACCT) == 0 &&
-	     (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-	     (flags & XFS_PQUOTA_ENFD))) {
+	if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+	     (flags & XFS_UQUOTA_ENFD)) ||
+	    ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+	     (flags & XFS_GQUOTA_ENFD)) ||
+	    ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+	     (flags & XFS_PQUOTA_ENFD))) {
 		xfs_debug(mp,
 			"%s: Can't enforce without acct, flags=%x sbflags=%x",
@@ -369,11 +359,11 @@ xfs_qm_scall_quotaon(
 	/*
 	 * There's nothing to change if it's the same.
 	 */
-	if ((qf & flags) == flags && sbflags == 0)
+	if ((qf & flags) == flags)
 		return -EEXIST;
-	sbflags |= XFS_SB_QFLAGS;
 
-	if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
+	error = xfs_sync_sb(mp, false);
+	if (error)
 		return error;
 	/*
 	 * If we aren't trying to switch on quota enforcement, we are done.
@@ -383,8 +373,7 @@ xfs_qm_scall_quotaon(
 	    ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
 	     (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
 	    ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
-	     (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
-	    (flags & XFS_ALL_QUOTA_ENFD) == 0)
+	     (mp->m_qflags & XFS_GQUOTA_ACCT)))
 		return 0;
 
 	if (! XFS_IS_QUOTA_RUNNING(mp))
@@ -421,20 +410,12 @@ xfs_qm_scall_getqstat(
 	memset(out, 0, sizeof(fs_quota_stat_t));
 
 	out->qs_version = FS_QSTAT_VERSION;
-	if (!xfs_sb_version_hasquota(&mp->m_sb)) {
-		out->qs_uquota.qfs_ino = NULLFSINO;
-		out->qs_gquota.qfs_ino = NULLFSINO;
-		return 0;
-	}
-
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
 			(XFS_ALL_QUOTA_ACCT|
 			 XFS_ALL_QUOTA_ENFD));
-	if (q) {
-		uip = q->qi_uquotaip;
-		gip = q->qi_gquotaip;
-		pip = q->qi_pquotaip;
-	}
+	uip = q->qi_uquotaip;
+	gip = q->qi_gquotaip;
+	pip = q->qi_pquotaip;
 	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
 					0, 0, &uip) == 0)
@@ -480,14 +461,13 @@ xfs_qm_scall_getqstat(
 		if (temppqip)
 			IRELE(pip);
 	}
-	if (q) {
-		out->qs_incoredqs = q->qi_dquots;
-		out->qs_btimelimit = q->qi_btimelimit;
-		out->qs_itimelimit = q->qi_itimelimit;
-		out->qs_rtbtimelimit = q->qi_rtbtimelimit;
-		out->qs_bwarnlimit = q->qi_bwarnlimit;
-		out->qs_iwarnlimit = q->qi_iwarnlimit;
-	}
+	out->qs_incoredqs = q->qi_dquots;
+	out->qs_btimelimit = q->qi_btimelimit;
+	out->qs_itimelimit = q->qi_itimelimit;
+	out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+	out->qs_bwarnlimit = q->qi_bwarnlimit;
+	out->qs_iwarnlimit = q->qi_iwarnlimit;
+
 	return 0;
 }
 
@@ -508,13 +488,6 @@ xfs_qm_scall_getqstatv(
 	bool tempgqip = false;
 	bool temppqip = false;
 
-	if (!xfs_sb_version_hasquota(&mp->m_sb)) {
-		out->qs_uquota.qfs_ino = NULLFSINO;
-		out->qs_gquota.qfs_ino = NULLFSINO;
-		out->qs_pquota.qfs_ino = NULLFSINO;
-		return 0;
-	}
-
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
 			(XFS_ALL_QUOTA_ACCT|
 			 XFS_ALL_QUOTA_ENFD));
@@ -522,11 +495,9 @@ xfs_qm_scall_getqstatv(
 	out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
 	out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
 
-	if (q) {
-		uip = q->qi_uquotaip;
-		gip = q->qi_gquotaip;
-		pip = q->qi_pquotaip;
-	}
+	uip = q->qi_uquotaip;
+	gip = q->qi_gquotaip;
+	pip = q->qi_pquotaip;
 	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
 					0, 0, &uip) == 0)
@@ -561,14 +532,13 @@ xfs_qm_scall_getqstatv(
 		if (temppqip)
 			IRELE(pip);
 	}
-	if (q) {
-		out->qs_incoredqs = q->qi_dquots;
-		out->qs_btimelimit = q->qi_btimelimit;
-		out->qs_itimelimit = q->qi_itimelimit;
-		out->qs_rtbtimelimit = q->qi_rtbtimelimit;
-		out->qs_bwarnlimit = q->qi_bwarnlimit;
-		out->qs_iwarnlimit = q->qi_iwarnlimit;
-	}
+	out->qs_incoredqs = q->qi_dquots;
+	out->qs_btimelimit = q->qi_btimelimit;
+	out->qs_itimelimit = q->qi_itimelimit;
+	out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+	out->qs_bwarnlimit = q->qi_bwarnlimit;
+	out->qs_iwarnlimit = q->qi_iwarnlimit;
+
 	return 0;
 }
 
@@ -800,7 +770,7 @@ xfs_qm_log_quotaoff(
 	mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
 	spin_unlock(&mp->m_sb_lock);
 
-	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	xfs_log_sb(tp);
 
 	/*
 	 * We have to make sure that the transaction is secure on disk before we
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 801a84c1cdc3..6923905ab33d 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -64,19 +64,10 @@ xfs_fs_get_xstatev(
 	return xfs_qm_scall_getqstatv(mp, fqs);
 }
 
-STATIC int
-xfs_fs_set_xstate(
-	struct super_block	*sb,
-	unsigned int		uflags,
-	int			op)
+static unsigned int
+xfs_quota_flags(unsigned int uflags)
 {
-	struct xfs_mount	*mp = XFS_M(sb);
-	unsigned int		flags = 0;
-
-	if (sb->s_flags & MS_RDONLY)
-		return -EROFS;
-	if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
-		return -ENOSYS;
+	unsigned int		flags = 0;
 
 	if (uflags & FS_QUOTA_UDQ_ACCT)
 		flags |= XFS_UQUOTA_ACCT;
@@ -91,16 +82,39 @@ xfs_fs_set_xstate(
 	if (uflags & FS_QUOTA_PDQ_ENFD)
 		flags |= XFS_PQUOTA_ENFD;
 
-	switch (op) {
-	case Q_XQUOTAON:
-		return xfs_qm_scall_quotaon(mp, flags);
-	case Q_XQUOTAOFF:
-		if (!XFS_IS_QUOTA_ON(mp))
-			return -EINVAL;
-		return xfs_qm_scall_quotaoff(mp, flags);
-	}
+	return flags;
+}
+
+STATIC int
+xfs_quota_enable(
+	struct super_block	*sb,
+	unsigned int		uflags)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+
+	return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
+}
+
+STATIC int
+xfs_quota_disable(
+	struct super_block	*sb,
+	unsigned int		uflags)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -EINVAL;
 
-	return -EINVAL;
+	return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
 }
 
 STATIC int
@@ -166,7 +180,8 @@ xfs_fs_set_dqblk(
 const struct quotactl_ops xfs_quotactl_operations = {
 	.get_xstatev		= xfs_fs_get_xstatev,
 	.get_xstate		= xfs_fs_get_xstate,
-	.set_xstate		= xfs_fs_set_xstate,
+	.quota_enable		= xfs_quota_enable,
+	.quota_disable		= xfs_quota_disable,
 	.rm_xquota		= xfs_fs_rm_xquota,
 	.get_dqblk		= xfs_fs_get_dqblk,
 	.set_dqblk		= xfs_fs_set_dqblk,
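
With .set_xstate split into .quota_enable and .quota_disable, the VFS can dispatch Q_XQUOTAON and Q_XQUOTAOFF to the new callbacks directly, while Q_XQUOTARM stays on .rm_xquota. A hypothetical userspace illustration of the enable path (the device path, helper name, and error handling are placeholders):

#include <sys/quota.h>
#include <linux/dqblk_xfs.h>	/* Q_XQUOTAON, FS_QUOTA_UDQ_ENFD */

/* Turn on user quota enforcement; reaches xfs_quota_enable() ->
 * xfs_qm_scall_quotaon() via the quotactl_ops table above. */
static int enable_user_enforcement(const char *dev)
{
	unsigned int flags = FS_QUOTA_UDQ_ENFD;

	return quotactl(QCMD(Q_XQUOTAON, USRQUOTA), dev, 0,
			(void *)&flags);
}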
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19cbda196369..8fcc4ccc5c79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -685,7 +685,7 @@ xfs_blkdev_get(
 			    mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
-		xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
+		xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
 	}
 
 	return error;
@@ -1111,6 +1111,11 @@ xfs_fs_statfs(
 					statp->f_files,
 					mp->m_maxicount);
 
+	/* If sb_icount overshot maxicount, report actual allocation */
+	statp->f_files = max_t(typeof(statp->f_files),
+			       statp->f_files,
+			       sbp->sb_icount);
+
 	/* make sure statp->f_ffree does not underflow */
 	ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
 	statp->f_ffree = max_t(__int64_t, ffree, 0);
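
Worked example for the f_files clamp above (values are illustrative): if inode allocation overshot the configured cap so that sb_icount = 1,100,000 while the computed f_files is 1,000,000 and sb_ifree = 50,000, then f_files - (sb_icount - sb_ifree) = 1,000,000 - 1,050,000 is negative, and the underflow clamp that follows would report zero free inodes. Raising f_files to sb_icount yields 1,100,000 - 1,050,000 = 50,000 free, which matches the real allocation.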
@@ -1257,13 +1262,13 @@ xfs_fs_remount(
 		 * If this is the first remount to writeable state we
 		 * might have some superblock changes to update.
 		 */
-		if (mp->m_update_flags) {
-			error = xfs_mount_log_sb(mp, mp->m_update_flags);
+		if (mp->m_update_sb) {
+			error = xfs_sync_sb(mp, false);
 			if (error) {
 				xfs_warn(mp, "failed to write sb changes");
 				return error;
 			}
-			mp->m_update_flags = 0;
+			mp->m_update_sb = false;
 		}
 
 		/*
@@ -1293,8 +1298,9 @@ xfs_fs_remount(
 
 /*
  * Second stage of a freeze. The data is already frozen so we only
- * need to take care of the metadata. Once that's done write a dummy
- * record to dirty the log in case of a crash while frozen.
+ * need to take care of the metadata. Once that's done sync the superblock
+ * to the log to dirty it in case of a crash while frozen. This ensures that we
+ * will recover the unlinked inode lists on the next mount.
  */
 STATIC int
 xfs_fs_freeze(
@@ -1304,7 +1310,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return xfs_fs_log_dummy(mp);
+	return xfs_sync_sb(mp, true);
 }
 
 STATIC int
@@ -1531,7 +1537,7 @@ xfs_fs_mount(
 static long
 xfs_fs_nr_cached_objects(
 	struct super_block	*sb,
-	int			nid)
+	struct shrink_control	*sc)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
@@ -1539,10 +1545,9 @@ xfs_fs_nr_cached_objects(
 static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	long			nr_to_scan,
-	int			nid)
+	struct shrink_control	*sc)
 {
-	return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
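
The last two hunks track the VFS change that hands the per-superblock cache callbacks a struct shrink_control instead of separate nr_to_scan/nid arguments. A generic sketch of the converted contract; the myfs_* names, MYFS_SB(), and the reclaim helpers are hypothetical:

/* hypothetical helpers standing in for a filesystem's own reclaim code */
extern long myfs_count_reclaimable(struct myfs_sb_info *sbi);
extern long myfs_reclaim(struct myfs_sb_info *sbi, unsigned long nr);

/* count phase: how many objects could be freed (per sc->nid if tracked) */
static long myfs_nr_cached_objects(struct super_block *sb,
				   struct shrink_control *sc)
{
	return myfs_count_reclaimable(MYFS_SB(sb));
}

/* scan phase: free up to sc->nr_to_scan objects, return the number freed */
static long myfs_free_cached_objects(struct super_block *sb,
				     struct shrink_control *sc)
{
	return myfs_reclaim(MYFS_SB(sb), sc->nr_to_scan);
}

static const struct super_operations myfs_super_operations = {
	.nr_cached_objects	= myfs_nr_cached_objects,
	.free_cached_objects	= myfs_free_cached_objects,
};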
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 1743b9f8e23d..a0c8067cea6f 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -149,24 +149,6 @@ static struct ctl_table xfs_table[] = {
 		.extra2		= &xfs_params.inherit_noatim.max
 	},
 	{
-		.procname	= "xfsbufd_centisecs",
-		.data		= &xfs_params.xfs_buf_timer.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_timer.min,
-		.extra2		= &xfs_params.xfs_buf_timer.max
-	},
-	{
-		.procname	= "age_buffer_centisecs",
-		.data		= &xfs_params.xfs_buf_age.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_age.min,
-		.extra2		= &xfs_params.xfs_buf_age.max
-	},
-	{
 		.procname	= "inherit_nosymlinks",
 		.data		= &xfs_params.inherit_nosym.val,
 		.maxlen		= sizeof(int),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fa3135b9bf04..eb90cd59a0ec 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -472,6 +472,7 @@ xfs_trans_apply_sb_deltas(
 		whole = 1;
 	}
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	if (whole)
 		/*
 		 * Log the whole thing, the fields are noncontiguous.
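
The one-line addition tags the superblock buffer with its buffer-log-format type before it is logged, so that log recovery can attach the matching verifier when replaying the change on CRC-enabled filesystems. The usage pattern, sketched; xfs_trans_getsb() is a real helper, but the surrounding dirty-range logging here is illustrative:

	struct xfs_buf	*bp;

	bp = xfs_trans_getsb(tp, mp, 0);		/* sb buffer, joined to tp */
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);	/* recovery verifier hint */
	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);	/* dirty range: whole sb */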
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 0a4d4ab6d9a9..75798412859a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -327,9 +327,10 @@ xfs_trans_read_buf_map(
 		return -EIO;
 	}
 
-	if (tp)
+	if (tp) {
 		_xfs_trans_bjoin(tp, bp, 1);
-	trace_xfs_trans_read_buf(bp->b_fspriv);
+		trace_xfs_trans_read_buf(bp->b_fspriv);
+	}
 	*bpp = bp;
 	return 0;
 